diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index e1103fb848..5df13ad9b5 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -1,5 +1,5 @@ -name: Broken site -description: Report broken or misfunctioning site +name: Broken site support +description: Report issue with yt-dlp on a supported site labels: [triage, site-bug] body: - type: checkboxes @@ -16,9 +16,9 @@ body: description: | Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: options: - - label: I'm reporting a broken site + - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.02.17** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -50,6 +50,8 @@ body: options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below required: true - type: textarea @@ -59,19 +61,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.02.17 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.02.17, Current version: 2023.02.17 - yt-dlp is up to date (2023.02.17) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 90d7294ac0..644c87a7ed 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.02.17** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -62,6 +62,8 @@ body: options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below required: true - type: textarea @@ -71,19 +73,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.02.17 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.02.17, Current version: 2023.02.17 - yt-dlp is up to date (2023.02.17) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 5b59852c70..59d0474c28 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.02.17** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -58,6 +58,8 @@ body: options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below required: true - type: textarea @@ -67,19 +69,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.02.17 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.02.17, Current version: 2023.02.17 - yt-dlp is up to date (2023.02.17) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index bd4695f878..e207396737 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -1,4 +1,4 @@ -name: Bug report +name: Core bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.02.17** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -43,6 +43,8 @@ body: options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below required: true - type: textarea @@ -52,19 +54,18 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.02.17 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.02.17, Current version: 2023.02.17 - yt-dlp is up to date (2023.02.17) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 8c7f315e9e..e06db9ccf8 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.02.17** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -40,6 +40,8 @@ body: label: Provide verbose output that clearly demonstrates the problem options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below - type: textarea id: log @@ -48,18 +50,17 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.02.17 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.02.17, Current version: 2023.02.17 - yt-dlp is up to date (2023.02.17) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 4a13446286..571223a9c5 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.02.17** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true @@ -46,6 +46,8 @@ body: label: Provide verbose output that clearly demonstrates the problem options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below - type: textarea id: log @@ -54,18 +56,17 @@ body: description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.02.17 [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.02.17, Current version: 2023.02.17 - yt-dlp is up to date (2023.02.17) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... 
from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index 85900e92e2..bff28ae4e9 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -1,5 +1,5 @@ -name: Broken site -description: Report broken or misfunctioning site +name: Broken site support +description: Report issue with yt-dlp on a supported site labels: [triage, site-bug] body: %(no_skip)s @@ -10,9 +10,9 @@ body: description: | Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: options: - - label: I'm reporting a broken site + - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index 75d62e7bb2..2bffe738d0 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -12,7 +12,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml index 18b30f5783..6c31279830 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -12,7 +12,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 90f59e70b0..5f357d96e9 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -1,4 +1,4 @@ -name: Bug report +name: Core bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: @@ -12,7 +12,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running 
yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml index ef3bb22693..99107ff584 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -14,7 +14,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml index 4bef82d5af..bd742109a4 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c4d3e812e2..4deee572f4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -28,7 +28,6 @@ # PLEASE FOLLOW THE GUIDE BELOW ### Before submitting a *pull request* make sure you have: - [ ] At least skimmed through [contributing guidelines](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) including [yt-dlp coding conventions](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#yt-dlp-coding-conventions) - [ ] [Searched](https://github.com/yt-dlp/yt-dlp/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests -- [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) and [ran relevant tests](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) ### In order to be accepted and merged into yt-dlp each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). 
Check all of the following options that apply:
- [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/)
diff --git a/.github/banner.svg b/.github/banner.svg
index 35dc93eaea..ea7f9e306e 100644
--- a/.github/banner.svg
+++ b/.github/banner.svg
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6041376a4d..12ec5b0d8c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,393 +1,567 @@
-name: Build
-on: workflow_dispatch
+name: Build Artifacts
+on:
+  workflow_call:
+    inputs:
+      version:
+        required: true
+        type: string
+      channel:
+        required: false
+        default: stable
+        type: string
+      unix:
+        default: true
+        type: boolean
+      linux_static:
+        default: true
+        type: boolean
+      linux_arm:
+        default: true
+        type: boolean
+      macos:
+        default: true
+        type: boolean
+      macos_legacy:
+        default: true
+        type: boolean
+      windows:
+        default: true
+        type: boolean
+      windows32:
+        default: true
+        type: boolean
+      origin:
+        required: false
+        default: ''
+        type: string
+    secrets:
+      GPG_SIGNING_KEY:
+        required: false
+
+  workflow_dispatch:
+    inputs:
+      version:
+        description: |
+          VERSION: yyyy.mm.dd[.rev] or rev
+        required: true
+        type: string
+      channel:
+        description: |
+          SOURCE of this build's updates: stable/nightly/master/<repo>
+        required: true
+        default: stable
+        type: string
+      unix:
+        description: yt-dlp, yt-dlp.tar.gz
+        default: true
+        type: boolean
+      linux_static:
+        description: yt-dlp_linux
+        default: true
+        type: boolean
+      linux_arm:
+        description: yt-dlp_linux_aarch64, yt-dlp_linux_armv7l
+        default: true
+        type: boolean
+      macos:
+        description: yt-dlp_macos, yt-dlp_macos.zip
+        default: true
+        type: boolean
+      macos_legacy:
+        description: yt-dlp_macos_legacy
+        default: true
+        type: boolean
+      windows:
+        description: yt-dlp.exe, yt-dlp_min.exe, yt-dlp_win.zip
+        default: true
+        type: boolean
+      windows32:
+        description: yt-dlp_x86.exe
+        default: true
+        type: boolean
+      origin:
+        description: Origin
+        required: false
+        default: 'current repo'
+        type: choice
+        options:
+        - 'current repo'

 permissions:
   contents: read

 jobs:
-  prepare:
-    permissions:
-      contents: write # for push_release
+  process:
     runs-on: ubuntu-latest
     outputs:
-      version_suffix: ${{ steps.version_suffix.outputs.version_suffix }}
-      ytdlp_version: ${{ steps.bump_version.outputs.ytdlp_version }}
-      head_sha: ${{ steps.push_release.outputs.head_sha }}
+      origin: ${{ steps.process_origin.outputs.origin }}
     steps:
-      - uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
+      - name: Process origin
+        id: process_origin
+        run: |
+          echo "origin=${{ inputs.origin == 'current repo' && github.repository || inputs.origin }}" | tee "$GITHUB_OUTPUT"

-      - name: Set version suffix
-        id: version_suffix
-        env:
-          PUSH_VERSION_COMMIT: ${{ secrets.PUSH_VERSION_COMMIT }}
-        if: "env.PUSH_VERSION_COMMIT == ''"
-        run: echo "version_suffix=$(date -u +"%H%M%S")" >> "$GITHUB_OUTPUT"
-      - name: Bump version
-        id: bump_version
-        run: |
-          python devscripts/update-version.py ${{ steps.version_suffix.outputs.version_suffix }}
-          make issuetemplates
-
-      - name: Push to release
-        id: push_release
-        run: |
-          git config --global user.name github-actions
-          git config --global user.email github-actions@example.com
-          git add -u
-          git commit -m "[version] update" -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl"
-          git push origin --force ${{ github.event.ref }}:release
-          echo "head_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
-      - name: Update master
-        env:
-          PUSH_VERSION_COMMIT: ${{ secrets.PUSH_VERSION_COMMIT }}
-        if: "env.PUSH_VERSION_COMMIT != ''"
-        run: git push origin ${{ github.event.ref }}
-
-
-  build_unix:
-    needs: prepare
+  unix:
+    needs: process
+    if: inputs.unix
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-      - uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniforge-variant: Mambaforge
-          use-mamba: true
-          channels: conda-forge
-          auto-update-conda: true
-          activate-environment: ''
-          auto-activate-base: false
-      - name: Install Requirements
-        run: |
-          sudo apt-get -y install zip pandoc man sed
-          python -m pip install -U pip setuptools wheel twine
-          python -m pip install -U Pyinstaller -r requirements.txt
-          reqs=$(mktemp)
-          echo -e 'python=3.10.*\npyinstaller' >$reqs
-          sed 's/^brotli.*/brotli-python/' <requirements.txt >>$reqs
-          mamba create -n build --file $reqs
-
-      - name: Prepare
-        run: |
-          python devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }}
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Needed for changelog
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install Requirements
+        run: |
+          sudo apt -y install zip pandoc man sed
+      - name: Prepare
+        run: |
+          python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}"
+          python devscripts/update_changelog.py -vv
           python devscripts/make_lazy_extractors.py
-      - name: Build Unix platform-independent binary
-        run: |
+      - name: Build Unix platform-independent binary
+        run: |
           make all tar
-      - name: Build Unix standalone binary
-        shell: bash -l {0}
-        run: |
-          unset LD_LIBRARY_PATH # Harmful; set by setup-python
-          conda activate build
-          python pyinst.py --onedir
-          (cd ./dist/yt-dlp_linux && zip -r ../yt-dlp_linux.zip .)
- python pyinst.py + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./yt-dlp + cp ./yt-dlp ./yt-dlp_downgraded + version="$(./yt-dlp --version)" + ./yt-dlp_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./yt-dlp_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + yt-dlp + yt-dlp.tar.gz + compression-level: 0 - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - path: | - yt-dlp - yt-dlp.tar.gz - dist/yt-dlp_linux - dist/yt-dlp_linux.zip - - - name: Build and publish on PyPi - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - if: "env.TWINE_PASSWORD != ''" - run: | - rm -rf dist/* - python devscripts/set-variant.py pip -M "You installed yt-dlp with pip or using the wheel from PyPi; Use that to update" - python setup.py sdist bdist_wheel - twine upload dist/* - - - name: Install SSH private key for Homebrew - env: - BREW_TOKEN: ${{ secrets.BREW_TOKEN }} - if: "env.BREW_TOKEN != ''" - uses: yt-dlp/ssh-agent@v0.5.3 - with: - ssh-private-key: ${{ env.BREW_TOKEN }} - - name: Update Homebrew Formulae - env: - BREW_TOKEN: ${{ secrets.BREW_TOKEN }} - if: "env.BREW_TOKEN != ''" - run: | - git clone git@github.com:yt-dlp/homebrew-taps taps/ - python devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ needs.prepare.outputs.ytdlp_version }}" - git -C taps/ config user.name github-actions - git -C taps/ config user.email github-actions@example.com - git -C taps/ commit -am 'yt-dlp: ${{ needs.prepare.outputs.ytdlp_version }}' - git -C taps/ push - - - build_linux_arm: - permissions: - packages: write # for Creating cache + linux_static: + needs: process + if: inputs.linux_static + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build static executable + env: + channel: ${{ inputs.channel }} + origin: ${{ needs.process.outputs.origin }} + version: ${{ inputs.version }} + run: | + mkdir ~/build + cd bundle/docker + docker compose up --build static + sudo chown "${USER}:docker" ~/build/yt-dlp_linux + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ~/build/yt-dlp_linux + cp ~/build/yt-dlp_linux ~/build/yt-dlp_linux_downgraded + version="$(~/build/yt-dlp_linux --version)" + ~/build/yt-dlp_linux_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(~/build/yt-dlp_linux_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + ~/build/yt-dlp_linux + compression-level: 0 + + linux_arm: + needs: process + if: inputs.linux_arm + permissions: + contents: read + packages: write # for creating cache runs-on: ubuntu-latest - needs: prepare strategy: matrix: architecture: - - armv7 - - aarch64 + - armv7 + - aarch64 steps: - - uses: actions/checkout@v3 - with: - path: ./repo - - name: Virtualized Install, Prepare & Build - uses: yt-dlp/run-on-arch-action@v2 - with: - githubToken: ${{ github.token }} # To cache image - arch: ${{ matrix.architecture }} - distro: ubuntu18.04 # Standalone executable should be built on minimum supported OS - dockerRunArgs: --volume "${PWD}/repo:/repo" - install: | # Installing Python 3.10 from the Deadsnakes repo raises errors - apt update - apt -y install zlib1g-dev python3.8 python3.8-dev python3.8-distutils python3-pip - python3.8 -m pip 
install -U pip setuptools wheel - # Cannot access requirements.txt from the repo directory at this stage - python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi + - uses: actions/checkout@v4 + with: + path: ./repo + - name: Virtualized Install, Prepare & Build + uses: yt-dlp/run-on-arch-action@v2 + with: + # Ref: https://github.com/uraimo/run-on-arch-action/issues/55 + env: | + GITHUB_WORKFLOW: build + githubToken: ${{ github.token }} # To cache image + arch: ${{ matrix.architecture }} + distro: ubuntu18.04 # Standalone executable should be built on minimum supported OS + dockerRunArgs: --volume "${PWD}/repo:/repo" + install: | # Installing Python 3.10 from the Deadsnakes repo raises errors + apt update + apt -y install zlib1g-dev libffi-dev python3.8 python3.8-dev python3.8-distutils python3-pip + python3.8 -m pip install -U pip setuptools wheel + # Cannot access any files from the repo directory at this stage + python3.8 -m pip install -U Pyinstaller mutagen pycryptodomex websockets brotli certifi secretstorage cffi + run: | + cd repo + python3.8 devscripts/install_deps.py -o --include build + python3.8 devscripts/install_deps.py --include pyinstaller --include secretstorage # Cached version may be out of date + python3.8 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" + python3.8 devscripts/make_lazy_extractors.py + python3.8 -m bundle.pyinstaller + + if ${{ vars.UPDATE_TO_VERIFICATION && 'true' || 'false' }}; then + arch="${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }}" + chmod +x ./dist/yt-dlp_linux_${arch} + cp ./dist/yt-dlp_linux_${arch} ./dist/yt-dlp_linux_${arch}_downgraded + version="$(./dist/yt-dlp_linux_${arch} --version)" + ./dist/yt-dlp_linux_${arch}_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_linux_${arch}_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + fi + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-linux_${{ matrix.architecture }} + path: | # run-on-arch-action designates armv7l as armv7 + repo/dist/yt-dlp_linux_${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }} + compression-level: 0 + + macos: + needs: process + if: inputs.macos + permissions: + contents: read + actions: write # For cleaning up cache + runs-on: macos-12 + + steps: + - uses: actions/checkout@v4 + # NB: Building universal2 does not work with python from actions/setup-python + + - name: Restore cached requirements + id: restore-cache + uses: actions/cache/restore@v4 + env: + SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1 + with: + path: | + ~/yt-dlp-build-venv + key: cache-reqs-${{ github.job }} + + - name: Install Requirements run: | - cd repo - python3.8 -m pip install -U Pyinstaller -r requirements.txt # Cached version may be out of date - python3.8 devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} - python3.8 devscripts/make_lazy_extractors.py - python3.8 pyinst.py - - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - path: | # run-on-arch-action designates armv7l as armv7 - repo/dist/yt-dlp_linux_${{ (matrix.architecture == 'armv7' && 'armv7l') || matrix.architecture }} - - - build_macos: - runs-on: macos-11 - needs: prepare - - steps: - - uses: actions/checkout@v3 - # NB: In order to create a universal2 application, the version of python3 in /usr/bin has to be used - - name: Install Requirements - run: | brew 
install coreutils - /usr/bin/python3 -m pip install -U --user pip Pyinstaller -r requirements.txt + python3 -m venv ~/yt-dlp-build-venv + source ~/yt-dlp-build-venv/bin/activate + python3 devscripts/install_deps.py -o --include build + python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt + # We need to ignore wheels otherwise we break universal2 builds + python3 -m pip install -U --no-binary :all: -r requirements.txt + # We need to fuse our own universal2 wheels for curl_cffi + python3 -m pip install -U delocate + mkdir curl_cffi_whls curl_cffi_universal2 + python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt + for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do + python3 -m pip download \ + --only-binary=:all: \ + --platform "${platform}" \ + -d curl_cffi_whls \ + -r requirements.txt + done + ( # Overwrite x86_64-only libs with fat/universal2 libs or else Pyinstaller will do the opposite + # See https://github.com/yt-dlp/yt-dlp/pull/10069 + cd curl_cffi_whls + mkdir -p curl_cffi/.dylibs + python_libdir=$(python3 -c 'import sys; from pathlib import Path; print(Path(sys.path[1]).parent)') + for dylib in lib{ssl,crypto}.3.dylib; do + cp "${python_libdir}/${dylib}" "curl_cffi/.dylibs/${dylib}" + for wheel in curl_cffi*macos*x86_64.whl; do + zip "${wheel}" "curl_cffi/.dylibs/${dylib}" + done + done + ) + python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/curl_cffi*.whl -w curl_cffi_universal2 + python3 -m delocate.cmd.delocate_fuse curl_cffi_whls/cffi*.whl -w curl_cffi_universal2 + for wheel in curl_cffi_universal2/*cffi*.whl; do + mv -n -- "${wheel}" "${wheel/x86_64/universal2}" + done + python3 -m pip install --force-reinstall -U curl_cffi_universal2/*cffi*.whl - - name: Prepare - run: | - /usr/bin/python3 devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} - /usr/bin/python3 devscripts/make_lazy_extractors.py - - name: Build - run: | - /usr/bin/python3 pyinst.py --target-architecture universal2 --onedir + - name: Prepare + run: | + python3 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" + python3 devscripts/make_lazy_extractors.py + - name: Build + run: | + source ~/yt-dlp-build-venv/bin/activate + python3 -m bundle.pyinstaller --target-architecture universal2 --onedir (cd ./dist/yt-dlp_macos && zip -r ../yt-dlp_macos.zip .) 
- /usr/bin/python3 pyinst.py --target-architecture universal2 + python3 -m bundle.pyinstaller --target-architecture universal2 - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - path: | - dist/yt-dlp_macos - dist/yt-dlp_macos.zip + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./dist/yt-dlp_macos + cp ./dist/yt-dlp_macos ./dist/yt-dlp_macos_downgraded + version="$(./dist/yt-dlp_macos --version)" + ./dist/yt-dlp_macos_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_macos_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + dist/yt-dlp_macos + dist/yt-dlp_macos.zip + compression-level: 0 - build_macos_legacy: - runs-on: macos-latest - needs: prepare + - name: Cleanup cache + if: steps.restore-cache.outputs.cache-hit == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + cache_key: cache-reqs-${{ github.job }} + repository: ${{ github.repository }} + branch: ${{ github.ref }} + run: | + gh extension install actions/gh-actions-cache + gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm + + - name: Cache requirements + uses: actions/cache/save@v4 + with: + path: | + ~/yt-dlp-build-venv + key: cache-reqs-${{ github.job }} + + macos_legacy: + needs: process + if: inputs.macos_legacy + runs-on: macos-12 steps: - - uses: actions/checkout@v3 - - name: Install Python - # We need the official Python, because the GA ones only support newer macOS versions - env: - PYTHON_VERSION: 3.10.5 - MACOSX_DEPLOYMENT_TARGET: 10.9 # Used up by the Python build tools - run: | + - uses: actions/checkout@v4 + - name: Install Python + # We need the official Python, because the GA ones only support newer macOS versions + env: + PYTHON_VERSION: 3.10.5 + MACOSX_DEPLOYMENT_TARGET: 10.9 # Used up by the Python build tools + run: | # Hack to get the latest patch version. 
Uncomment if needed #brew install python@3.10 #export PYTHON_VERSION=$( $(brew --prefix)/opt/python@3.10/bin/python3 --version | cut -d ' ' -f 2 ) - curl https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-macos11.pkg -o "python.pkg" + curl "https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-macos11.pkg" -o "python.pkg" sudo installer -pkg python.pkg -target / python3 --version - - name: Install Requirements - run: | + - name: Install Requirements + run: | brew install coreutils - python3 -m pip install -U --user pip Pyinstaller -r requirements.txt + python3 devscripts/install_deps.py --user -o --include build + python3 devscripts/install_deps.py --user --include pyinstaller - - name: Prepare - run: | - python3 devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} + - name: Prepare + run: | + python3 devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python3 devscripts/make_lazy_extractors.py - - name: Build - run: | - python3 pyinst.py + - name: Build + run: | + python3 -m bundle.pyinstaller mv dist/yt-dlp_macos dist/yt-dlp_macos_legacy - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - path: | - dist/yt-dlp_macos_legacy + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + chmod +x ./dist/yt-dlp_macos_legacy + cp ./dist/yt-dlp_macos_legacy ./dist/yt-dlp_macos_legacy_downgraded + version="$(./dist/yt-dlp_macos_legacy --version)" + ./dist/yt-dlp_macos_legacy_downgraded -v --update-to yt-dlp/yt-dlp@2023.03.04 + downgraded_version="$(./dist/yt-dlp_macos_legacy_downgraded --version)" + [[ "$version" != "$downgraded_version" ]] + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + dist/yt-dlp_macos_legacy + compression-level: 0 - build_windows: + windows: + needs: process + if: inputs.windows runs-on: windows-latest - needs: prepare steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: # 3.8 is used for Win7 support - python-version: '3.8' - - name: Install Requirements - run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds - python -m pip install -U pip setuptools wheel py2exe - pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" -r requirements.txt + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: # 3.8 is used for Win7 support + python-version: "3.8" + - name: Install Requirements + run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds + python devscripts/install_deps.py -o --include build + python devscripts/install_deps.py --include curl-cffi + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.7.0-py3-none-any.whl" - - name: Prepare - run: | - python devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} + - name: Prepare + run: | + python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python devscripts/make_lazy_extractors.py - - name: Build - run: | - python setup.py py2exe - Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe - python pyinst.py - python pyinst.py --onedir + - name: Build + run: | + python -m bundle.pyinstaller + python -m bundle.pyinstaller --onedir + Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_real.exe Compress-Archive -Path ./dist/yt-dlp/* 
-DestinationPath ./dist/yt-dlp_win.zip - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - path: | - dist/yt-dlp.exe - dist/yt-dlp_min.exe - dist/yt-dlp_win.zip + - name: Install Requirements (py2exe) + run: | + python devscripts/install_deps.py --include py2exe + - name: Build (py2exe) + run: | + python -m bundle.py2exe + Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe + Move-Item ./dist/yt-dlp_real.exe ./dist/yt-dlp.exe + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + foreach ($name in @("yt-dlp","yt-dlp_min")) { + Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" + $version = & "./dist/${name}.exe" --version + & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 + $downgraded_version = & "./dist/${name}_downgraded.exe" --version + if ($version -eq $downgraded_version) { + exit 1 + } + } - build_windows32: + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + dist/yt-dlp.exe + dist/yt-dlp_min.exe + dist/yt-dlp_win.zip + compression-level: 0 + + windows32: + needs: process + if: inputs.windows32 runs-on: windows-latest - needs: prepare steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: # 3.7 is used for Vista support. See https://github.com/yt-dlp/yt-dlp/issues/390 - python-version: '3.7' - architecture: 'x86' - - name: Install Requirements - run: | - python -m pip install -U pip setuptools wheel - pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-5.8.0-py3-none-any.whl" -r requirements.txt + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.8" + architecture: "x86" + - name: Install Requirements + run: | + python devscripts/install_deps.py -o --include build + python devscripts/install_deps.py + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.7.0-py3-none-any.whl" - - name: Prepare - run: | - python devscripts/update-version.py ${{ needs.prepare.outputs.version_suffix }} + - name: Prepare + run: | + python devscripts/update-version.py -c "${{ inputs.channel }}" -r "${{ needs.process.outputs.origin }}" "${{ inputs.version }}" python devscripts/make_lazy_extractors.py - - name: Build - run: | - python pyinst.py + - name: Build + run: | + python -m bundle.pyinstaller - - name: Upload artifacts - uses: actions/upload-artifact@v3 - with: - path: | - dist/yt-dlp_x86.exe + - name: Verify --update-to + if: vars.UPDATE_TO_VERIFICATION + run: | + foreach ($name in @("yt-dlp_x86")) { + Copy-Item "./dist/${name}.exe" "./dist/${name}_downgraded.exe" + $version = & "./dist/${name}.exe" --version + & "./dist/${name}_downgraded.exe" -v --update-to yt-dlp/yt-dlp@2023.03.04 + $downgraded_version = & "./dist/${name}_downgraded.exe" --version + if ($version -eq $downgraded_version) { + exit 1 + } + } + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-bin-${{ github.job }} + path: | + dist/yt-dlp_x86.exe + compression-level: 0 - publish_release: - permissions: - contents: write # for action-gh-release + meta_files: + if: always() && !cancelled() + needs: + - process + - unix + - linux_static + - linux_arm + - macos + - macos_legacy + - windows + - windows32 runs-on: ubuntu-latest - needs: [prepare, build_unix, build_linux_arm, build_windows, build_windows32, build_macos, build_macos_legacy] - steps: - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: 
actions/download-artifact@v4
+        with:
+          path: artifact
+          pattern: build-bin-*
+          merge-multiple: true

-      - name: Get Changelog
-        run: |
-          changelog=$(grep -oPz '(?s)(?<=### ${{ needs.prepare.outputs.ytdlp_version }}\n{2}).+?(?=\n{2,3}###)' Changelog.md) || true
-          echo "changelog<<EOF" >> $GITHUB_ENV
-          echo "$changelog" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-      - name: Make Update spec
-        run: |
-          echo "# This file is used for regulating self-update" >> _update_spec
-          echo "lock 2022.07.18 .+ Python 3.6" >> _update_spec
-      - name: Make SHA2-SUMS files
-        run: |
-          sha256sum artifact/yt-dlp | awk '{print $1 " yt-dlp"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp.tar.gz | awk '{print $1 " yt-dlp.tar.gz"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp.exe | awk '{print $1 " yt-dlp.exe"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_win.zip | awk '{print $1 " yt-dlp_win.zip"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_min.exe | awk '{print $1 " yt-dlp_min.exe"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_x86.exe | awk '{print $1 " yt-dlp_x86.exe"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_macos | awk '{print $1 " yt-dlp_macos"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_macos.zip | awk '{print $1 " yt-dlp_macos.zip"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_macos_legacy | awk '{print $1 " yt-dlp_macos_legacy"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_linux_armv7l | awk '{print $1 " yt-dlp_linux_armv7l"}' >> SHA2-256SUMS
-          sha256sum artifact/yt-dlp_linux_aarch64 | awk '{print $1 " yt-dlp_linux_aarch64"}' >> SHA2-256SUMS
-          sha256sum artifact/dist/yt-dlp_linux | awk '{print $1 " yt-dlp_linux"}' >> SHA2-256SUMS
-          sha256sum artifact/dist/yt-dlp_linux.zip | awk '{print $1 " yt-dlp_linux.zip"}' >> SHA2-256SUMS
-          sha512sum artifact/yt-dlp | awk '{print $1 " yt-dlp"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp.tar.gz | awk '{print $1 " yt-dlp.tar.gz"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp.exe | awk '{print $1 " yt-dlp.exe"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_win.zip | awk '{print $1 " yt-dlp_win.zip"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_min.exe | awk '{print $1 " yt-dlp_min.exe"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_x86.exe | awk '{print $1 " yt-dlp_x86.exe"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_macos | awk '{print $1 " yt-dlp_macos"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_macos.zip | awk '{print $1 " yt-dlp_macos.zip"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_macos_legacy | awk '{print $1 " yt-dlp_macos_legacy"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_linux_armv7l | awk '{print $1 " yt-dlp_linux_armv7l"}' >> SHA2-512SUMS
-          sha512sum artifact/yt-dlp_linux_aarch64 | awk '{print $1 " yt-dlp_linux_aarch64"}' >> SHA2-512SUMS
-          sha512sum artifact/dist/yt-dlp_linux | awk '{print $1 " yt-dlp_linux"}' >> SHA2-512SUMS
-          sha512sum artifact/dist/yt-dlp_linux.zip | awk '{print $1 " yt-dlp_linux.zip"}' >> SHA2-512SUMS
+      - name: Make SHA2-SUMS files
+        run: |
+          cd ./artifact/
+          # make sure SHA sums are also printed to stdout
+          sha256sum -- * | tee ../SHA2-256SUMS
+          sha512sum -- * | tee ../SHA2-512SUMS
+          # also print as permanent annotations to the summary page
+          while read -r shasum; do
+            echo "::notice title=${shasum##* }::sha256: ${shasum% *}"
+          done < ../SHA2-256SUMS

-      - name: Publish Release
-        uses: yt-dlp/action-gh-release@v1
-        with:
-          tag_name: ${{ needs.prepare.outputs.ytdlp_version }}
-          name: yt-dlp ${{ needs.prepare.outputs.ytdlp_version }}
-          target_commitish: ${{ needs.prepare.outputs.head_sha }}
-          body: |
-            #### [A description of the various
files]((https://github.com/yt-dlp/yt-dlp#release-files)) are in the README + - name: Make Update spec + run: | + cat >> _update_spec << EOF + # This file is used for regulating self-update + lock 2022.08.18.36 .+ Python 3\.6 + lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 + lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 + lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) + lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ Windows-(?:Vista|2008Server) + EOF - --- -

<details open><summary><h3>Changelog</h3></summary>

+ - name: Sign checksum files + env: + GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} + if: env.GPG_SIGNING_KEY != '' + run: | + gpg --batch --import <<< "${{ secrets.GPG_SIGNING_KEY }}" + for signfile in ./SHA*SUMS; do + gpg --batch --detach-sign "$signfile" + done - ${{ env.changelog }} - -

- - files: | - SHA2-256SUMS - SHA2-512SUMS - artifact/yt-dlp - artifact/yt-dlp.tar.gz - artifact/yt-dlp.exe - artifact/yt-dlp_win.zip - artifact/yt-dlp_min.exe - artifact/yt-dlp_x86.exe - artifact/yt-dlp_macos - artifact/yt-dlp_macos.zip - artifact/yt-dlp_macos_legacy - artifact/yt-dlp_linux_armv7l - artifact/yt-dlp_linux_aarch64 - artifact/dist/yt-dlp_linux - artifact/dist/yt-dlp_linux.zip - _update_spec + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: build-${{ github.job }} + path: | + _update_spec + SHA*SUMS* + compression-level: 0 + overwrite: true diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..170a6ac19f --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,65 @@ +name: "CodeQL" + +on: + push: + branches: [ 'master', 'gh-pages', 'release' ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ 'master' ] + schedule: + - cron: '59 11 * * 5' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Use only 'java' to analyze code written in Java, Kotlin or both + # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
+ + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index dead444c0b..fdfdebc65d 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -1,8 +1,32 @@ name: Core Tests -on: [push, pull_request] +on: + push: + paths: + - .github/** + - devscripts/** + - test/** + - yt_dlp/**.py + - '!yt_dlp/extractor/*.py' + - yt_dlp/extractor/__init__.py + - yt_dlp/extractor/common.py + - yt_dlp/extractor/extractors.py + pull_request: + paths: + - .github/** + - devscripts/** + - test/** + - yt_dlp/**.py + - '!yt_dlp/extractor/*.py' + - yt_dlp/extractor/__init__.py + - yt_dlp/extractor/common.py + - yt_dlp/extractor/extractors.py permissions: contents: read +concurrency: + group: core-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: tests: name: Core Tests @@ -12,27 +36,26 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - # CPython 3.11 is in quick-test - python-version: ['3.8', '3.9', '3.10', pypy-3.7, pypy-3.8] - run-tests-ext: [sh] + # CPython 3.8 is in quick-test + python-version: ['3.9', '3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.7' - run-tests-ext: bat + python-version: '3.8' + - os: windows-latest + python-version: '3.12' - os: windows-latest python-version: pypy-3.9 - run-tests-ext: bat steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install pytest - run: pip install pytest + - name: Install test requirements + run: python3 ./devscripts/install_deps.py --include test --include curl-cffi - name: Run tests continue-on-error: False run: | python3 -m yt_dlp -v || true # Print debug head - ./devscripts/run_tests.${{ matrix.run-tests-ext }} core + python3 ./devscripts/run_tests.py core diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index 2b2387d4f1..7256804d93 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -9,16 +9,16 @@ jobs: if: "contains(github.event.head_commit.message, 'ci run dl')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.9 - name: Install test requirements - run: pip install pytest + run: python3 ./devscripts/install_deps.py --include dev - name: Run tests continue-on-error: true - run: ./devscripts/run_tests.sh download + run: python3 ./devscripts/run_tests.py download full: name: Full Download Tests @@ -28,24 +28,21 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.7', '3.10', 3.11-dev, pypy-3.7, pypy-3.8] - run-tests-ext: [sh] + python-version: ['3.10', '3.11', '3.12', pypy-3.8, pypy-3.10] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest python-version: '3.8' - run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 - run-tests-ext: bat steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ 
matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install pytest - run: pip install pytest + - name: Install test requirements + run: python3 ./devscripts/install_deps.py --include dev - name: Run tests continue-on-error: true - run: ./devscripts/run_tests.${{ matrix.run-tests-ext }} download + run: python3 ./devscripts/run_tests.py download diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 930e58152d..3afb51a308 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -9,27 +9,31 @@ jobs: if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.11 - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 with: - python-version: '3.11' + python-version: '3.8' - name: Install test requirements - run: pip install pytest pycryptodomex + run: python3 ./devscripts/install_deps.py --include test - name: Run tests run: | python3 -m yt_dlp -v || true - ./devscripts/run_tests.sh core - flake8: - name: Linter + python3 ./devscripts/run_tests.py core + check: + name: Code check if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - - name: Install flake8 - run: pip install flake8 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.8' + - name: Install dev dependencies + run: python3 ./devscripts/install_deps.py -o --include static-analysis - name: Make lazy extractors - run: python devscripts/make_lazy_extractors.py - - name: Run flake8 - run: flake8 . + run: python3 ./devscripts/make_lazy_extractors.py + - name: Run ruff + run: ruff check --output-format github . + - name: Run autopep8 + run: autopep8 --diff . 
diff --git a/.github/workflows/release-master.yml b/.github/workflows/release-master.yml new file mode 100644 index 0000000000..c49319b171 --- /dev/null +++ b/.github/workflows/release-master.yml @@ -0,0 +1,30 @@ +name: Release (master) +on: + push: + branches: + - master + paths: + - "yt_dlp/**.py" + - "!yt_dlp/version.py" + - "bundle/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/build.yml" +concurrency: + group: release-master +permissions: + contents: read + +jobs: + release: + if: vars.BUILD_MASTER != '' + uses: ./.github/workflows/release.yml + with: + prerelease: true + source: master + permissions: + contents: write + packages: write # For package cache + actions: write # For cleaning up cache + id-token: write # mandatory for trusted publishing + secrets: inherit diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml new file mode 100644 index 0000000000..b536c50669 --- /dev/null +++ b/.github/workflows/release-nightly.yml @@ -0,0 +1,43 @@ +name: Release (nightly) +on: + schedule: + - cron: '23 23 * * *' +permissions: + contents: read + +jobs: + check_nightly: + if: vars.BUILD_NIGHTLY != '' + runs-on: ubuntu-latest + outputs: + commit: ${{ steps.check_for_new_commits.outputs.commit }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Check for new commits + id: check_for_new_commits + run: | + relevant_files=( + "yt_dlp/*.py" + ':!yt_dlp/version.py' + "bundle/*.py" + "pyproject.toml" + "Makefile" + ".github/workflows/build.yml" + ) + echo "commit=$(git log --format=%H -1 --since="24 hours ago" -- "${relevant_files[@]}")" | tee "$GITHUB_OUTPUT" + + release: + needs: [check_nightly] + if: ${{ needs.check_nightly.outputs.commit }} + uses: ./.github/workflows/release.yml + with: + prerelease: true + source: nightly + permissions: + contents: write + packages: write # For package cache + actions: write # For cleaning up cache + id-token: write # mandatory for trusted publishing + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..fa5ad7e515 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,384 @@ +name: Release +on: + workflow_call: + inputs: + prerelease: + required: false + default: true + type: boolean + source: + required: false + default: '' + type: string + target: + required: false + default: '' + type: string + version: + required: false + default: '' + type: string + workflow_dispatch: + inputs: + source: + description: | + SOURCE of this release's updates: + channel, repo, tag, or channel/repo@tag + (default: ) + required: false + default: '' + type: string + target: + description: | + TARGET to publish this release to: + channel, tag, or channel@tag + (default: if writable else [@source_tag]) + required: false + default: '' + type: string + version: + description: | + VERSION: yyyy.mm.dd[.rev] or rev + (default: auto-generated) + required: false + default: '' + type: string + prerelease: + description: Pre-release + default: false + type: boolean + +permissions: + contents: read + +jobs: + prepare: + permissions: + contents: write + runs-on: ubuntu-latest + outputs: + channel: ${{ steps.setup_variables.outputs.channel }} + version: ${{ steps.setup_variables.outputs.version }} + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + target_repo_token: ${{ steps.setup_variables.outputs.target_repo_token }} + target_tag: ${{ steps.setup_variables.outputs.target_tag }} + pypi_project: ${{ 
steps.setup_variables.outputs.pypi_project }} + pypi_suffix: ${{ steps.setup_variables.outputs.pypi_suffix }} + head_sha: ${{ steps.get_target.outputs.head_sha }} + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Process inputs + id: process_inputs + run: | + cat << EOF + ::group::Inputs + prerelease=${{ inputs.prerelease }} + source=${{ inputs.source }} + target=${{ inputs.target }} + version=${{ inputs.version }} + ::endgroup:: + EOF + IFS='@' read -r source_repo source_tag <<<"${{ inputs.source }}" + IFS='@' read -r target_repo target_tag <<<"${{ inputs.target }}" + cat << EOF >> "$GITHUB_OUTPUT" + source_repo=${source_repo} + source_tag=${source_tag} + target_repo=${target_repo} + target_tag=${target_tag} + EOF + + - name: Setup variables + id: setup_variables + env: + source_repo: ${{ steps.process_inputs.outputs.source_repo }} + source_tag: ${{ steps.process_inputs.outputs.source_tag }} + target_repo: ${{ steps.process_inputs.outputs.target_repo }} + target_tag: ${{ steps.process_inputs.outputs.target_tag }} + run: | + # unholy bash monstrosity (sincere apologies) + fallback_token () { + if ${{ !secrets.ARCHIVE_REPO_TOKEN }}; then + echo "::error::Repository access secret ${target_repo_token^^} not found" + exit 1 + fi + target_repo_token=ARCHIVE_REPO_TOKEN + return 0 + } + + source_is_channel=0 + [[ "${source_repo}" == 'stable' ]] && source_repo='yt-dlp/yt-dlp' + if [[ -z "${source_repo}" ]]; then + source_repo='${{ github.repository }}' + elif [[ '${{ vars[format('{0}_archive_repo', env.source_repo)] }}' ]]; then + source_is_channel=1 + source_channel='${{ vars[format('{0}_archive_repo', env.source_repo)] }}' + elif [[ -z "${source_tag}" && "${source_repo}" != */* ]]; then + source_tag="${source_repo}" + source_repo='${{ github.repository }}' + fi + resolved_source="${source_repo}" + if [[ "${source_tag}" ]]; then + resolved_source="${resolved_source}@${source_tag}" + elif [[ "${source_repo}" == 'yt-dlp/yt-dlp' ]]; then + resolved_source='stable' + fi + + revision="${{ (inputs.prerelease || !vars.PUSH_VERSION_COMMIT) && '$(date -u +"%H%M%S")' || '' }}" + version="$( + python devscripts/update-version.py \ + -c "${resolved_source}" -r "${{ github.repository }}" ${{ inputs.version || '$revision' }} | \ + grep -Po "version=\K\d+\.\d+\.\d+(\.\d+)?")" + + if [[ "${target_repo}" ]]; then + if [[ -z "${target_tag}" ]]; then + if [[ '${{ vars[format('{0}_archive_repo', env.target_repo)] }}' ]]; then + target_tag="${source_tag:-${version}}" + else + target_tag="${target_repo}" + target_repo='${{ github.repository }}' + fi + fi + if [[ "${target_repo}" != '${{ github.repository}}' ]]; then + target_repo='${{ vars[format('{0}_archive_repo', env.target_repo)] }}' + target_repo_token='${{ env.target_repo }}_archive_repo_token' + ${{ !!secrets[format('{0}_archive_repo_token', env.target_repo)] }} || fallback_token + pypi_project='${{ vars[format('{0}_pypi_project', env.target_repo)] }}' + pypi_suffix='${{ vars[format('{0}_pypi_suffix', env.target_repo)] }}' + fi + else + target_tag="${source_tag:-${version}}" + if ((source_is_channel)); then + target_repo="${source_channel}" + target_repo_token='${{ env.source_repo }}_archive_repo_token' + ${{ !!secrets[format('{0}_archive_repo_token', env.source_repo)] }} || fallback_token + pypi_project='${{ vars[format('{0}_pypi_project', env.source_repo)] }}' + pypi_suffix='${{ vars[format('{0}_pypi_suffix', env.source_repo)] }}' + else + target_repo='${{ 
github.repository }}' + fi + fi + + if [[ "${target_repo}" == '${{ github.repository }}' ]] && ${{ !inputs.prerelease }}; then + pypi_project='${{ vars.PYPI_PROJECT }}' + fi + + echo "::group::Output variables" + cat << EOF | tee -a "$GITHUB_OUTPUT" + channel=${resolved_source} + version=${version} + target_repo=${target_repo} + target_repo_token=${target_repo_token} + target_tag=${target_tag} + pypi_project=${pypi_project} + pypi_suffix=${pypi_suffix} + EOF + echo "::endgroup::" + + - name: Update documentation + env: + version: ${{ steps.setup_variables.outputs.version }} + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + if: | + !inputs.prerelease && env.target_repo == github.repository + run: | + python devscripts/update_changelog.py -vv + make doc + + - name: Push to release + id: push_release + env: + version: ${{ steps.setup_variables.outputs.version }} + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + if: | + !inputs.prerelease && env.target_repo == github.repository + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add -u + git commit -m "Release ${{ env.version }}" \ + -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" + git push origin --force ${{ github.event.ref }}:release + + - name: Get target commitish + id: get_target + run: | + echo "head_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" + + - name: Update master + env: + target_repo: ${{ steps.setup_variables.outputs.target_repo }} + if: | + vars.PUSH_VERSION_COMMIT != '' && !inputs.prerelease && env.target_repo == github.repository + run: git push origin ${{ github.event.ref }} + + build: + needs: prepare + uses: ./.github/workflows/build.yml + with: + version: ${{ needs.prepare.outputs.version }} + channel: ${{ needs.prepare.outputs.channel }} + origin: ${{ needs.prepare.outputs.target_repo }} + permissions: + contents: read + packages: write # For package cache + actions: write # For cleaning up cache + secrets: + GPG_SIGNING_KEY: ${{ secrets.GPG_SIGNING_KEY }} + + publish_pypi: + needs: [prepare, build] + if: ${{ needs.prepare.outputs.pypi_project }} + runs-on: ubuntu-latest + permissions: + id-token: write # mandatory for trusted publishing + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install Requirements + run: | + sudo apt -y install pandoc man + python devscripts/install_deps.py -o --include build + + - name: Prepare + env: + version: ${{ needs.prepare.outputs.version }} + suffix: ${{ needs.prepare.outputs.pypi_suffix }} + channel: ${{ needs.prepare.outputs.channel }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + pypi_project: ${{ needs.prepare.outputs.pypi_project }} + run: | + python devscripts/update-version.py -c "${{ env.channel }}" -r "${{ env.target_repo }}" -s "${{ env.suffix }}" "${{ env.version }}" + python devscripts/update_changelog.py -vv + python devscripts/make_lazy_extractors.py + sed -i -E '0,/(name = ")[^"]+(")/s//\1${{ env.pypi_project }}\2/' pyproject.toml + + - name: Build + run: | + rm -rf dist/* + make pypi-files + printf '%s\n\n' \ + 'Official repository: ' \ + '**PS**: Some links in this document will not work since this is a copy of the README.md from Github' > ./README.md.new + cat ./README.md >> ./README.md.new && mv -f ./README.md.new ./README.md + python devscripts/set-variant.py pip -M "You installed yt-dlp with 
pip or using the wheel from PyPi; Use that to update" + make clean-cache + python -m build --no-isolation . + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + verbose: true + + publish: + needs: [prepare, build] + permissions: + contents: write + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: actions/download-artifact@v4 + with: + path: artifact + pattern: build-* + merge-multiple: true + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Generate release notes + env: + head_sha: ${{ needs.prepare.outputs.head_sha }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + target_tag: ${{ needs.prepare.outputs.target_tag }} + run: | + printf '%s' \ + '[![Installation](https://img.shields.io/badge/-Which%20file%20to%20download%3F-white.svg?style=for-the-badge)]' \ + '(https://github.com/${{ github.repository }}#installation "Installation instructions") ' \ + '[![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)]' \ + '(https://discord.gg/H5MNcFW63r "Discord") ' \ + '[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)]' \ + '(https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators "Donate") ' \ + '[![Documentation](https://img.shields.io/badge/-Docs-brightgreen.svg?style=for-the-badge&logo=GitBook&labelColor=555555)]' \ + '(https://github.com/${{ github.repository }}' \ + '${{ env.target_repo == github.repository && format('/tree/{0}', env.target_tag) || '' }}#readme "Documentation") ' \ + ${{ env.target_repo == 'yt-dlp/yt-dlp' && '\ + "[![Nightly](https://img.shields.io/badge/Nightly%20builds-purple.svg?style=for-the-badge)]" \ + "(https://github.com/yt-dlp/yt-dlp-nightly-builds/releases/latest \"Nightly builds\") " \ + "[![Master](https://img.shields.io/badge/Master%20builds-lightblue.svg?style=for-the-badge)]" \ + "(https://github.com/yt-dlp/yt-dlp-master-builds/releases/latest \"Master builds\")"' || '' }} > ./RELEASE_NOTES + printf '\n\n' >> ./RELEASE_NOTES + cat >> ./RELEASE_NOTES << EOF + #### A description of the various files are in the [README](https://github.com/${{ github.repository }}#release-files) + --- + $(python ./devscripts/make_changelog.py -vv --collapsible) + EOF + printf '%s\n\n' '**This is a pre-release build**' >> ./PRERELEASE_NOTES + cat ./RELEASE_NOTES >> ./PRERELEASE_NOTES + printf '%s\n\n' 'Generated from: https://github.com/${{ github.repository }}/commit/${{ env.head_sha }}' >> ./ARCHIVE_NOTES + cat ./RELEASE_NOTES >> ./ARCHIVE_NOTES + + - name: Publish to archive repo + env: + GH_TOKEN: ${{ secrets[needs.prepare.outputs.target_repo_token] }} + GH_REPO: ${{ needs.prepare.outputs.target_repo }} + version: ${{ needs.prepare.outputs.version }} + channel: ${{ needs.prepare.outputs.channel }} + if: | + inputs.prerelease && env.GH_TOKEN != '' && env.GH_REPO != '' && env.GH_REPO != github.repository + run: | + title="${{ startswith(env.GH_REPO, 'yt-dlp/') && 'yt-dlp ' || '' }}${{ env.channel }}" + gh release create \ + --notes-file ARCHIVE_NOTES \ + --title "${title} ${{ env.version }}" \ + ${{ env.version }} \ + artifact/* + + - name: Prune old release + env: + GH_TOKEN: ${{ github.token }} + version: ${{ needs.prepare.outputs.version }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + target_tag: ${{ needs.prepare.outputs.target_tag }} + if: | + env.target_repo == github.repository && 
env.target_tag != env.version + run: | + gh release delete --yes --cleanup-tag "${{ env.target_tag }}" || true + git tag --delete "${{ env.target_tag }}" || true + sleep 5 # Enough time to cover deletion race condition + + - name: Publish release + env: + GH_TOKEN: ${{ github.token }} + version: ${{ needs.prepare.outputs.version }} + target_repo: ${{ needs.prepare.outputs.target_repo }} + target_tag: ${{ needs.prepare.outputs.target_tag }} + head_sha: ${{ needs.prepare.outputs.head_sha }} + if: | + env.target_repo == github.repository + run: | + title="${{ github.repository == 'yt-dlp/yt-dlp' && 'yt-dlp ' || '' }}" + title+="${{ env.target_tag != env.version && format('{0} ', env.target_tag) || '' }}" + gh release create \ + --notes-file ${{ inputs.prerelease && 'PRERELEASE_NOTES' || 'RELEASE_NOTES' }} \ + --target ${{ env.head_sha }} \ + --title "${title}${{ env.version }}" \ + ${{ inputs.prerelease && '--prerelease' || '' }} \ + ${{ env.target_tag }} \ + artifact/* diff --git a/.gitignore b/.gitignore index 507ba8c7f1..fdd904f7fe 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ cookies *.gif *.jpeg *.jpg +*.lrc *.m4a *.m4v *.mhtml @@ -40,6 +41,7 @@ cookies *.mov *.mp3 *.mp4 +*.mpg *.mpga *.oga *.ogg @@ -47,8 +49,8 @@ cookies *.png *.sbv *.srt +*.ssa *.swf -*.swp *.tt *.ttml *.url @@ -64,7 +66,7 @@ cookies # Python *.pyc *.pyo -.pytest_cache +.*_cache wine-py2exe/ py2exe.log build/ @@ -116,6 +118,7 @@ yt-dlp.zip .vscode *.sublime-* *.code-workspace +*.swp # Lazy extractors */extractor/lazy_extractors.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..a821eeefb1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +repos: +- repo: local + hooks: + - id: linter + name: Apply linter fixes + entry: ruff check --fix . + language: system + types: [python] + require_serial: true + - id: format + name: Apply formatting fixes + entry: autopep8 --in-place . + language: system + types: [python] diff --git a/.pre-commit-hatch.yaml b/.pre-commit-hatch.yaml new file mode 100644 index 0000000000..fb7d25e1db --- /dev/null +++ b/.pre-commit-hatch.yaml @@ -0,0 +1,9 @@ +repos: +- repo: local + hooks: + - id: fix + name: Apply code fixes + entry: hatch fmt + language: system + types: [python] + require_serial: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 551db674e2..dbae6476f6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,7 +79,7 @@ ### Are you using the latest version? ### Is the issue already documented? -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2021.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, subscribe to it to be notified when there is any progress. Unless you have something useful to add to the conversation, please refrain from commenting. Additionally, it is also helpful to see if the issue has already been documented in the [youtube-dl issue tracker](https://github.com/ytdl-org/youtube-dl/issues). 
If similar issues have already been reported in youtube-dl (but not in our issue tracker), links to them can be included in your issue report here.
@@ -127,34 +127,66 @@ ### Are you willing to share account details if needed?

 ### Is the website primarily used for piracy?

-We follow [youtube-dl's policy](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) to not support services that is primarily used for infringing copyright. Additionally, it has been decided to not to support porn sites that specialize in deep fake. We also cannot support any service that serves only [DRM protected content](https://en.wikipedia.org/wiki/Digital_rights_management).
+We follow [youtube-dl's policy](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) to not support services that are primarily used for infringing copyright. Additionally, it has been decided not to support porn sites that specialize in fakes. We also cannot support any service that serves only [DRM protected content](https://en.wikipedia.org/wiki/Digital_rights_management).


 # DEVELOPER INSTRUCTIONS

-Most users do not need to build yt-dlp and can [download the builds](https://github.com/yt-dlp/yt-dlp/releases) or get them via [the other installation methods](README.md#installation).
+Most users do not need to build yt-dlp and can [download the builds](https://github.com/yt-dlp/yt-dlp/releases), get them via [the other installation methods](README.md#installation) or directly run it using `python -m yt_dlp`.

-To run yt-dlp as a developer, you don't need to build anything either. Simply execute
+`yt-dlp` uses [`hatch`](https://hatch.pypa.io) as a project management tool.
+You can easily install it using [`pipx`](https://pipx.pypa.io) via `pipx install hatch`, or else via `pip` or your package manager of choice. Make sure you are using at least version `1.10.0`, otherwise some functionality might not work as expected.

-    python -m yt_dlp
+If you plan on contributing to `yt-dlp`, best practice is to start by running the following command:

-To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work:
+```shell
+$ hatch run setup
+```

-    python -m unittest discover
-    python test/test_download.py
-    nosetests
-    pytest
+The above command will install a `pre-commit` hook so that required checks/fixes (linting, formatting) will run automatically before each commit. If any code needs to be linted or formatted, then the commit will be blocked and the necessary changes will be made; you should review all edits and re-commit the fixed version.
+
+After this you can use `hatch shell` to enable a virtual environment that has `yt-dlp` and its development dependencies installed.
+
+In addition, the following script commands can be used to run simple tasks such as linting or testing (without having to run `hatch shell` first):
+* `hatch fmt`: Automatically fix linter violations and apply required code formatting changes
+    * See `hatch fmt --help` for more info
+* `hatch test`: Run extractor or core tests
+    * See `hatch test --help` for more info

 See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases.
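As a companion to the `python -m yt_dlp` note above, your working copy can also be exercised through yt-dlp's Python API rather than the CLI, which is often convenient while iterating on an extractor. A minimal sketch (not part of the documented workflow; the URL is a placeholder for whatever site you are working on):

```python
import yt_dlp

# Roughly the API equivalent of `yt-dlp -v <URL>`; run this from your
# repository checkout so that your in-progress code is what gets imported.
with yt_dlp.YoutubeDL({'verbose': True}) as ydl:
    # download=False stops after metadata extraction, which is usually
    # all you need while developing
    info = ydl.extract_info('https://yourextractor.com/watch/42', download=False)
    print(info.get('id'), info.get('title'))
```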
+While it is strongly recommended to use `hatch` for yt-dlp development, if you are unable to do so, you can instead manually create a virtual environment and use the following commands:
+
+```shell
+# To only install development dependencies:
+$ python -m devscripts.install_deps --include dev
+
+# Or, for an editable install plus dev dependencies:
+$ python -m pip install -e ".[default,dev]"
+
+# To setup the pre-commit hook:
+$ pre-commit install
+
+# To be used in place of `hatch test`:
+$ python -m devscripts.run_tests
+
+# To be used in place of `hatch fmt`:
+$ ruff check --fix .
+$ autopep8 --in-place .
+
+# To only check code instead of applying fixes:
+$ ruff check .
+$ autopep8 --diff .
+```
+
 If you want to create a build of yt-dlp yourself, you can follow the instructions [here](README.md#compile).

 ## Adding new feature or making overarching changes

-Before you start writing code for implementing a new feature, open an issue explaining your feature request and atleast one use case. This allows the maintainers to decide whether such a feature is desired for the project in the first place, and will provide an avenue to discuss some implementation details. If you open a pull request for a new feature without discussing with us first, do not be surprised when we ask for large changes to the code, or even reject it outright.
+Before you start writing code for implementing a new feature, open an issue explaining your feature request and at least one use case. This allows the maintainers to decide whether such a feature is desired for the project in the first place, and will provide an avenue to discuss some implementation details. If you open a pull request for a new feature without discussing with us first, do not be surprised when we ask for large changes to the code, or even reject it outright.

 The same applies to changes to the documentation, code style, or overarching changes to the architecture.
@@ -168,41 +200,51 @@ ## Adding support for a new site
 1. [Fork this repository](https://github.com/yt-dlp/yt-dlp/fork)
 1. Check out the source code with:

-        git clone git@github.com:YOUR_GITHUB_USERNAME/yt-dlp.git
+    ```shell
+    $ git clone git@github.com:YOUR_GITHUB_USERNAME/yt-dlp.git
+    ```

 1. Start a new git branch with

-        cd yt-dlp
-        git checkout -b yourextractor
+    ```shell
+    $ cd yt-dlp
+    $ git checkout -b yourextractor
+    ```

 1. Start with this simple template and save it to `yt_dlp/extractor/yourextractor.py`:

    ```python
    from .common import InfoExtractor
-
-
+
+
    class YourExtractorIE(InfoExtractor):
        _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
        _TESTS = [{
            'url': 'https://yourextractor.com/watch/42',
            'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
            'info_dict': {
+                # For videos, only the 'id' and 'ext' fields are required to RUN the test:
                'id': '42',
                'ext': 'mp4',
-                'title': 'Video title goes here',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                # TODO more properties, either as:
-                # * A value
-                # * MD5 checksum; start the string with md5:
-                # * A regular expression; start the string with re:
-                # * Any Python type, e.g. int or float
+                # Then if the test run fails, it will output the missing/incorrect fields.
+                # Properties can be added as:
+                # * A value, e.g.
+                #     'title': 'Video title goes here',
+                # * MD5 checksum; start the string with 'md5:', e.g.
+                #     'description': 'md5:098f6bcd4621d373cade4e832627b4f6',
+                # * A regular expression; start the string with 're:', e.g.
+ # 'thumbnail': r're:^https?://.*\.jpg$', + # * A count of elements in a list; start the string with 'count:', e.g. + # 'tags': 'count:10', + # * Any Python type, e.g. + # 'view_count': int, } }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - + # TODO more code goes here, for example ... title = self._html_search_regex(r'
<h1>(.+?)</h1>
', webpage, 'title') @@ -214,27 +256,33 @@ ## Adding support for a new site # TODO more properties (see yt_dlp/extractor/common.py) } ``` -1. Add an import in [`yt_dlp/extractor/_extractors.py`](yt_dlp/extractor/_extractors.py). Note that the class name must end with `IE`. -1. Run `python test/test_download.py TestDownload.test_YourExtractor` (note that `YourExtractor` doesn't end with `IE`). This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, the tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. You can also run all the tests in one go with `TestDownload.test_YourExtractor_all` -1. Make sure you have atleast one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. -1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L91-L426). Add tests and code for as many as you want. -1. Make sure your code follows [yt-dlp coding conventions](#yt-dlp-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +1. Add an import in [`yt_dlp/extractor/_extractors.py`](yt_dlp/extractor/_extractors.py). Note that the class name must end with `IE`. Also note that when adding a parenthesized import group, the last import in the group must have a trailing comma in order for this formatting to be respected by our code formatter. +1. Run `hatch test YourExtractor`. This *may fail* at first, but you can continually re-run it until you're done. Upon failure, it will output the missing fields and/or correct values which you can copy. If you decide to add more than one test, the tests will then be named `YourExtractor`, `YourExtractor_1`, `YourExtractor_2`, etc. Note that tests with an `only_matching` key in the test's dict are not included in the count. You can also run all the tests in one go with `YourExtractor_all` +1. Make sure you have at least one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. +1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L119-L440). Add tests and code for as many as you want. +1. Make sure your code follows [yt-dlp coding conventions](#yt-dlp-coding-conventions), passes [ruff](https://docs.astral.sh/ruff/tutorial/#getting-started) code checks and is properly formatted: - $ flake8 yt_dlp/extractor/yourextractor.py + ```shell + $ hatch fmt --check + ``` -1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.7 and above. Backward compatibility is not required for even older versions of Python. + You can use `hatch fmt` to automatically fix problems. 
Rules that the linter/formatter enforces should not be disabled with `# noqa` unless a maintainer requests it. The only exception allowed is for old/printf-style string formatting in GraphQL query templates (use `# noqa: UP031`).
+
+1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.8 and above. Backward compatibility is not required for even older versions of Python.
 1. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files, [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:

-        $ git add yt_dlp/extractor/_extractors.py
-        $ git add yt_dlp/extractor/yourextractor.py
-        $ git commit -m '[yourextractor] Add extractor'
-        $ git push origin yourextractor
+    ```shell
+    $ git add yt_dlp/extractor/_extractors.py
+    $ git add yt_dlp/extractor/yourextractor.py
+    $ git commit -m '[yourextractor] Add extractor'
+    $ git push origin yourextractor
+    ```

 1. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.

 In any case, thank you very much for your contributions!

-**Tip:** To test extractors that require login information, create a file `test/local_parameters.json` and add `"usenetrc": true` or your username and password in it:
+**Tip:** To test extractors that require login information, create a file `test/local_parameters.json` and add `"usenetrc": true` or your `username` & `password` or `cookiefile`/`cookiesfrombrowser` in it:
 ```json
 {
     "username": "your user name",
@@ -246,12 +294,12 @@ ## yt-dlp coding conventions

 This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.

-Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the the extractor will remain broken.
+Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the extractor will remain broken.


 ### Mandatory and optional metafields

-For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L91-L426) or simply *info dict*.
Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp:
+For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L119-L440) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp:

 - `id` (media identifier)
 - `title` (media title)
@@ -261,7 +309,7 @@ ### Mandatory and optional metafields

 For pornographic sites, appropriate `age_limit` must also be returned.

-The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract usefull information with `--ignore-no-formats-error` - e.g. when the video is a live stream that has not started yet.
+The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract useful information with `--ignore-no-formats-error` - e.g. when the video is a live stream that has not started yet.

 [Any field](yt_dlp/extractor/common.py#L219-L426) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
@@ -272,7 +320,7 @@ #### Example
 ```python
 meta = self._download_json(url, video_id)
 ```
-
+
 Assume at this point `meta`'s layout is:

 ```python
@@ -696,15 +744,15 @@ #### Examples
 ### Use convenience conversion and parsing functions

-Wrap all extracted numeric data into safe functions from [`yt_dlp/utils.py`](yt_dlp/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
+Wrap all extracted numeric data into safe functions from [`yt_dlp/utils/`](yt_dlp/utils/): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.

 Use `url_or_none` for safe URL processing.

 Use `traverse_obj` and `try_call` (supersedes `dict_get` and `try_get`) for safe metadata extraction from parsed JSON.

-Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.
+Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution` for `resolution` extraction, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction.

-Explore [`yt_dlp/utils.py`](yt_dlp/utils.py) for more useful convenience functions.
+Explore [`yt_dlp/utils/`](yt_dlp/utils/) for more useful convenience functions.
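To make the above concrete, here is a hypothetical snippet (the field names and `meta` layout are invented for illustration) showing these helpers working together on parsed JSON:

```python
from yt_dlp.utils import (
    int_or_none,
    parse_duration,
    traverse_obj,
    unified_strdate,
    url_or_none,
)

# Hypothetical `meta` as returned by self._download_json(...)
meta = {
    'video': {'title': 'Example clip', 'duration': '1:23:45'},
    'stats': {'views': '1337'},
    'published': '2021-12-31',
    'thumb': 'N/A',  # sites sometimes return junk instead of a URL
}

info = {
    # traverse_obj walks nested structures and returns None on any missing key
    'title': traverse_obj(meta, ('video', 'title')),
    # int_or_none safely converts '1337' -> 1337 (or None if not a number)
    'view_count': int_or_none(traverse_obj(meta, ('stats', 'views'))),
    # parse_duration understands '1:23:45' and similar human-readable forms
    'duration': parse_duration(traverse_obj(meta, ('video', 'duration'))),
    # unified_strdate normalizes many date formats to YYYYMMDD ('20211231')
    'upload_date': unified_strdate(meta.get('published')),
    # url_or_none rejects the junk value and yields None instead of 'N/A'
    'thumbnail': url_or_none(meta.get('thumb')),
}
```

The point of the `*_or_none` style is that a malformed or missing source value degrades to `None` (an absent optional field) rather than raising an exception mid-extraction.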
#### Examples diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 10fb5775bb..489ab7da8b 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -2,7 +2,6 @@ pukkandan (owner) shirt-dev (collaborator) coletdjnz/colethedj (collaborator) Ashish0804 (collaborator) -nao20010128nao/Lesmiscore (collaborator) bashonly (collaborator) Grub4K (collaborator) h-h-h-h @@ -405,3 +404,256 @@ road-master rohieb sdht0 seproDev +Hill-98 +LXYan2333 +mushbite +venkata-krishnas +7vlad7 +alexklapheke +arobase-che +bepvte +bergoid +blmarket +brandon-dacrib +c-basalt +CoryTibbettsDev +Cyberes +D0LLYNH0 +danog +DataGhost +falbrechtskirchinger +foreignBlade +garret1317 +hasezoey +hoaluvn +ItzMaxTV +ivanskodje +jo-nike +kangalio +linsui +makew0rld +menschel +mikf +mrscrapy +NDagestad +Neurognostic +NextFire +nick-cd +permunkle +pzhlkj6612 +ringus1 +rjy +Schmoaaaaah +sjthespian +theperfectpunk +toomyzoom +truedread +TxI5 +unbeatable-101 +vampirefrog +vidiot720 +viktor-enzell +zhgwn +barthelmannk +berkanteber +OverlordQ +rexlambert22 +Ti4eeT4e +AmanSal1 +bbilly1 +meliber +nnoboa +rdamas +RfadnjdExt +urectanc +nao20010128nao/Lesmiscore +04-pasha-04 +aaruni96 +aky-01 +AmirAflak +ApoorvShah111 +at-wat +davinkevin +demon071 +denhotte +FinnRG +fireattack +Frankgoji +GD-Slime +hatsomatt +ifan-t +kshitiz305 +kylegustavo +mabdelfattah +nathantouze +niemands +Rajeshwaran2001 +RedDeffender +Rohxn16 +sb0stn +SevenLives +simon300000 +snixon +soundchaser128 +szabyg +trainman261 +trislee +wader +Yalab7 +zhallgato +zhong-yiyu +Zprokkel +AS6939 +drzraf +handlerug +jiru +madewokherd +xofe +awalgarg +midnightveil +naginatana +Riteo +1100101 +aniolpages +bartbroere +CrendKing +Esokrates +HitomaruKonpaku +LoserFox +peci1 +saintliao +shubhexists +SirElderling +almx +elivinsky +starius +TravisDupes +amir16yp +Fymyte +Ganesh910 +hashFactory +kclauhk +Kyraminol +lstrojny +middlingphys +NickCis +nicodato +prettykool +S-Aarab +sonmezberkay +TSRBerry +114514ns +agibson-fl +alard +alien-developers +antonkesy +ArnauvGilotra +Arthurszzz +Bibhav48 +Bl4Cc4t +boredzo +Caesim404 +chkuendig +chtk +Danish-H +dasidiot +diman8 +divStar +DmitryScaletta +feederbox826 +gmes78 +gonzalezjo +hui1601 +infanf +jazz1611 +jingtra +jkmartindale +johnvictorfs +llistochek +marcdumais +martinxyz +michal-repo +mrmedieval +nbr23 +Nicals +Noor-5 +NurTasin +pompos02 +Pranaxcau +pwaldhauer +RaduManole +RalphORama +rrgomes +ruiminggu +rvsit +sefidel +shmohawk +Snack-X +src-tinkerer +stilor +syntaxsurge +t-nil +ufukk +vista-narvas +x11x +xpadev-net +Xpl0itU +YoshichikaAAA +zhijinwuu +alb +hruzgar +kasper93 +leoheitmannruiz +luiso1979 +nipotan +Offert4324 +sta1us +Tomoka1 +trwstin +alexhuot1 +clienthax +DaPotato69 +emqi +hugohaa +imanoreotwe +JakeFinley96 +lostfictions +minamotorin +ocococococ +Podiumnoche +RasmusAntons +roeniss +shoxie007 +Szpachlarz +The-MAGI +TuxCoder +voidful +vtexier +WyohKnott +trueauracoral +ASertacAkkaya +axpauls +chilinux +hafeoz +JSubelj +jucor +megumintyan +mgedmin +Niluge-KiWi +peisenwang +TheZ3ro +tippfehlr +varunchopra +DrakoCpp +PatrykMis +DinhHuy2010 +exterrestris +harbhim +LeSuisse +DunnesH +iancmy +mokrueger +luvyana +szantnerb +hugepower +scribblemaniac diff --git a/Changelog.md b/Changelog.md index 8d3ac089ce..0b96ab29cd 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,15 +1,1558 @@ # Changelog +### 2024.08.06 + +#### Core changes +- **jsinterp**: [Improve `slice` implementation](https://github.com/yt-dlp/yt-dlp/commit/bb8bf1db993f59752d20b73b861bd55e40cf0e31) ([#10664](https://github.com/yt-dlp/yt-dlp/issues/10664)) by 
[seproDev](https://github.com/seproDev) + +#### Extractor changes +- **discoveryplusitaly**: [Support sport and olympics URLs](https://github.com/yt-dlp/yt-dlp/commit/e7d73bc4531ee3f91a46b15e218dcc1fbeb6226c) ([#10655](https://github.com/yt-dlp/yt-dlp/issues/10655)) by [bashonly](https://github.com/bashonly) +- **gem.cbc.ca**: live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fc5eecfa31c9571b6031cc3968aaa0394be55d7a) ([#10565](https://github.com/yt-dlp/yt-dlp/issues/10565)) by [bashonly](https://github.com/bashonly), [scribblemaniac](https://github.com/scribblemaniac) +- **niconico**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4d9231208332d4c32364b8cd814bff8b20232cae) ([#10677](https://github.com/yt-dlp/yt-dlp/issues/10677)) by [bashonly](https://github.com/bashonly) +- **olympics**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/919540a9644e55deb78cdd6751757ec8fdaf76f4) ([#10625](https://github.com/yt-dlp/yt-dlp/issues/10625)) by [bashonly](https://github.com/bashonly) +- **youku**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0088c6de23d832b117061a33e984dc452d992e9c) ([#10626](https://github.com/yt-dlp/yt-dlp/issues/10626)) by [hugepower](https://github.com/hugepower) +- **youtube** + - [Change default player clients to `ios,web_creator`](https://github.com/yt-dlp/yt-dlp/commit/406f4c2e47502fffc1b0c210b4ee6487c89a44cb) ([#10674](https://github.com/yt-dlp/yt-dlp/issues/10674)) by [bashonly](https://github.com/bashonly) + - [Fix `n` function name extraction for player `b12cc44b`](https://github.com/yt-dlp/yt-dlp/commit/c86891eb9434b4d7eec426d38c0c625b5e13cb2f) ([#10668](https://github.com/yt-dlp/yt-dlp/issues/10668)) by [seproDev](https://github.com/seproDev) + +### 2024.08.01 + +#### Core changes +- **utils**: `unified_timestamp`: [Recognize Sunday](https://github.com/yt-dlp/yt-dlp/commit/6daf2c27c0464fba98337be30de0b66d520d0db1) ([#10589](https://github.com/yt-dlp/yt-dlp/issues/10589)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **abematv**: [Fix availability extraction](https://github.com/yt-dlp/yt-dlp/commit/ef36d517f9b05785d61abca7691d9ab7d63cc75c) ([#10569](https://github.com/yt-dlp/yt-dlp/issues/10569)) by [middlingphys](https://github.com/middlingphys) +- **cbc.ca**: player: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/94a1c5e642e468cebeb51f74c6c220434cb47d96) ([#10302](https://github.com/yt-dlp/yt-dlp/issues/10302)) by [bashonly](https://github.com/bashonly), [trainman261](https://github.com/trainman261) +- **discoveryplus**: [Support olympics URLs](https://github.com/yt-dlp/yt-dlp/commit/0b7728618417e1aa382722a4d29b916b594d4459) ([#10566](https://github.com/yt-dlp/yt-dlp/issues/10566)) by [bashonly](https://github.com/bashonly) +- **kick**: clips: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bb3936ae2b3ce96d0b53f9e17cad1082058f032b) ([#10572](https://github.com/yt-dlp/yt-dlp/issues/10572)) by [luvyana](https://github.com/luvyana) +- **learningonscreen**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/fe15d3178e242803ae7a934b90137f13598eba2e) ([#10590](https://github.com/yt-dlp/yt-dlp/issues/10590)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7e3e4779ad13e4511c9ba3869879e53f0267bd7a) ([#10605](https://github.com/yt-dlp/yt-dlp/issues/10605)) by [szantnerb](https://github.com/szantnerb) +- **mlbtv**: [Fix makeup game 
extraction](https://github.com/yt-dlp/yt-dlp/commit/4b69e1b53ea21e631cd5dd68ff531e2f1671ec17) ([#10607](https://github.com/yt-dlp/yt-dlp/issues/10607)) by [bashonly](https://github.com/bashonly) +- **olympics**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2f1ddfe12a2c174bc777264c5c8ffe7ca0922d94) ([#10604](https://github.com/yt-dlp/yt-dlp/issues/10604)) by [bashonly](https://github.com/bashonly) +- **tva**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/28d485714fef88937c82635438afba5db81f9089) ([#10567](https://github.com/yt-dlp/yt-dlp/issues/10567)) by [bashonly](https://github.com/bashonly) +- **tver**: [Support olympic URLs](https://github.com/yt-dlp/yt-dlp/commit/5260696b1cba77161828941fdb38f09f14ac6c60) ([#10600](https://github.com/yt-dlp/yt-dlp/issues/10600)) by [vvto33](https://github.com/vvto33) +- **vimeo**: review: [Fix password-protected video extraction](https://github.com/yt-dlp/yt-dlp/commit/2b6df93a243bdfb9d6bb5c1e18020625cd02d465) ([#10598](https://github.com/yt-dlp/yt-dlp/issues/10598)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Change default player clients to `ios,tv`](https://github.com/yt-dlp/yt-dlp/commit/efb42763dec23ccf6a2e3bac3afbfefce8efd012) ([#10457](https://github.com/yt-dlp/yt-dlp/issues/10457)) by [seproDev](https://github.com/seproDev) + - [Fix `n` function name extraction for player `20dfca59`](https://github.com/yt-dlp/yt-dlp/commit/011b4a04db2a636c3ef0a0ad4e2d3ae482c9fd76) ([#10611](https://github.com/yt-dlp/yt-dlp/issues/10611)) by [bashonly](https://github.com/bashonly) + - [Fix age-verification workaround](https://github.com/yt-dlp/yt-dlp/commit/d19fcb934269465fd707e68a87f735ec6983e93d) ([#10610](https://github.com/yt-dlp/yt-dlp/issues/10610)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/0e539617a41913c7da1edd74fb6543c10ad727b3) ([#10573](https://github.com/yt-dlp/yt-dlp/issues/10573)) by [bashonly](https://github.com/bashonly) + +#### Misc. 
changes +- **cleanup**: Miscellaneous: [ffd7781](https://github.com/yt-dlp/yt-dlp/commit/ffd7781d6588926f820b44a34b9e6e3068fb9f97) by [bashonly](https://github.com/bashonly) + +### 2024.07.25 + +#### Extractor changes +- **abematv**: [Adapt key retrieval to request handler framework](https://github.com/yt-dlp/yt-dlp/commit/a3bab4752a2b3d56e5a59b4e0411bb8f695c010b) ([#10491](https://github.com/yt-dlp/yt-dlp/issues/10491)) by [bashonly](https://github.com/bashonly) +- **facebook**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1a34a802f44a1dab8f642c79c3cc810e21541d3b) ([#10531](https://github.com/yt-dlp/yt-dlp/issues/10531)) by [bashonly](https://github.com/bashonly) +- **mlbtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f0993391e6052ec8f7aacc286609564f226943b9) ([#10515](https://github.com/yt-dlp/yt-dlp/issues/10515)) by [bashonly](https://github.com/bashonly) +- **tiktok**: [Fix and deprioritize JSON subtitles](https://github.com/yt-dlp/yt-dlp/commit/2f97779f335ac069ecccd9c7bf81abf4a83cfe7a) ([#10516](https://github.com/yt-dlp/yt-dlp/issues/10516)) by [bashonly](https://github.com/bashonly) +- **vimeo**: [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a0a1bc3d8d8e3bb9a48a06e835815a0460e90e77) ([#10544](https://github.com/yt-dlp/yt-dlp/issues/10544)) by [bashonly](https://github.com/bashonly) +- **youtube**: [Fix `n` function name extraction for player `3400486c`](https://github.com/yt-dlp/yt-dlp/commit/713b4cd18f00556771af8cfdd9cea6cc1a09e948) ([#10542](https://github.com/yt-dlp/yt-dlp/issues/10542)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **build**: [Pin `setuptools` version](https://github.com/yt-dlp/yt-dlp/commit/e046db8a116b1c320d4785daadd48ea0b22a3987) ([#10493](https://github.com/yt-dlp/yt-dlp/issues/10493)) by [bashonly](https://github.com/bashonly) + +### 2024.07.16 + +#### Core changes +- [Fix `noprogress` if `test=True` with `--quiet` and `--verbose`](https://github.com/yt-dlp/yt-dlp/commit/66ce3d76d87af3f81cc9dfec4be4704016cb1cdb) ([#10454](https://github.com/yt-dlp/yt-dlp/issues/10454)) by [Grub4K](https://github.com/Grub4K) +- [Support `auto-tty` and `no_color-tty` for `--color`](https://github.com/yt-dlp/yt-dlp/commit/d9cbced493cae2008508d94a2db5dd98be7c01fc) ([#10453](https://github.com/yt-dlp/yt-dlp/issues/10453)) by [Grub4K](https://github.com/Grub4K) +- **update**: [Fix network error handling](https://github.com/yt-dlp/yt-dlp/commit/ed1b9ed93dd90d2cc960c0d8eaa9d919db224203) ([#10486](https://github.com/yt-dlp/yt-dlp/issues/10486)) by [bashonly](https://github.com/bashonly) +- **utils**: `parse_codecs`: [Fix parsing of mixed case codec strings](https://github.com/yt-dlp/yt-dlp/commit/cc0070f6496e501d77352bad475fb02d6a86846a) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **adn**: [Adjust for .com domain change](https://github.com/yt-dlp/yt-dlp/commit/959b7a379b8e5da059d110a63339c964b6265736) ([#10399](https://github.com/yt-dlp/yt-dlp/issues/10399)) by [infanf](https://github.com/infanf) +- **afreecatv**: [Fix login and use `legacy_ssl`](https://github.com/yt-dlp/yt-dlp/commit/4cd41469243624d90b7a2009b95cbe0609343efe) ([#10440](https://github.com/yt-dlp/yt-dlp/issues/10440)) by [bashonly](https://github.com/bashonly) +- **box**: [Support enterprise URLs](https://github.com/yt-dlp/yt-dlp/commit/705f5b84dec75cc7af97f42fd1530e8062735970) ([#10419](https://github.com/yt-dlp/yt-dlp/issues/10419)) by [seproDev](https://github.com/seproDev) +- **digitalconcerthall**: [Extract HEVC and 
FLAC formats](https://github.com/yt-dlp/yt-dlp/commit/e62fa6b0e0186f8c5666c2c5ab64cf191abdafc1) ([#10470](https://github.com/yt-dlp/yt-dlp/issues/10470)) by [bashonly](https://github.com/bashonly) +- **dplay**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/39e6c4cb44b9292e89ac0afec3cd0afc2ae8775f) ([#10471](https://github.com/yt-dlp/yt-dlp/issues/10471)) by [bashonly](https://github.com/bashonly) +- **epidemicsound**: [Support sound effects URLs](https://github.com/yt-dlp/yt-dlp/commit/8531d2b03bac9cc746f2ee8098aaf8f115505f5b) ([#10436](https://github.com/yt-dlp/yt-dlp/issues/10436)) by [iancmy](https://github.com/iancmy) +- **generic**: [Fix direct video link extensions](https://github.com/yt-dlp/yt-dlp/commit/b9afb99e7c34d0eb15ddc6689cd7d20eebfda68e) ([#10468](https://github.com/yt-dlp/yt-dlp/issues/10468)) by [bashonly](https://github.com/bashonly) +- **picarto**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/bacd18b7df08b4995644fd12cee1f8c8e8636bc7) ([#10414](https://github.com/yt-dlp/yt-dlp/issues/10414)) by [Frankgoji](https://github.com/Frankgoji) +- **soundcloud**: permalink, user: [Extract tracks only](https://github.com/yt-dlp/yt-dlp/commit/22870b81bad97dfa6307a7add44753b2dffc76a9) ([#10463](https://github.com/yt-dlp/yt-dlp/issues/10463)) by [DunnesH](https://github.com/DunnesH) +- **tiktok**: live: [Fix room ID extraction](https://github.com/yt-dlp/yt-dlp/commit/d2189d3d36987ebeac426fd70a60a5fe86325a2b) ([#10408](https://github.com/yt-dlp/yt-dlp/issues/10408)) by [mokrueger](https://github.com/mokrueger) +- **tv5monde**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/9b95a6765a5f6325af99c4aca961587f0c426e8c) ([#10417](https://github.com/yt-dlp/yt-dlp/issues/10417)) by [bashonly](https://github.com/bashonly) (With fixes in [cc1a309](https://github.com/yt-dlp/yt-dlp/commit/cc1a3098c00995c6aebc2a16bd1050a66bad64db)) +- **youtube** + - [Avoid poToken experiment player responses](https://github.com/yt-dlp/yt-dlp/commit/8b8b442cb005a8d85315f301615f83fb736b967a) ([#10456](https://github.com/yt-dlp/yt-dlp/issues/10456)) by [seproDev](https://github.com/seproDev) (With fixes in [16da8ef](https://github.com/yt-dlp/yt-dlp/commit/16da8ef9937ff76632dfef02e5062c5ba99c8ea2)) + - [Invalidate nsig cache from < 2024.07.09](https://github.com/yt-dlp/yt-dlp/commit/04e17ba20a139f1b3e30ec4bafa3fba26888f0b3) ([#10401](https://github.com/yt-dlp/yt-dlp/issues/10401)) by [bashonly](https://github.com/bashonly) + - [Reduce android client priority](https://github.com/yt-dlp/yt-dlp/commit/b85eef0a615a01304f88a3847309c667e09a20df) ([#10467](https://github.com/yt-dlp/yt-dlp/issues/10467)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- [Add `legacy_ssl` request extension](https://github.com/yt-dlp/yt-dlp/commit/150ecc45d9cacc919550c13b04fd998ac5103a6b) ([#10448](https://github.com/yt-dlp/yt-dlp/issues/10448)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler**: curl_cffi: [Support `curl_cffi` 0.7.X](https://github.com/yt-dlp/yt-dlp/commit/42bfca00a6b460fc053514cdd7ac6f5b5daddf0c) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes
+- **build**
+    - [Include `curl_cffi` in `yt-dlp_linux`](https://github.com/yt-dlp/yt-dlp/commit/4521f30d1479315cd5c3bf4abdad19391952df98) by [bashonly](https://github.com/bashonly)
+    - [Pin `curl-cffi` to 0.5.10 for Windows](https://github.com/yt-dlp/yt-dlp/commit/ac30941ae682f71eab010877c9a977736a61d3cf) by [bashonly](https://github.com/bashonly)
+- **cleanup**: Miscellaneous: [89a161e](https://github.com/yt-dlp/yt-dlp/commit/89a161e8c62569a662deda1c948664152efcb6b4) by [bashonly](https://github.com/bashonly)
+
+### 2024.07.09
+
+#### Core changes
+- [Do not alter default format selection when simulated](https://github.com/yt-dlp/yt-dlp/commit/0b570f2a90ce2363ba06089217514d644e7be2e0) ([#9862](https://github.com/yt-dlp/yt-dlp/issues/9862)) by [seproDev](https://github.com/seproDev)
+
+#### Extractor changes
+- **youtube**: [Remove broken `n` function extraction fallback](https://github.com/yt-dlp/yt-dlp/commit/7ead7332af69422cee931aec3faa277288e9e212) ([#10396](https://github.com/yt-dlp/yt-dlp/issues/10396)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev)
+
+### 2024.07.08
+
+#### Core changes
+- **jsinterp**: [Implement `Function.prototype` resolving for `call` and `apply`](https://github.com/yt-dlp/yt-dlp/commit/6c056ea7aeb03660281653a9668547f2548f194f) ([#10392](https://github.com/yt-dlp/yt-dlp/issues/10392)) by [Grub4K](https://github.com/Grub4K)
+
+#### Extractor changes
+- **soundcloud**: [Fix rate-limit handling](https://github.com/yt-dlp/yt-dlp/commit/4b50b292cc98534fb8c7cdf0ae5cb85862f7ebfc) ([#10389](https://github.com/yt-dlp/yt-dlp/issues/10389)) by [bashonly](https://github.com/bashonly)
+- **youtube**: [Fix JS `n` function name extraction](https://github.com/yt-dlp/yt-dlp/commit/297b0a379282a15c80d82d51f3757c961db2dae1) ([#10390](https://github.com/yt-dlp/yt-dlp/issues/10390)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+
+### 2024.07.07
+
+#### Important changes
+- Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785)
+    - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors
+
+#### Core changes
+- [Address gaps in allowed extensions](https://github.com/yt-dlp/yt-dlp/commit/2469119490d7e0397ebbf5c5ae327316f955eef2) ([#10362](https://github.com/yt-dlp/yt-dlp/issues/10362)) by [bashonly](https://github.com/bashonly)
+- [Fix `--ignore-no-formats-error`](https://github.com/yt-dlp/yt-dlp/commit/cc767e9490056efaaa11c186b0d032e4b4969180) ([#10345](https://github.com/yt-dlp/yt-dlp/issues/10345)) by [Grub4K](https://github.com/Grub4K)
+
+#### Extractor changes
+- **abematv**: [Extract availability](https://github.com/yt-dlp/yt-dlp/commit/2a1a1b8e67e864289ac7ba5d05ec63dbb19a639f) ([#10348](https://github.com/yt-dlp/yt-dlp/issues/10348)) by [middlingphys](https://github.com/middlingphys)
+- **chzzk**: [Extract with API v3](https://github.com/yt-dlp/yt-dlp/commit/4862a29854d4044120e3f97b52199711ad04bee1) ([#10363](https://github.com/yt-dlp/yt-dlp/issues/10363)) by [hui1601](https://github.com/hui1601)
+- **douyutv**: [Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/commit/6075a029dba70a89675ae1250e7cdfd91f0eba41) ([#10347](https://github.com/yt-dlp/yt-dlp/issues/10347)) by [LeSuisse](https://github.com/LeSuisse)
+- **jiosaavn**: playlist: [Support featured playlists](https://github.com/yt-dlp/yt-dlp/commit/f0f867f008a1728f5f6ac1224b9e014b5d27f817) ([#10382](https://github.com/yt-dlp/yt-dlp/issues/10382)) by [harbhim](https://github.com/harbhim)
+- **vidyard**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/00766ece0c5c7a80781a4ff677198c5fb69d9dc0) ([#10155](https://github.com/yt-dlp/yt-dlp/issues/10155)) by [exterrestris](https://github.com/exterrestris)
+- **vimeo**: [Fix password-protected video extraction](https://github.com/yt-dlp/yt-dlp/commit/c1c9bb4adb42d0d93a2fb5d93a7de0a87b6ba884) ([#10341](https://github.com/yt-dlp/yt-dlp/issues/10341)) by [bashonly](https://github.com/bashonly)
+- **vtv**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/987a1f94c24275f2b0cd82e719956687415dd732) ([#10173](https://github.com/yt-dlp/yt-dlp/issues/10173)) by [DinhHuy2010](https://github.com/DinhHuy2010)
+- **yle_areena**
+    - [Fix metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/4cdc976bd861b5835601ae402bef543eacd88f3d) ([#10380](https://github.com/yt-dlp/yt-dlp/issues/10380)) by [seproDev](https://github.com/seproDev)
+    - [Fix subtitle extraction](https://github.com/yt-dlp/yt-dlp/commit/0d174e8bed32081eb38ef7f5d1a1282ae154f517) ([#10379](https://github.com/yt-dlp/yt-dlp/issues/10379)) by [Grub4K](https://github.com/Grub4K)
+
+#### Misc. changes
+- **cleanup**: Miscellaneous: [b337d29](https://github.com/yt-dlp/yt-dlp/commit/b337d2989ce0614651d363383f6f743d977248ef) by [bashonly](https://github.com/bashonly)
+
+### 2024.07.02
+
+#### Core changes
+- [Fix `--compat-opt allow-unsafe-ext`](https://github.com/yt-dlp/yt-dlp/commit/773bbb181506856ffda95496ab60c1c9603f1f71) ([#10336](https://github.com/yt-dlp/yt-dlp/issues/10336)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas)
+
+#### Extractor changes
+- **banbye**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7509791385ba88cb7ec0ab17e826681f4af4b66e) ([#10332](https://github.com/yt-dlp/yt-dlp/issues/10332)) by [PatrykMis](https://github.com/PatrykMis), [seproDev](https://github.com/seproDev)
+- **murrtube**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6403530e2dfe259a87afe444708c4f3024cc45b8) ([#9249](https://github.com/yt-dlp/yt-dlp/issues/9249)) by [DrakoCpp](https://github.com/DrakoCpp)
+- **zaiko**: [Support JWT video URLs](https://github.com/yt-dlp/yt-dlp/commit/7799e518956387bb3c1064c9beae26eab8d5044a) ([#10130](https://github.com/yt-dlp/yt-dlp/issues/10130)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+
+#### Postprocessor changes
+- **embedthumbnail**: [Fix embedding with mutagen](https://github.com/yt-dlp/yt-dlp/commit/d502f4c6d95b74896f40070d07229997f0850f31) ([#10337](https://github.com/yt-dlp/yt-dlp/issues/10337)) by [bashonly](https://github.com/bashonly)
+
+#### Misc. changes
+- **cleanup**: Miscellaneous: [93d33cb](https://github.com/yt-dlp/yt-dlp/commit/93d33cb29af9e2e84369ac43589d50ce8e0160ef) by [bashonly](https://github.com/bashonly)
+
+### 2024.07.01
+
+#### Important changes
+- Security: [[CVE-2024-38519](https://nvd.nist.gov/vuln/detail/CVE-2024-38519)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)
+    - Unsafe extensions are now blocked from being downloaded
+
+#### Core changes
+- [Add `playlist_channel` and `playlist_channel_id` fields](https://github.com/yt-dlp/yt-dlp/commit/55e3e6fd21e741ec5ae3d8624de5e5ea345810eb) ([#10266](https://github.com/yt-dlp/yt-dlp/issues/10266)) by [bashonly](https://github.com/bashonly)
+- [Disallow unsafe extensions (CVE-2024-38519)](https://github.com/yt-dlp/yt-dlp/commit/5ce582448ececb8d9c30c8c31f58330090ced03a) by [Grub4K](https://github.com/Grub4K)
+- **cookies**: [Fix `--cookies-from-browser` DE detection on Linux](https://github.com/yt-dlp/yt-dlp/commit/a8520244b8642880e4d35925e9e49eff94d548de) ([#10237](https://github.com/yt-dlp/yt-dlp/issues/10237)) by [peisenwang](https://github.com/peisenwang)
+
+#### Extractor changes
+- **afreecatv**
+    - [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/e8352ad6599de7b5371dc39a1a1edc7890aaedb4) ([#10174](https://github.com/yt-dlp/yt-dlp/issues/10174)) by [hui1601](https://github.com/hui1601)
+    - catchstory: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/054a3ba7d1293f9fbe21800d62d1e5ddcbded238) ([#10235](https://github.com/yt-dlp/yt-dlp/issues/10235)) by [hui1601](https://github.com/hui1601)
+- **bilibili**: [Support legacy formats](https://github.com/yt-dlp/yt-dlp/commit/1d6ab17d0752ee9cf19e3e63c7dec7b600d3f228) ([#9117](https://github.com/yt-dlp/yt-dlp/issues/9117)) by [c-basalt](https://github.com/c-basalt), [GD-Slime](https://github.com/GD-Slime)
+- **bitchute**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/5b1a2aa978d0074cee278e7659f32f52ecc4ab53) ([#10301](https://github.com/yt-dlp/yt-dlp/issues/10301)) by [seproDev](https://github.com/seproDev)
+- **brightcove**: [Upgrade requests to HTTPS](https://github.com/yt-dlp/yt-dlp/commit/90c3721a322756bb7f4ca10ceb73744500bee37e) ([#10202](https://github.com/yt-dlp/yt-dlp/issues/10202)) by [bashonly](https://github.com/bashonly)
+- **cloudflarestream**: [Fix `_VALID_URL` and embed extraction](https://github.com/yt-dlp/yt-dlp/commit/7aa322c02cec54eb77154a89da7e400194f0bd03) ([#10215](https://github.com/yt-dlp/yt-dlp/issues/10215)) by [bashonly](https://github.com/bashonly)
+- **cloudycdn**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/b758877afa225747fba81c8a580e27583a231734) ([#10271](https://github.com/yt-dlp/yt-dlp/issues/10271)) by [Caesim404](https://github.com/Caesim404)
+- **digitalconcerthall**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/2a4f2e82dbeeb0c9130883c83dac689d5260c871) ([#10152](https://github.com/yt-dlp/yt-dlp/issues/10152)) by [seproDev](https://github.com/seproDev), [tippfehlr](https://github.com/tippfehlr)
+- **facebook**: reel: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/8ca1d57ed08d00efa117820a5a82f763b20e2d1d) ([#10232](https://github.com/yt-dlp/yt-dlp/issues/10232)) by [bashonly](https://github.com/bashonly)
+- **francetv**
+    - [Detect and raise errors for DRM](https://github.com/yt-dlp/yt-dlp/commit/3690c2f59827c79a1bbe388a7c1ae75db7477db2) ([#10165](https://github.com/yt-dlp/yt-dlp/issues/10165)) by [bashonly](https://github.com/bashonly)
+    - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/081708d6074dfbb907e25af61ba530bba0d4b31d) ([#10177](https://github.com/yt-dlp/yt-dlp/issues/10177)) by [bashonly](https://github.com/bashonly)
+- **generic**: [Add `key_query` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/5dbac313ae4e3e8521dfe2e1a6a048a98ff4b4fe) by [bashonly](https://github.com/bashonly)
+- **graspop**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1d369b4096d79233e0ac2c93762746a64d7a69c8) ([#10268](https://github.com/yt-dlp/yt-dlp/issues/10268)) by [Niluge-KiWi](https://github.com/Niluge-KiWi)
+- **jiocinema**: series: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/61714f46956f61612032bba857aed7ad1387eccd) ([#10139](https://github.com/yt-dlp/yt-dlp/issues/10139)) by [varunchopra](https://github.com/varunchopra)
+- **khanacademy**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/4093eb1fcc29a0e2aea9adfcba479787d9ae0c0c) ([#9136](https://github.com/yt-dlp/yt-dlp/issues/9136)) by [c-basalt](https://github.com/c-basalt)
+- **laracasts**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b8da8a98f897599095d4ef1644b8c5fd39921118) ([#10055](https://github.com/yt-dlp/yt-dlp/issues/10055)) by [ASertacAkkaya](https://github.com/ASertacAkkaya), [seproDev](https://github.com/seproDev)
+- **matchtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f3411af12e209bc5624e1ac31271b8aabe2d3c90) ([#10190](https://github.com/yt-dlp/yt-dlp/issues/10190)) by [megumintyan](https://github.com/megumintyan)
+- **mediasite**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/0953209a857c51648aee89d205c086b0e1dd3864) ([#10273](https://github.com/yt-dlp/yt-dlp/issues/10273)) by [bashonly](https://github.com/bashonly)
+- **microsoftembed**: [Add extractors for dev materials](https://github.com/yt-dlp/yt-dlp/commit/9200bc70c94546b2191bb6fbfc9cea98a919cc56) ([#9177](https://github.com/yt-dlp/yt-dlp/issues/9177)) by [c-basalt](https://github.com/c-basalt)
+- **mlbtv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/61edf57f8f13f6dfd81154174e647eb5fdd26089) ([#10296](https://github.com/yt-dlp/yt-dlp/issues/10296)) by [bashonly](https://github.com/bashonly)
+- **neteasemusic**: [Extract more formats from new API](https://github.com/yt-dlp/yt-dlp/commit/7a03f88c40b80d3cf54f68edd9d4bdd6aa527570) ([#10258](https://github.com/yt-dlp/yt-dlp/issues/10258)) by [hafeoz](https://github.com/hafeoz)
+- **nhkradiru**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/b8e2a5e0e1030076f833917906e19bb6c7b318f6) ([#10106](https://github.com/yt-dlp/yt-dlp/issues/10106)) by [garret1317](https://github.com/garret1317)
+- **nuum**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/aefede25561a06cba398d4f593eee2fbe942693b) ([#10316](https://github.com/yt-dlp/yt-dlp/issues/10316)) by [DmitryScaletta](https://github.com/DmitryScaletta)
+- **orf**
+    - on
+        - [Add `prefer_segments_playlist` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/e6a22834df1776ec4e486526f6df2bf53cb7e06f) ([#10314](https://github.com/yt-dlp/yt-dlp/issues/10314)) by [seproDev](https://github.com/seproDev)
+        - [Support segmented episodes](https://github.com/yt-dlp/yt-dlp/commit/8b46ad4d8b8ee8c5472af0cde863baa89ca3f425) ([#10053](https://github.com/yt-dlp/yt-dlp/issues/10053)) by [seproDev](https://github.com/seproDev)
+- **patreoncampaign**: [Fix `campaign_id` extraction](https://github.com/yt-dlp/yt-dlp/commit/2e5a47da400b645aadbda6afd1156bd89c744f48) ([#10070](https://github.com/yt-dlp/yt-dlp/issues/10070)) by [bashonly](https://github.com/bashonly)
+- **podbayfm**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/d4b52ce3fcb8d9578ed12365648eaba8718c603e) ([#10195](https://github.com/yt-dlp/yt-dlp/issues/10195)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+- **pokergo**: [Make metadata extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/36e8dd832579b5375a0f6626af4268b86b4eb21a) ([#10319](https://github.com/yt-dlp/yt-dlp/issues/10319)) by [axpauls](https://github.com/axpauls)
+- **qqmusic**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/4f5d7be3c5590bb257d8ff521572aee9839ab754) ([#9768](https://github.com/yt-dlp/yt-dlp/issues/9768)) by [c-basalt](https://github.com/c-basalt)
+- **rtvslo.si**: show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/92a1c4abaeeba9a69d611c57b73555cb1a1f00ad) ([#8418](https://github.com/yt-dlp/yt-dlp/issues/8418)) by [JSubelj](https://github.com/JSubelj), [seproDev](https://github.com/seproDev)
+- **soundcloud**: [Fix `download` format extraction](https://github.com/yt-dlp/yt-dlp/commit/e53e56b73543799638fa6abb0c78f8b091aa84e1) ([#10125](https://github.com/yt-dlp/yt-dlp/issues/10125)) by [bashonly](https://github.com/bashonly)
+- **sproutvideo**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/d6c2c2bc84f1434255be5c73baeb17d893d2c0d4) ([#10098](https://github.com/yt-dlp/yt-dlp/issues/10098)) by [bashonly](https://github.com/bashonly), [TheZ3ro](https://github.com/TheZ3ro)
+- **tiktok**
+    - [Detect and raise when login is required](https://github.com/yt-dlp/yt-dlp/commit/ea88129784fcbb6987161df9ba05909325d8e2e9) ([#10124](https://github.com/yt-dlp/yt-dlp/issues/10124)) by [bashonly](https://github.com/bashonly)
+    - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/96472d72f29550c25c5dcedcde02c38c192b0011) ([#10216](https://github.com/yt-dlp/yt-dlp/issues/10216)) by [bashonly](https://github.com/bashonly)
+- **tubitv**
+    - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/bef9a9e5361fd7a72e21d0f1a8c8afb70d89e8c5) ([#9975](https://github.com/yt-dlp/yt-dlp/issues/9975)) by [chilinux](https://github.com/chilinux)
+    - series: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d7d861811c15585a4f7ec9d5ae68d2ac28de28a0) ([#10116](https://github.com/yt-dlp/yt-dlp/issues/10116)) by [bashonly](https://github.com/bashonly)
+- **vimeo**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/d4b99a233314bf31f9c842035ea9884673d5313a) ([#10327](https://github.com/yt-dlp/yt-dlp/issues/10327)) by [bashonly](https://github.com/bashonly)
+- **youtube**
+    - [Extract all formats from multi-language m3u8s](https://github.com/yt-dlp/yt-dlp/commit/9bd85019931927a99b0fe0dc58ac51acca9fbe72) ([#9875](https://github.com/yt-dlp/yt-dlp/issues/9875)) by [bashonly](https://github.com/bashonly), [clienthax](https://github.com/clienthax)
+    - [Skip formats if nsig decoding fails](https://github.com/yt-dlp/yt-dlp/commit/800ec085ccf98420584d8bb38c20a2c079669b09) ([#10223](https://github.com/yt-dlp/yt-dlp/issues/10223)) by [bashonly](https://github.com/bashonly)
+    - [Suppress "Unavailable videos are hidden" warning](https://github.com/yt-dlp/yt-dlp/commit/24f3097ea9a470a984d0454dc013cafa2325f5f8) ([#10159](https://github.com/yt-dlp/yt-dlp/issues/10159)) by [mgedmin](https://github.com/mgedmin)
+    - tab: [Fix channel metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/a0d9967f6822fc279e86bce33464194985148727) ([#10071](https://github.com/yt-dlp/yt-dlp/issues/10071)) by [bashonly](https://github.com/bashonly), [shoxie007](https://github.com/shoxie007)
+
+#### Downloader changes
+- **hls**: [Apply `extra_param_to_key_url` from info dict](https://github.com/yt-dlp/yt-dlp/commit/ca8885edd93bdf8912af6c22ee335b6222cb9ba9) by [bashonly](https://github.com/bashonly)
+
+#### Postprocessor changes
+- **embedthumbnail**: [Fix postprocessor](https://github.com/yt-dlp/yt-dlp/commit/f2a4ea1794718e4dc0148bc172cb877f1080903b) ([#10248](https://github.com/yt-dlp/yt-dlp/issues/10248)) by [Grub4K](https://github.com/Grub4K)
+
+#### Networking changes
+- **Request Handler**: requests: [Bump minimum `requests` version to 2.32.2](https://github.com/yt-dlp/yt-dlp/commit/db50f19d76c6870a5a13d0cab9287d684fd7449a) ([#10079](https://github.com/yt-dlp/yt-dlp/issues/10079)) by [bashonly](https://github.com/bashonly)
+
+#### Misc. changes
+- **build**
+    - [Bump Pyinstaller to `>=6.7.0` for all builds](https://github.com/yt-dlp/yt-dlp/commit/5fdd13006a1c5d78642c8d3c4c7df0448273c2ae) ([#10069](https://github.com/yt-dlp/yt-dlp/issues/10069)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+    - [Cache dependencies for `macos` job](https://github.com/yt-dlp/yt-dlp/commit/46c1b7cfec1d0e6155083ca7e6948674c64ecb97) ([#10088](https://github.com/yt-dlp/yt-dlp/issues/10088)) by [bashonly](https://github.com/bashonly)
+    - [Use `macos-12` image for `yt-dlp_macos`](https://github.com/yt-dlp/yt-dlp/commit/03334d639d5282cd4107edb32c623ba400262fc4) ([#10063](https://github.com/yt-dlp/yt-dlp/issues/10063)) by [bashonly](https://github.com/bashonly)
+- **cleanup**
+    - [Add more ruff rules](https://github.com/yt-dlp/yt-dlp/commit/add96eb9f84cfffe85682bf2fb85135746994ee8) ([#10149](https://github.com/yt-dlp/yt-dlp/issues/10149)) by [seproDev](https://github.com/seproDev)
+    - [Bump ruff to 0.5.x](https://github.com/yt-dlp/yt-dlp/commit/7814c50948a2b9a4c746441ecbc509ae563d5d1f) ([#10282](https://github.com/yt-dlp/yt-dlp/issues/10282)) by [seproDev](https://github.com/seproDev)
+    - Miscellaneous: [6aaf96a](https://github.com/yt-dlp/yt-dlp/commit/6aaf96a3d6e7d0d426e97e11a2fcf52fda00e733) by [bashonly](https://github.com/bashonly), [c-basalt](https://github.com/c-basalt), [jucor](https://github.com/jucor), [seproDev](https://github.com/seproDev)
+- **test**: download: [Raise on network errors](https://github.com/yt-dlp/yt-dlp/commit/54a63e80af82791d2f0985bd0176bb182963fd5f) ([#10283](https://github.com/yt-dlp/yt-dlp/issues/10283)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+
+### 2024.05.27
+
+#### Extractor changes
+- [Fix parsing of base URL in SMIL manifest](https://github.com/yt-dlp/yt-dlp/commit/26603d0b34898818992bee4598e0607c07059511) ([#9225](https://github.com/yt-dlp/yt-dlp/issues/9225)) by [seproDev](https://github.com/seproDev)
+- **peertube**: [Support livestreams](https://github.com/yt-dlp/yt-dlp/commit/12b248ce60be1aa1362edd839d915bba70dbee4b) ([#10044](https://github.com/yt-dlp/yt-dlp/issues/10044)) by [bashonly](https://github.com/bashonly), [trueauracoral](https://github.com/trueauracoral)
+- **piksel**: [Update domain](https://github.com/yt-dlp/yt-dlp/commit/ae2194e1dd4a99d32eb3cab7c48a0ff03101ef3b) ([#9223](https://github.com/yt-dlp/yt-dlp/issues/9223)) by [seproDev](https://github.com/seproDev)
+- **tiktok**: user: [Fix extraction loop](https://github.com/yt-dlp/yt-dlp/commit/c53c2e40fde8f2e15c7c62f8ca1a5d9e90ddc079) ([#10035](https://github.com/yt-dlp/yt-dlp/issues/10035)) by [bashonly](https://github.com/bashonly)
+
+#### Misc. changes
+- **cleanup**: Miscellaneous: [5e3e19c](https://github.com/yt-dlp/yt-dlp/commit/5e3e19c93c52830da98d9d1ed84ea7a559efefbd) by [bashonly](https://github.com/bashonly)
+
+### 2024.05.26
+
+#### Core changes
+- [Better warning when requested subs format not found](https://github.com/yt-dlp/yt-dlp/commit/7e4259dff0b681a3f0e8a930799ce0394328c86e) ([#9873](https://github.com/yt-dlp/yt-dlp/issues/9873)) by [DaPotato69](https://github.com/DaPotato69)
+- [Merged with youtube-dl a08f2b7](https://github.com/yt-dlp/yt-dlp/commit/a4da9db87b6486b270c15dfa07ab5bfedc83f6bd) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+- [Warn if lack of ffmpeg alters format selection](https://github.com/yt-dlp/yt-dlp/commit/96da9525043f78aca4544d01761b13b2140e9ae6) ([#9805](https://github.com/yt-dlp/yt-dlp/issues/9805)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev)
+- **cookies**
+    - [Add `--cookies-from-browser` support for Whale](https://github.com/yt-dlp/yt-dlp/commit/dd9ad97b1fbdd36c086b8ba82328a4d954f78f8e) ([#9649](https://github.com/yt-dlp/yt-dlp/issues/9649)) by [roeniss](https://github.com/roeniss)
+    - [Get chrome session cookies with `--cookies-from-browser`](https://github.com/yt-dlp/yt-dlp/commit/f1f158976e38d38a260762accafe7bbe6d451151) ([#9747](https://github.com/yt-dlp/yt-dlp/issues/9747)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier)
+- **windows**: [Improve shell quoting and tests](https://github.com/yt-dlp/yt-dlp/commit/64766459e37451b665c1464073c28361fbcf1c25) ([#9802](https://github.com/yt-dlp/yt-dlp/issues/9802)) by [Grub4K](https://github.com/Grub4K) (With fixes in [7e26bd5](https://github.com/yt-dlp/yt-dlp/commit/7e26bd53f9c5893518fde81dfd0079ec08dd841e))
+
+#### Extractor changes
+- [Add POST data hash to `--write-pages` filenames](https://github.com/yt-dlp/yt-dlp/commit/61b17437dc14a1c7e90ff48a6198df77828c6df4) ([#9879](https://github.com/yt-dlp/yt-dlp/issues/9879)) by [minamotorin](https://github.com/minamotorin) (With fixes in [c999bac](https://github.com/yt-dlp/yt-dlp/commit/c999bac02c5a4f755b2a82488a975e91c988ffd8) by [bashonly](https://github.com/bashonly))
+- [Make `_search_nextjs_data` non-fatal](https://github.com/yt-dlp/yt-dlp/commit/3ee1194288981c4f2c4abd8315326de0c424d2ce) ([#8937](https://github.com/yt-dlp/yt-dlp/issues/8937)) by [Grub4K](https://github.com/Grub4K)
+- **afreecatv**: live: [Add `cdn` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/315b3544296bb83012e20ee3af9d3cbf5600dd1c) ([#9666](https://github.com/yt-dlp/yt-dlp/issues/9666)) by [bashonly](https://github.com/bashonly)
+- **alura**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fc2879ecb05aaad36869609d154e4321362c1f63) ([#9658](https://github.com/yt-dlp/yt-dlp/issues/9658)) by [hugohaa](https://github.com/hugohaa)
+- **artetv**: [Label forced subtitles](https://github.com/yt-dlp/yt-dlp/commit/7b5674949fd03a33b47b67b31d56a5adf1c48c91) ([#9945](https://github.com/yt-dlp/yt-dlp/issues/9945)) by [vtexier](https://github.com/vtexier)
+- **bbc**: [Fix and extend extraction](https://github.com/yt-dlp/yt-dlp/commit/7975ddf245d22af034d5b983eeb1c5ec6c2ce053) ([#9705](https://github.com/yt-dlp/yt-dlp/issues/9705)) by [dirkf](https://github.com/dirkf), [kylegustavo](https://github.com/kylegustavo), [pukkandan](https://github.com/pukkandan)
+- **bilibili**: [Fix `--geo-verification-proxy` support](https://github.com/yt-dlp/yt-dlp/commit/2338827072dacab0f15348b70aec8685feefc8d1) ([#9817](https://github.com/yt-dlp/yt-dlp/issues/9817)) by [fireattack](https://github.com/fireattack)
+- **bilibilispacevideo**
+    - [Better error message](https://github.com/yt-dlp/yt-dlp/commit/06d52c87314e0bbc16c43c405090843885577b88) ([#9839](https://github.com/yt-dlp/yt-dlp/issues/9839)) by [fireattack](https://github.com/fireattack)
+    - [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/4cc99d7b6cce8b39506ead01407445d576b63ee4) ([#9905](https://github.com/yt-dlp/yt-dlp/issues/9905)) by [c-basalt](https://github.com/c-basalt)
+- **boosty**: [Add cookies support](https://github.com/yt-dlp/yt-dlp/commit/145dc6f6563e80d2da1b3e9aea2ffa795b71622c) ([#9522](https://github.com/yt-dlp/yt-dlp/issues/9522)) by [RasmusAntons](https://github.com/RasmusAntons)
+- **brilliantpala**: [Fix login](https://github.com/yt-dlp/yt-dlp/commit/eead3bbc01f6529862bdad1f0b2adeabda4f006e) ([#9788](https://github.com/yt-dlp/yt-dlp/issues/9788)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **canalalpha**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/00a9f2e1f7fa69499221f2e8dd73a08efeef79bc) ([#9675](https://github.com/yt-dlp/yt-dlp/issues/9675)) by [kclauhk](https://github.com/kclauhk)
+- **cbc.ca**: player: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c8bf48f3a8fa29587e7c73ef5a7710385a5ea725) ([#9866](https://github.com/yt-dlp/yt-dlp/issues/9866)) by [carusocr](https://github.com/carusocr)
+- **cda**: [Fix age-gated web extraction](https://github.com/yt-dlp/yt-dlp/commit/6d8a53d870ff6795f509085bfbf3981417999038) ([#9939](https://github.com/yt-dlp/yt-dlp/issues/9939)) by [dirkf](https://github.com/dirkf), [emqi](https://github.com/emqi), [Podiumnoche](https://github.com/Podiumnoche), [Szpachlarz](https://github.com/Szpachlarz)
+- **commonmistakes**: [Raise error on blob URLs](https://github.com/yt-dlp/yt-dlp/commit/98d71d8c5e5dab08b561ee6f137e968d2a004262) ([#9897](https://github.com/yt-dlp/yt-dlp/issues/9897)) by [seproDev](https://github.com/seproDev)
+- **crunchyroll**
+    - [Always make metadata available](https://github.com/yt-dlp/yt-dlp/commit/cb2fb4a643949322adba561ca73bcba3221ec0c5) ([#9772](https://github.com/yt-dlp/yt-dlp/issues/9772)) by [bashonly](https://github.com/bashonly)
+    - [Fix auth and remove cookies support](https://github.com/yt-dlp/yt-dlp/commit/ff38a011d57b763f3a69bebd25a5dc9044a717ce) ([#9749](https://github.com/yt-dlp/yt-dlp/issues/9749)) by [bashonly](https://github.com/bashonly)
+    - [Fix stream extraction](https://github.com/yt-dlp/yt-dlp/commit/f2816634e3be88fe158b342ee33918de3c272a54) ([#10005](https://github.com/yt-dlp/yt-dlp/issues/10005)) by [bashonly](https://github.com/bashonly)
+    - [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/5904853ae5788509fdc4892cb7ecdfa9ae7f78e6) ([#9857](https://github.com/yt-dlp/yt-dlp/issues/9857)) by [bashonly](https://github.com/bashonly)
+- **dangalplay**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/0d067e77c3f5527946fb0c22ee1c7011994cba40) ([#10021](https://github.com/yt-dlp/yt-dlp/issues/10021)) by [bashonly](https://github.com/bashonly)
+- **discoveryplus**: [Fix dmax.de and related extractors](https://github.com/yt-dlp/yt-dlp/commit/90d2da311bbb5dc06f385ee428c7e4590936e995) ([#10020](https://github.com/yt-dlp/yt-dlp/issues/10020)) by [bashonly](https://github.com/bashonly)
+- **eplus**: [Handle URLs without videos](https://github.com/yt-dlp/yt-dlp/commit/351dc0bc334c4e1b5f00c152818c3ec0ed71f788) ([#9855](https://github.com/yt-dlp/yt-dlp/issues/9855)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **europarlwebstream**: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/800a43983e5fb719526ce4cb3956216085c63268) ([#9647](https://github.com/yt-dlp/yt-dlp/issues/9647)) by [seproDev](https://github.com/seproDev), [voidful](https://github.com/voidful)
+- **facebook**: [Fix DASH formats extraction](https://github.com/yt-dlp/yt-dlp/commit/e3b42d8b1b8bcfff7ba146c19fc3f6f6ba843cea) ([#9734](https://github.com/yt-dlp/yt-dlp/issues/9734)) by [bashonly](https://github.com/bashonly)
+- **godresource**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/65e709d23530959075816e966c42179ad46e8e3b) ([#9629](https://github.com/yt-dlp/yt-dlp/issues/9629)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **googledrive**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/85ec2a337ac325cf6427cbafd56f0a034c1a5218) ([#9908](https://github.com/yt-dlp/yt-dlp/issues/9908)) by [WyohKnott](https://github.com/WyohKnott)
+- **hearthisat**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/5bbfdb7c999b22f1aeca0c3489c167d6eb73013b) ([#9949](https://github.com/yt-dlp/yt-dlp/issues/9949)) by [bohwaz](https://github.com/bohwaz), [seproDev](https://github.com/seproDev)
+- **hytale**: [Use `CloudflareStreamIE` explicitly](https://github.com/yt-dlp/yt-dlp/commit/31b417e1d1ccc67d5c027bf8878f483dc34cb118) ([#9672](https://github.com/yt-dlp/yt-dlp/issues/9672)) by [llamasblade](https://github.com/llamasblade)
+- **instagram**: [Support `/reels/` URLs](https://github.com/yt-dlp/yt-dlp/commit/06cb0638392b607b47d3c2ac48eb2ebecb0f060d) ([#9539](https://github.com/yt-dlp/yt-dlp/issues/9539)) by [amir16yp](https://github.com/amir16yp)
+- **jiocinema**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/1463945ae5fb05986a0bd1aa02e41d1a08d93a02) ([#10026](https://github.com/yt-dlp/yt-dlp/issues/10026)) by [bashonly](https://github.com/bashonly)
+- **jiosaavn**: [Extract via API and fix playlists](https://github.com/yt-dlp/yt-dlp/commit/0c21c53885cf03f4040467ae8c44d7ff51016116) ([#9656](https://github.com/yt-dlp/yt-dlp/issues/9656)) by [bashonly](https://github.com/bashonly)
+- **lci**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5a2eebc76770fca91ffabeff658d560f716fec80) ([#10025](https://github.com/yt-dlp/yt-dlp/issues/10025)) by [ocococococ](https://github.com/ocococococ)
+- **mixch**: [Extract comments](https://github.com/yt-dlp/yt-dlp/commit/b38018b781b062d5169d104ab430489aef8e7f1e) ([#9860](https://github.com/yt-dlp/yt-dlp/issues/9860)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **moviepilot**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/296df0da1d38a44d34c99b60a18066c301774537) ([#9366](https://github.com/yt-dlp/yt-dlp/issues/9366)) by [panatexxa](https://github.com/panatexxa)
+- **netease**: program: [Improve `--no-playlist` message](https://github.com/yt-dlp/yt-dlp/commit/73f12119b52d98281804b0c072b2ed6aa841ec88) ([#9488](https://github.com/yt-dlp/yt-dlp/issues/9488)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **nfb**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/0a1a8e3005f66c44bf67633dccd4df19c3fccd1a) ([#9650](https://github.com/yt-dlp/yt-dlp/issues/9650)) by [rrgomes](https://github.com/rrgomes)
+- **ntslive**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/be7db1a5a8c483726c511c30ea4689cbb8b27962) ([#9641](https://github.com/yt-dlp/yt-dlp/issues/9641)) by [lostfictions](https://github.com/lostfictions)
+- **orf**: on: [Improve extraction](https://github.com/yt-dlp/yt-dlp/commit/0dd53faeca2ba0ce138e4092d07b5f2dbf2422f9) ([#9677](https://github.com/yt-dlp/yt-dlp/issues/9677)) by [TuxCoder](https://github.com/TuxCoder)
+- **orftvthek**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/3779f2a307ba3ef1d28e107cdd71b221dfb4eb36) ([#10011](https://github.com/yt-dlp/yt-dlp/issues/10011)) by [seproDev](https://github.com/seproDev)
+- **patreon**
+    - [Extract multiple embeds](https://github.com/yt-dlp/yt-dlp/commit/036e0d92c6052465673d459678322ea03e61483d) ([#9850](https://github.com/yt-dlp/yt-dlp/issues/9850)) by [bashonly](https://github.com/bashonly)
+    - [Fix Vimeo embed extraction](https://github.com/yt-dlp/yt-dlp/commit/c9ce57d9bf51541da2381d99bc096a9d0ddf1f27) ([#9712](https://github.com/yt-dlp/yt-dlp/issues/9712)) by [bashonly](https://github.com/bashonly)
+- **piapro**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3ba8de62d61d782256f5c1e9939a0762039657de) ([#9311](https://github.com/yt-dlp/yt-dlp/issues/9311)) by [FinnRG](https://github.com/FinnRG), [seproDev](https://github.com/seproDev)
+- **pornhub**: [Fix login by email address](https://github.com/yt-dlp/yt-dlp/commit/518c1afc1592cae3e4eb39dc646b5bc059333112) ([#9914](https://github.com/yt-dlp/yt-dlp/issues/9914)) by [feederbox826](https://github.com/feederbox826)
+- **qub**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6b54cccdcb892bca3e55993480d8b86f1c7e6da6) ([#7019](https://github.com/yt-dlp/yt-dlp/issues/7019)) by [alexhuot1](https://github.com/alexhuot1), [dirkf](https://github.com/dirkf)
+- **reddit**: [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/82f4f4444e26daf35b7302c406fe2312f78f619e) ([#10006](https://github.com/yt-dlp/yt-dlp/issues/10006)) by [kclauhk](https://github.com/kclauhk)
+- **soundcloud**
+    - [Add `formats` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/beaf832c7a9d57833f365ce18f6115b88071b296) ([#10004](https://github.com/yt-dlp/yt-dlp/issues/10004)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+    - [Extract `genres`](https://github.com/yt-dlp/yt-dlp/commit/231c2eacc41b06b65c63edf94c0d04768a5da607) ([#9821](https://github.com/yt-dlp/yt-dlp/issues/9821)) by [bashonly](https://github.com/bashonly)
+- **taptap**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/63b569bc5e7d461753637a20ad84a575adee4c0a) ([#9776](https://github.com/yt-dlp/yt-dlp/issues/9776)) by [c-basalt](https://github.com/c-basalt)
+- **tele5**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/c92e4e625e9e6bbbbf8e3b20c3e7ebe57c16072d) ([#10024](https://github.com/yt-dlp/yt-dlp/issues/10024)) by [bashonly](https://github.com/bashonly)
+- **theatercomplextown**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/8056a3026ed6ec6a6d0ed56fdd7ebcd16e928341) ([#9754](https://github.com/yt-dlp/yt-dlp/issues/9754)) by [bashonly](https://github.com/bashonly)
+- **tiktok**
+    - [Add `device_id` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/3584b8390bd21c0393a3079eeee71aed56a1c1d8) ([#9951](https://github.com/yt-dlp/yt-dlp/issues/9951)) by [bashonly](https://github.com/bashonly)
+    - [Extract all web formats](https://github.com/yt-dlp/yt-dlp/commit/4ccd73fea0f6f4be343e1ec7f22dd03799addcf8) ([#9960](https://github.com/yt-dlp/yt-dlp/issues/9960)) by [bashonly](https://github.com/bashonly)
+    - [Extract via mobile API only if extractor-arg is passed](https://github.com/yt-dlp/yt-dlp/commit/41ba4a808b597a3afed78c89675a30deb6844450) ([#9938](https://github.com/yt-dlp/yt-dlp/issues/9938)) by [bashonly](https://github.com/bashonly)
+    - [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/eef1e9f44ff14c5e65b759bb1eafa3946cdaf719) ([#9961](https://github.com/yt-dlp/yt-dlp/issues/9961)) by [bashonly](https://github.com/bashonly)
+    - collection: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/119d41f27061d220d276a2d38cfc8d873437452a) ([#9986](https://github.com/yt-dlp/yt-dlp/issues/9986)) by [bashonly](https://github.com/bashonly), [imanoreotwe](https://github.com/imanoreotwe)
+    - user: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/347f13dd9bccc2b4db3ea25689410d45d8370ed4) ([#9661](https://github.com/yt-dlp/yt-dlp/issues/9661)) by [bashonly](https://github.com/bashonly)
+- **tv5monde**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6db96268c521e945d42649607db1574f5d92e082) ([#9143](https://github.com/yt-dlp/yt-dlp/issues/9143)) by [alard](https://github.com/alard), [seproDev](https://github.com/seproDev)
+- **twitter**
+    - [Fix auth for x.com migration](https://github.com/yt-dlp/yt-dlp/commit/3e35aa32c74bc108375be8c8b6b3bfc90dfff1b4) ([#9952](https://github.com/yt-dlp/yt-dlp/issues/9952)) by [bashonly](https://github.com/bashonly)
+    - [Support x.com URLs](https://github.com/yt-dlp/yt-dlp/commit/4813173e4544f125d6f2afc31e600727d761b8dd) ([#9926](https://github.com/yt-dlp/yt-dlp/issues/9926)) by [bashonly](https://github.com/bashonly)
+- **vk**: [Improve format extraction](https://github.com/yt-dlp/yt-dlp/commit/df5c9e733aaba703cf285c0372b6d61629330c82) ([#9885](https://github.com/yt-dlp/yt-dlp/issues/9885)) by [seproDev](https://github.com/seproDev)
+- **wrestleuniverse**: [Avoid partial stream formats](https://github.com/yt-dlp/yt-dlp/commit/c4853655cb9a793129280806af643de43c48f4d5) ([#9800](https://github.com/yt-dlp/yt-dlp/issues/9800)) by [bashonly](https://github.com/bashonly)
+- **xiaohongshu**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a2e9031605d87c469be9ce98dbbdf4960b727338) ([#9646](https://github.com/yt-dlp/yt-dlp/issues/9646)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **xvideos**: quickies: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/b207d26f83fb8ab0ce56df74dff43ff583a3264f) ([#9834](https://github.com/yt-dlp/yt-dlp/issues/9834)) by [JakeFinley96](https://github.com/JakeFinley96)
+- **youporn**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/351368cb9a6731b886a58f5a10fd6b302bbe47be) ([#8827](https://github.com/yt-dlp/yt-dlp/issues/8827)) by [The-MAGI](https://github.com/The-MAGI)
+- **youtube**
+    - [Add `mediaconnect` client](https://github.com/yt-dlp/yt-dlp/commit/cf212d0a331aba05c32117573f760cdf3af8c62f) ([#9546](https://github.com/yt-dlp/yt-dlp/issues/9546)) by [clienthax](https://github.com/clienthax)
+    - [Extract upload timestamp if available](https://github.com/yt-dlp/yt-dlp/commit/96a134dea6397a5f2131947c427aac52c8b4e677) ([#9856](https://github.com/yt-dlp/yt-dlp/issues/9856)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Fix comments extraction](https://github.com/yt-dlp/yt-dlp/commit/8e15177b4113c355989881e4e030f695a9b59c3a) ([#9775](https://github.com/yt-dlp/yt-dlp/issues/9775)) by [bbilly1](https://github.com/bbilly1), [jakeogh](https://github.com/jakeogh), [minamotorin](https://github.com/minamotorin), [shoxie007](https://github.com/shoxie007)
+    - [Remove `android` from default clients](https://github.com/yt-dlp/yt-dlp/commit/12d8ea8246fa901de302ff5cc748caddadc82f41) ([#9553](https://github.com/yt-dlp/yt-dlp/issues/9553)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz)
+- **zenyandex**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c4b87dd885ee5391e5f481e7c8bd550a7c543623) ([#9813](https://github.com/yt-dlp/yt-dlp/issues/9813)) by [src-tinkerer](https://github.com/src-tinkerer)
+
+#### Networking changes
+- [Add `extensions` attribute to `Response`](https://github.com/yt-dlp/yt-dlp/commit/bec9a59e8ec82c18e3bf9268eaa436793dd52e35) ([#9756](https://github.com/yt-dlp/yt-dlp/issues/9756)) by [bashonly](https://github.com/bashonly)
+- **Request Handler**
+    - requests
+        - [Patch support for `requests` 2.32.2+](https://github.com/yt-dlp/yt-dlp/commit/3f7999533ebe41c2a579d91b4e4cb211cfcd3bc0) ([#9992](https://github.com/yt-dlp/yt-dlp/issues/9992)) by [Grub4K](https://github.com/Grub4K)
+        - [Update to `requests` 2.32.0](https://github.com/yt-dlp/yt-dlp/commit/c36513f1be2ef3d3cec864accbffda1afaa06ffd) ([#9980](https://github.com/yt-dlp/yt-dlp/issues/9980)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Misc. changes
+- [Add `hatch`, `ruff`, `pre-commit` and improve dev docs](https://github.com/yt-dlp/yt-dlp/commit/e897bd8292a41999cf51dba91b390db5643c72db) ([#7409](https://github.com/yt-dlp/yt-dlp/issues/7409)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev)
+- **build**
+    - [Migrate `linux_exe` to static musl builds](https://github.com/yt-dlp/yt-dlp/commit/ac817bc83efd939dca3e40c4b527d0ccfc77172b) ([#9811](https://github.com/yt-dlp/yt-dlp/issues/9811)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+    - [Normalize `curl_cffi` group to `curl-cffi`](https://github.com/yt-dlp/yt-dlp/commit/02483bea1c4dbe1bace8ca4d19700104fbb8a00f) ([#9698](https://github.com/yt-dlp/yt-dlp/issues/9698)) by [bashonly](https://github.com/bashonly) (With fixes in [89f535e](https://github.com/yt-dlp/yt-dlp/commit/89f535e2656964b4061c25a7739d4d6ba0a30568))
+    - [Run `macos_legacy` job on `macos-12`](https://github.com/yt-dlp/yt-dlp/commit/1a366403d9c26b992faa77e00f4d02ead57559e3) ([#9804](https://github.com/yt-dlp/yt-dlp/issues/9804)) by [bashonly](https://github.com/bashonly)
+    - [`macos` job requires `setuptools<70`](https://github.com/yt-dlp/yt-dlp/commit/78c57cc0e0998b8ed90e4306f410aa4be4115cd7) ([#9993](https://github.com/yt-dlp/yt-dlp/issues/9993)) by [bashonly](https://github.com/bashonly)
+- **cleanup**
+    - [Remove questionable extractors](https://github.com/yt-dlp/yt-dlp/commit/01395a34345d1c6ba1b73ca92f94dd200dc45341) ([#9911](https://github.com/yt-dlp/yt-dlp/issues/9911)) by [seproDev](https://github.com/seproDev)
+    - Miscellaneous: [5c019f6](https://github.com/yt-dlp/yt-dlp/commit/5c019f6328ad40d66561eac3c4de0b3cd070d0f6), [ae2af11](https://github.com/yt-dlp/yt-dlp/commit/ae2af1104f80caf2f47544763a33db2c17a3e1de) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev)
+- **test**
+    - [Add HTTP proxy tests](https://github.com/yt-dlp/yt-dlp/commit/3c7a287e281d9f9a353dce8902ff78a84c24a040) ([#9578](https://github.com/yt-dlp/yt-dlp/issues/9578)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Fix connect timeout test](https://github.com/yt-dlp/yt-dlp/commit/53b4d44f55cca66ac33dab092ef2a30b1164b684) ([#9906](https://github.com/yt-dlp/yt-dlp/issues/9906)) by [coletdjnz](https://github.com/coletdjnz)
+
+### 2024.04.09
+
+#### Important changes
+- Security: [[CVE-2024-22423](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-22423)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-hjq6-52gw-2g7p)
+    - The shell escape function now properly escapes `%`, `\` and `\n`.
+    - `utils.Popen` has been patched accordingly.
+
+#### Core changes
+- [Add new option `--progress-delta`](https://github.com/yt-dlp/yt-dlp/commit/9590cc6b4768e190183d7d071a6c78170889116a) ([#9082](https://github.com/yt-dlp/yt-dlp/issues/9082)) by [Grub4K](https://github.com/Grub4K)
+- [Add new options `--impersonate` and `--list-impersonate-targets`](https://github.com/yt-dlp/yt-dlp/commit/0b81d4d252bd065ccd352722987ea34fe17f9244) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+- [Add option `--no-break-on-existing`](https://github.com/yt-dlp/yt-dlp/commit/16be117729150b2784f3b17755c886cb0cf73374) ([#9610](https://github.com/yt-dlp/yt-dlp/issues/9610)) by [bashonly](https://github.com/bashonly)
+- [Fix `filesize_approx` calculation](https://github.com/yt-dlp/yt-dlp/commit/86e3b82261e8ebc6c6707c09544c9dfb8907c0fd) ([#9560](https://github.com/yt-dlp/yt-dlp/issues/9560)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev)
+- [Infer `acodec` for single-codec containers](https://github.com/yt-dlp/yt-dlp/commit/86a972033e05fea80e5fe7f2aff6723dbe2f3952) by [pukkandan](https://github.com/pukkandan)
+- [Prevent RCE when using `--exec` with `%q` (CVE-2024-22423)](https://github.com/yt-dlp/yt-dlp/commit/ff07792676f404ffff6ee61b5638c9dc1a33a37a) by [Grub4K](https://github.com/Grub4K)
+- **cookies**: [Add `--cookies-from-browser` support for Firefox Flatpak](https://github.com/yt-dlp/yt-dlp/commit/2ab2651a4a7be18939e2b4cb21be79fe477c797a) ([#9619](https://github.com/yt-dlp/yt-dlp/issues/9619)) by [un-def](https://github.com/un-def)
+- **utils**
+    - `traverse_obj`
+        - [Allow unbranching using `all` and `any`](https://github.com/yt-dlp/yt-dlp/commit/3699eeb67cad333272b14a42dd3843d93fda1a2e) ([#9571](https://github.com/yt-dlp/yt-dlp/issues/9571)) by [Grub4K](https://github.com/Grub4K)
+        - [Convenience improvements](https://github.com/yt-dlp/yt-dlp/commit/32abfb00bdbd119ca675fdc6d1719331f0a2741a) ([#9577](https://github.com/yt-dlp/yt-dlp/issues/9577)) by [Grub4K](https://github.com/Grub4K)
+
+#### Extractor changes
+- [Add extractor impersonate API](https://github.com/yt-dlp/yt-dlp/commit/50c29352312f5662acf9a64b0012766f5c40af61) ([#9474](https://github.com/yt-dlp/yt-dlp/issues/9474)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+- **afreecatv**
+    - [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/9415f1a5ef88482ebafe3083e8bcb778ac512df7) ([#9566](https://github.com/yt-dlp/yt-dlp/issues/9566)) by [bashonly](https://github.com/bashonly), [Tomoka1](https://github.com/Tomoka1)
+    - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9073ae6458f4c6a832aa832c67174c61852869be) ([#9348](https://github.com/yt-dlp/yt-dlp/issues/9348)) by [hui1601](https://github.com/hui1601)
+- **asobistage**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/0284f1fee202302a78888420f933deae19d9f4e1) ([#8735](https://github.com/yt-dlp/yt-dlp/issues/8735)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **box**: [Support URLs without file IDs](https://github.com/yt-dlp/yt-dlp/commit/07f5b2f7570fd9ac85aed17f4c0118f6eac77beb) ([#9504](https://github.com/yt-dlp/yt-dlp/issues/9504)) by [shreyasminocha](https://github.com/shreyasminocha)
+- **cbc.ca**: player: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/b49d5ffc53a72d8245ba319ff07bdc5b8c6a4f0c) ([#9561](https://github.com/yt-dlp/yt-dlp/issues/9561)) by [trainman261](https://github.com/trainman261)
+- **crunchyroll**
+    - [Extract `vo_adaptive_hls` formats by default](https://github.com/yt-dlp/yt-dlp/commit/be77923ffe842f667971019460f6005f3cad01eb) ([#9447](https://github.com/yt-dlp/yt-dlp/issues/9447)) by [bashonly](https://github.com/bashonly)
+    - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/954e57e405f79188450eb30103a9308732cd318f) ([#9615](https://github.com/yt-dlp/yt-dlp/issues/9615)) by [bytedream](https://github.com/bytedream)
+- **dropbox**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/a48cc86d6f6b20427553620c2ddb990ede6a4b41) ([#9627](https://github.com/yt-dlp/yt-dlp/issues/9627)) by [bashonly](https://github.com/bashonly)
+- **fathom**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bc2b8c0596fd6b75af24822c4f0f1da6783d71f7) ([#9495](https://github.com/yt-dlp/yt-dlp/issues/9495)) by [src-tinkerer](https://github.com/src-tinkerer)
+- **gofile**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0da66980d3193cad3dae0120cddddbfcabddf7a1) ([#9446](https://github.com/yt-dlp/yt-dlp/issues/9446)) by [jazz1611](https://github.com/jazz1611)
+- **imgur**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/86d2f4d24849af0d1f3af7c0e2ac43bf8a058f74) ([#9471](https://github.com/yt-dlp/yt-dlp/issues/9471)) by [trwstin](https://github.com/trwstin)
+- **jiosaavn**
+    - [Extract artists](https://github.com/yt-dlp/yt-dlp/commit/0ae16ceb1846cc4e609b70ce7c5d8e7458efceb2) ([#9612](https://github.com/yt-dlp/yt-dlp/issues/9612)) by [bashonly](https://github.com/bashonly)
+    - [Fix format extensions](https://github.com/yt-dlp/yt-dlp/commit/443e206ec41e64ca2aef61d8ef91640fb69b3113) ([#9609](https://github.com/yt-dlp/yt-dlp/issues/9609)) by [bashonly](https://github.com/bashonly)
+    - [Support playlists](https://github.com/yt-dlp/yt-dlp/commit/2e94602f241f6e41bdc48576c61089435529339b) ([#9622](https://github.com/yt-dlp/yt-dlp/issues/9622)) by [bashonly](https://github.com/bashonly)
+- **joqrag**: [Fix live status detection](https://github.com/yt-dlp/yt-dlp/commit/f2fd449b46c4058222e1744f7a35caa20b2d003d) ([#9624](https://github.com/yt-dlp/yt-dlp/issues/9624)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **kick**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/c8a61a910096c77ce08dad5e1b2fbda5eb964156) ([#9611](https://github.com/yt-dlp/yt-dlp/issues/9611)) by [bashonly](https://github.com/bashonly)
+- **loom**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/f859ed3ba1e8b129ae6a467592c65687e73fbca1) ([#8686](https://github.com/yt-dlp/yt-dlp/issues/8686)) by [bashonly](https://github.com/bashonly), [hruzgar](https://github.com/hruzgar)
+- **medici**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4cd9e251b9abada107b10830de997bf4d79ca369) ([#9518](https://github.com/yt-dlp/yt-dlp/issues/9518)) by [Offert4324](https://github.com/Offert4324)
+- **mixch**
+    - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4c3b7a0769706f7f0ea24adf1f219d5ae82d2b07) ([#9608](https://github.com/yt-dlp/yt-dlp/issues/9608)) by [bashonly](https://github.com/bashonly), [nipotan](https://github.com/nipotan)
+    - archive: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c59de48e2bb4c681b03b93b584a05f52609ce4a0) ([#8761](https://github.com/yt-dlp/yt-dlp/issues/8761)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **nhk**: [Fix NHK World extractors](https://github.com/yt-dlp/yt-dlp/commit/4af9d5c2f6aa81403ae2a8a5ae3cc824730f0b86) ([#9623](https://github.com/yt-dlp/yt-dlp/issues/9623)) by [bashonly](https://github.com/bashonly)
+- **patreon**: [Do not extract dead embed URLs](https://github.com/yt-dlp/yt-dlp/commit/36b240f9a72af57eb2c9d927ebb7fd1c917ebf18) ([#9613](https://github.com/yt-dlp/yt-dlp/issues/9613)) by [johnvictorfs](https://github.com/johnvictorfs)
+- **radio1be**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36baaa10e06715ccba06b78885b2042c4844c826) ([#9122](https://github.com/yt-dlp/yt-dlp/issues/9122)) by [HobbyistDev](https://github.com/HobbyistDev)
+- **sharepoint**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ff349ff94aae0b2b148bd3670f7c91d39c2f1d8e) ([#6531](https://github.com/yt-dlp/yt-dlp/issues/6531)) by [bashonly](https://github.com/bashonly), [C0D3D3V](https://github.com/C0D3D3V)
+- **sonylivseries**: [Fix season extraction](https://github.com/yt-dlp/yt-dlp/commit/f2868b26e917354203f82a370ad2396646edb813) ([#9423](https://github.com/yt-dlp/yt-dlp/issues/9423)) by [bashonly](https://github.com/bashonly)
+- **soundcloud**
+    - [Adjust format sorting](https://github.com/yt-dlp/yt-dlp/commit/a2d0840739cddd585d24e0ce4796394fc8a4fa2e) ([#9584](https://github.com/yt-dlp/yt-dlp/issues/9584)) by [bashonly](https://github.com/bashonly)
+    - [Support cookies](https://github.com/yt-dlp/yt-dlp/commit/97362712a1f2b04e735bdf54f749ad99165a62fe) ([#9586](https://github.com/yt-dlp/yt-dlp/issues/9586)) by [bashonly](https://github.com/bashonly)
+    - [Support retries for API rate-limit](https://github.com/yt-dlp/yt-dlp/commit/246571ae1d867df8bf31a056bdf3bbbfd398366a) ([#9585](https://github.com/yt-dlp/yt-dlp/issues/9585)) by [bashonly](https://github.com/bashonly)
+- **thisoldhouse**: [Support Brightcove embeds](https://github.com/yt-dlp/yt-dlp/commit/0df63cce69026d2f4c0cbb4dd36163e83eac93dc) ([#9576](https://github.com/yt-dlp/yt-dlp/issues/9576)) by [bashonly](https://github.com/bashonly)
+- **tiktok**
+    - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/cb61e20c266facabb7a30f9ce53bd79dfc158475) ([#9548](https://github.com/yt-dlp/yt-dlp/issues/9548)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+    - [Prefer non-bytevc2 formats](https://github.com/yt-dlp/yt-dlp/commit/63f685f341f35f6f02b0368d1ba53bdb5b520410) ([#9575](https://github.com/yt-dlp/yt-dlp/issues/9575)) by [bashonly](https://github.com/bashonly)
+    - [Restore `carrier_region` API parameter](https://github.com/yt-dlp/yt-dlp/commit/fc53ec13ff1ee926a3e533a68cfca8acc887b661) ([#9637](https://github.com/yt-dlp/yt-dlp/issues/9637)) by [bashonly](https://github.com/bashonly)
+    - [Update API hostname](https://github.com/yt-dlp/yt-dlp/commit/8c05b3ebae23c5b444857549a85b84004c01a536) ([#9444](https://github.com/yt-dlp/yt-dlp/issues/9444)) by [bashonly](https://github.com/bashonly)
+- **twitch**: [Extract AV1 and HEVC formats](https://github.com/yt-dlp/yt-dlp/commit/02f93ff51b3ff9436d60c4993562b366eaae8851) ([#9158](https://github.com/yt-dlp/yt-dlp/issues/9158)) by [kasper93](https://github.com/kasper93)
+- **vkplay**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/b15b0c1d2106437ec61a5c436c543e8760eac160) ([#9636](https://github.com/yt-dlp/yt-dlp/issues/9636)) by [bashonly](https://github.com/bashonly)
+- **xvideos**: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/aa7e9ae4f48276bd5d0173966c77db9484f65a0a) ([#9502](https://github.com/yt-dlp/yt-dlp/issues/9502)) by [sta1us](https://github.com/sta1us)
+- **youtube**
+    - [Calculate more accurate `filesize`](https://github.com/yt-dlp/yt-dlp/commit/a25a424323267e3f6f9f63c0b62df499bd7b8d46) by [pukkandan](https://github.com/pukkandan)
+    - [Update `android` params](https://github.com/yt-dlp/yt-dlp/commit/e7b17fce14775bd2448695c8eb7379b8d31d3537) by [pukkandan](https://github.com/pukkandan)
+    - search: [Fix params for uncensored results](https://github.com/yt-dlp/yt-dlp/commit/17d248a58781e2588d18a5ebe00c441d10011fcd) ([#9456](https://github.com/yt-dlp/yt-dlp/issues/9456)) by [alb](https://github.com/alb), [pukkandan](https://github.com/pukkandan)
+
+#### Downloader changes
+- **ffmpeg**: [Accept output args from info dict](https://github.com/yt-dlp/yt-dlp/commit/9c42b7eef547e826e9fcc7beb6706a2523949d05) ([#9278](https://github.com/yt-dlp/yt-dlp/issues/9278)) by [bashonly](https://github.com/bashonly)
+
+#### Networking changes
+- [Respect `SSLKEYLOGFILE` environment variable](https://github.com/yt-dlp/yt-dlp/commit/79a451e5763eda8b10d00684d5d3378f3255ee01) ([#9543](https://github.com/yt-dlp/yt-dlp/issues/9543)) by [luiso1979](https://github.com/luiso1979)
+- **Request Handler**
+    - curlcffi: [Add support for `curl_cffi`](https://github.com/yt-dlp/yt-dlp/commit/52f5be1f1e0dc45bb397ab950f564721976a39bf) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+    - websockets: [Workaround race condition causing issues on PyPy](https://github.com/yt-dlp/yt-dlp/commit/e5d4f11104ce7ea1717a90eea82c0f7d230ea5d5) ([#9514](https://github.com/yt-dlp/yt-dlp/issues/9514)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Misc. changes
+- **build**
+    - [Do not include `curl_cffi` in `macos_legacy`](https://github.com/yt-dlp/yt-dlp/commit/b19ae095fdddd43c2a2c67d10fbe0d9a645bb98f) ([#9653](https://github.com/yt-dlp/yt-dlp/issues/9653)) by [bashonly](https://github.com/bashonly)
+    - [Optional dependencies cleanup](https://github.com/yt-dlp/yt-dlp/commit/58dd0f8d1eee6bc9fdc57f1923bed772fa3c946d) ([#9550](https://github.com/yt-dlp/yt-dlp/issues/9550)) by [bashonly](https://github.com/bashonly)
+    - [Print SHA sums to GHA logs](https://github.com/yt-dlp/yt-dlp/commit/e8032503b9517465b0e86d776fc1e60d8795d673) ([#9582](https://github.com/yt-dlp/yt-dlp/issues/9582)) by [bashonly](https://github.com/bashonly)
+    - [Update changelog for tarball and sdist](https://github.com/yt-dlp/yt-dlp/commit/17b96974a334688f76b57d350e07cae8cda46877) ([#9425](https://github.com/yt-dlp/yt-dlp/issues/9425)) by [bashonly](https://github.com/bashonly)
+- **cleanup**
+    - [Standardize `import datetime as dt`](https://github.com/yt-dlp/yt-dlp/commit/c305a25c1b16bcf7a5ec499c3b786ed1e2c748da) ([#8978](https://github.com/yt-dlp/yt-dlp/issues/8978)) by [pukkandan](https://github.com/pukkandan)
+    - ie: [No `from` stdlib imports in extractors](https://github.com/yt-dlp/yt-dlp/commit/e3a3ed8a981d9395c4859b6ef56cd02bc3148db2) by [pukkandan](https://github.com/pukkandan)
+    - Miscellaneous: [216f6a3](https://github.com/yt-dlp/yt-dlp/commit/216f6a3cb57824e6a3c859649ce058c199b1b247) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+- **docs**
+    - [Update yt-dlp tagline](https://github.com/yt-dlp/yt-dlp/commit/388c979ac63a8774339fac2516fe1cc852b4276e) ([#9481](https://github.com/yt-dlp/yt-dlp/issues/9481)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev)
+    - [Various manpage fixes](https://github.com/yt-dlp/yt-dlp/commit/df0e138fc02ae2764a44f2f59fc93c756c4d3ee2) by [leoheitmannruiz](https://github.com/leoheitmannruiz)
+- **test**
+    - [Workaround websocket server hanging](https://github.com/yt-dlp/yt-dlp/commit/f849d77ab54788446b995d256e1ee0894c4fb927) ([#9467](https://github.com/yt-dlp/yt-dlp/issues/9467)) by [coletdjnz](https://github.com/coletdjnz)
+    - `traversal`: [Separate traversal tests](https://github.com/yt-dlp/yt-dlp/commit/979ce2e786f2ee3fc783b6dc1ef4188d8805c923) ([#9574](https://github.com/yt-dlp/yt-dlp/issues/9574)) by [Grub4K](https://github.com/Grub4K)
+
+### 2024.03.10
+
+#### Core changes
+- [Add `--compat-options 2023`](https://github.com/yt-dlp/yt-dlp/commit/3725b4f0c93ca3943e6300013a9670e4ab757fda) ([#9084](https://github.com/yt-dlp/yt-dlp/issues/9084)) by [Grub4K](https://github.com/Grub4K) (With fixes in [ffff1bc](https://github.com/yt-dlp/yt-dlp/commit/ffff1bc6598fc7a9258e51bc153cab812467f9f9) by [pukkandan](https://github.com/pukkandan))
+- [Create `ydl._request_director` when needed](https://github.com/yt-dlp/yt-dlp/commit/069b2aedae2279668b6051627a81fc4fbd9c146a) by [pukkandan](https://github.com/pukkandan) (With fixes in [dbd8b1b](https://github.com/yt-dlp/yt-dlp/commit/dbd8b1bff9afd8f05f982bcd52c20bc173c266ca) by [Grub4K](https://github.com/Grub4K))
+- [Don't select storyboard formats as fallback](https://github.com/yt-dlp/yt-dlp/commit/d63eae7e7ffb1f3e733e552b9e5e82355bfba214) by [bashonly](https://github.com/bashonly)
+- [Handle `--load-info-json` format selection errors](https://github.com/yt-dlp/yt-dlp/commit/263a4b55ac17a796e8991ca8d2d86a3c349f8a60) ([#9392](https://github.com/yt-dlp/yt-dlp/issues/9392)) by [bashonly](https://github.com/bashonly)
+- [Warn user when not launching through shell on Windows](https://github.com/yt-dlp/yt-dlp/commit/6a6cdcd1824a14e3b336332c8f31f65497b8c4b8) ([#9250](https://github.com/yt-dlp/yt-dlp/issues/9250)) by [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev)
+- **cookies**
+    - [Fix `--cookies-from-browser` for `snap` Firefox](https://github.com/yt-dlp/yt-dlp/commit/cbed249aaa053a3f425b9bafc97f8dbd71c44487) ([#9016](https://github.com/yt-dlp/yt-dlp/issues/9016)) by [Grub4K](https://github.com/Grub4K)
+    - [Fix `--cookies-from-browser` with macOS Firefox profiles](https://github.com/yt-dlp/yt-dlp/commit/85b33f5c163f60dbd089a6b9bc2ba1366d3ddf93) ([#8909](https://github.com/yt-dlp/yt-dlp/issues/8909)) by [RalphORama](https://github.com/RalphORama)
+    - [Improve error message for Windows `--cookies-from-browser chrome` issue](https://github.com/yt-dlp/yt-dlp/commit/2792092afd367e39251ace1fb2819c855ab8919f) ([#9080](https://github.com/yt-dlp/yt-dlp/issues/9080)) by [Grub4K](https://github.com/Grub4K)
+- **plugins**: [Handle `PermissionError`](https://github.com/yt-dlp/yt-dlp/commit/9a8afadd172b7cab143f0049959fa64973589d94) ([#9229](https://github.com/yt-dlp/yt-dlp/issues/9229)) by [pukkandan](https://github.com/pukkandan), [syntaxsurge](https://github.com/syntaxsurge)
+- **utils**
+    - [Improve `repr` of `DateRange`, `match_filter_func`](https://github.com/yt-dlp/yt-dlp/commit/45491a2a30da4d1723cfa9288cb664813bb09afb) by [pukkandan](https://github.com/pukkandan)
+    - `traverse_obj`: [Support `xml.etree.ElementTree.Element`](https://github.com/yt-dlp/yt-dlp/commit/ffbd4f2a02fee387ea5e0a267ce32df5259111ac) ([#8911](https://github.com/yt-dlp/yt-dlp/issues/8911)) by [Grub4K](https://github.com/Grub4K)
+- **webvtt**: [Don't parse single fragment files](https://github.com/yt-dlp/yt-dlp/commit/f24e44e8cbd88ce338d52f594a19330f64d38b50) ([#9034](https://github.com/yt-dlp/yt-dlp/issues/9034)) by [seproDev](https://github.com/seproDev)
+
+#### Extractor changes
+- [Migrate commonly plural fields to lists](https://github.com/yt-dlp/yt-dlp/commit/104a7b5a46dc1805157fb4cc11c05876934d37c1) ([#8917](https://github.com/yt-dlp/yt-dlp/issues/8917)) by [llistochek](https://github.com/llistochek), [pukkandan](https://github.com/pukkandan) (With fixes in [b136e2a](https://github.com/yt-dlp/yt-dlp/commit/b136e2af341f7a88028aea4c5cd50efe2fa9b182) by [bashonly](https://github.com/bashonly))
+- [Support multi-period MPD streams](https://github.com/yt-dlp/yt-dlp/commit/4ce57d3b873c2887814cbec03d029533e82f7db5) ([#6654](https://github.com/yt-dlp/yt-dlp/issues/6654)) by [alard](https://github.com/alard), [pukkandan](https://github.com/pukkandan)
+- **abematv**
+    - [Fix extraction with cache](https://github.com/yt-dlp/yt-dlp/commit/c51316f8a69fbd0080f2720777d42ab438e254a3) ([#8895](https://github.com/yt-dlp/yt-dlp/issues/8895)) by [sefidel](https://github.com/sefidel)
+    - [Support login for playlists](https://github.com/yt-dlp/yt-dlp/commit/8226a3818f804478c756cf460baa9bf3a3b062a5) ([#8901](https://github.com/yt-dlp/yt-dlp/issues/8901)) by [sefidel](https://github.com/sefidel)
+- **adn**
+    - [Add support for German site](https://github.com/yt-dlp/yt-dlp/commit/5eb1458be4767385a9bf1d570ff08e46100cbaa2) ([#8708](https://github.com/yt-dlp/yt-dlp/issues/8708)) by [infanf](https://github.com/infanf)
+    - [Improve auth error handling](https://github.com/yt-dlp/yt-dlp/commit/9526b1f179d19f75284eceaa5e0ee381af18cf19) ([#9068](https://github.com/yt-dlp/yt-dlp/issues/9068)) by [infanf](https://github.com/infanf)
+- **aenetworks**: [Rating should be optional for AP extraction](https://github.com/yt-dlp/yt-dlp/commit/014cb5774d7afe624b6eb4e07f7be924b9e5e186) ([#9005](https://github.com/yt-dlp/yt-dlp/issues/9005)) by [agibson-fl](https://github.com/agibson-fl)
+- **altcensored**: channel: [Fix playlist extraction](https://github.com/yt-dlp/yt-dlp/commit/e28e135d6fd6a430fed3e20dfe1a8c8bbc5f9185) ([#9297](https://github.com/yt-dlp/yt-dlp/issues/9297)) by [marcdumais](https://github.com/marcdumais)
+- **amadeustv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e641aab7a61df7406df60ebfe0c77bd5186b2b41) ([#8744](https://github.com/yt-dlp/yt-dlp/issues/8744)) by [ArnauvGilotra](https://github.com/ArnauvGilotra)
+- **ant1newsgrembed**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1ed5ee2f045f717e814f84ba461dadc58e712266) ([#9191](https://github.com/yt-dlp/yt-dlp/issues/9191)) by [seproDev](https://github.com/seproDev)
+- **archiveorg**: [Fix format URL encoding](https://github.com/yt-dlp/yt-dlp/commit/3894ab9574748188bbacbd925a3971eda6fa2bb0) ([#9279](https://github.com/yt-dlp/yt-dlp/issues/9279)) by [bashonly](https://github.com/bashonly)
+- **ard**
+    - mediathek
+        - [Revert to using old id](https://github.com/yt-dlp/yt-dlp/commit/b6951271ac014761c9c317b9cecd5e8e139cfa7c) ([#8916](https://github.com/yt-dlp/yt-dlp/issues/8916)) by [Grub4K](https://github.com/Grub4K)
+        - [Support cookies to verify age](https://github.com/yt-dlp/yt-dlp/commit/c099ec9392b0283dde34b290d1a04158ad8eb882) ([#9037](https://github.com/yt-dlp/yt-dlp/issues/9037)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier)
+- **art19**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/999ea80beb053491089d256104c4188aced3110f) ([#9099](https://github.com/yt-dlp/yt-dlp/issues/9099)) by [seproDev](https://github.com/seproDev)
+- **artetv**: [Separate closed captions](https://github.com/yt-dlp/yt-dlp/commit/393b487a4ea391c44e811505ec98531031d7e81e) ([#8231](https://github.com/yt-dlp/yt-dlp/issues/8231)) by [Nicals](https://github.com/Nicals), [seproDev](https://github.com/seproDev)
+- **asobichannel**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/12f042740550c06552819374e2251deb7a519bab) ([#8700](https://github.com/yt-dlp/yt-dlp/issues/8700)) by [Snack-X](https://github.com/Snack-X)
+- **bigo**: [Fix JSON extraction](https://github.com/yt-dlp/yt-dlp/commit/85a2d07c1f82c2082b568963d1c32ad3fc848f61) ([#8893](https://github.com/yt-dlp/yt-dlp/issues/8893)) by [DmitryScaletta](https://github.com/DmitryScaletta)
+- **bilibili**
+    - [Add referer header and fix metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/1713c882730a928ac344c099874d2093fc2c8b51) ([#8832](https://github.com/yt-dlp/yt-dlp/issues/8832)) by [SirElderling](https://github.com/SirElderling) (With fixes in [f1570ab](https://github.com/yt-dlp/yt-dlp/commit/f1570ab84d5f49564256c620063d2d3e9ed4acf0) by [TobiX](https://github.com/TobiX))
+    - [Support `--no-playlist`](https://github.com/yt-dlp/yt-dlp/commit/e439693f729daf6fb15457baea1bca10ef5da34d) ([#9139](https://github.com/yt-dlp/yt-dlp/issues/9139)) by [c-basalt](https://github.com/c-basalt)
+- **bilibilisearch**: [Set cookie to fix extraction](https://github.com/yt-dlp/yt-dlp/commit/ffa017cfc5973b265c92248546fcf5020dc43eaf) ([#9119](https://github.com/yt-dlp/yt-dlp/issues/9119)) by [c-basalt](https://github.com/c-basalt)
+- **biliintl**: [Fix and improve subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/cf6413e840476c15e5b166dc2f7cc2a90a4a9aad) ([#7077](https://github.com/yt-dlp/yt-dlp/issues/7077)) by [dirkf](https://github.com/dirkf), [HobbyistDev](https://github.com/HobbyistDev), [itachi-19](https://github.com/itachi-19), [seproDev](https://github.com/seproDev)
+- **boosty**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/540b68298192874c75ad5ee4589bed64d02a7d55) ([#9144](https://github.com/yt-dlp/yt-dlp/issues/9144)) by [un-def](https://github.com/un-def)
+- **ccma**: [Extract 1080p DASH formats](https://github.com/yt-dlp/yt-dlp/commit/4253e3b7f483127bd812bdac02466f4a5b47ff34) ([#9130](https://github.com/yt-dlp/yt-dlp/issues/9130)) by [seproDev](https://github.com/seproDev)
+- **cctv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/6ad11fef65474bcf70f3a8556850d93c141e44a2) ([#9325](https://github.com/yt-dlp/yt-dlp/issues/9325)) by [src-tinkerer](https://github.com/src-tinkerer)
+- **chzzk**
+    - [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/ba6b0c8261e9f0a6373885736ff90a89dd1fb614) ([#8887](https://github.com/yt-dlp/yt-dlp/issues/8887)) by [DmitryScaletta](https://github.com/DmitryScaletta)
+    - live: [Support `--wait-for-video`](https://github.com/yt-dlp/yt-dlp/commit/804f2366117b7065552a1c3cddb9ec19b688a5c1) ([#9309](https://github.com/yt-dlp/yt-dlp/issues/9309)) by [hui1601](https://github.com/hui1601)
+- **cineverse**: [Detect when login required](https://github.com/yt-dlp/yt-dlp/commit/fc2cc626f07328a6c71b5e21853e4cfa7b1e6256) ([#9081](https://github.com/yt-dlp/yt-dlp/issues/9081)) by [garret1317](https://github.com/garret1317)
+- **cloudflarestream**
+    - [Extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/4d9dc0abe24ad5d9d22a16f40fc61137dcd103f7) ([#9007](https://github.com/yt-dlp/yt-dlp/issues/9007)) by [Bibhav48](https://github.com/Bibhav48)
+    - [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/f3d5face83f948c24bcb91e06d4fa6e8622d7d79) ([#9280](https://github.com/yt-dlp/yt-dlp/issues/9280)) by [bashonly](https://github.com/bashonly)
+    - [Improve embed detection](https://github.com/yt-dlp/yt-dlp/commit/464c919ea82aefdf35f138a1ab2dd0bb8fb7fd0e) ([#9287](https://github.com/yt-dlp/yt-dlp/issues/9287)) by [bashonly](https://github.com/bashonly)
+- **cloudycdn, lsm**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/5dda3b291f59f388f953337e9fb09a94b64aaf34) ([#8643](https://github.com/yt-dlp/yt-dlp/issues/8643)) by [Caesim404](https://github.com/Caesim404)
+- **cnbc**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/998dffb5a2343ec709b3d6bbf2bf019649080239) ([#8741](https://github.com/yt-dlp/yt-dlp/issues/8741)) by [gonzalezjo](https://github.com/gonzalezjo), [Noor-5](https://github.com/Noor-5), [ruiminggu](https://github.com/ruiminggu), [seproDev](https://github.com/seproDev), [zhijinwuu](https://github.com/zhijinwuu)
+- **craftsy**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/96f3924bac174f2fd401f86f78e77d7e0c5ee008) ([#9384](https://github.com/yt-dlp/yt-dlp/issues/9384)) by [bashonly](https://github.com/bashonly)
+- **crooksandliars**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/03536126d32bd861e38536371f0cd5f1b71dcb7a) ([#9192](https://github.com/yt-dlp/yt-dlp/issues/9192)) by [seproDev](https://github.com/seproDev)
+- **crtvg**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/785ab1af7f131e73444634ad57b39478651a43d3) ([#9404](https://github.com/yt-dlp/yt-dlp/issues/9404)) by [Xpl0itU](https://github.com/Xpl0itU)
+- **dailymotion**: [Support search](https://github.com/yt-dlp/yt-dlp/commit/11ffa92a61e5847b3dfa8975f91ecb3ac2178841) ([#8292](https://github.com/yt-dlp/yt-dlp/issues/8292)) by [drzraf](https://github.com/drzraf), [seproDev](https://github.com/seproDev)
+- **douyin**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9ff946645568e71046487571eefa9cb524a5189b) ([#9239](https://github.com/yt-dlp/yt-dlp/issues/9239)) by [114514ns](https://github.com/114514ns), [bashonly](https://github.com/bashonly) (With fixes in [e546e5d](https://github.com/yt-dlp/yt-dlp/commit/e546e5d3b33a50075e574a2e7b8eda7ea874d21e) by [bashonly](https://github.com/bashonly))
+- **duboku**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/d3d4187da90a6b85f4ebae4bb07693cc9b412d75) ([#9161](https://github.com/yt-dlp/yt-dlp/issues/9161)) by [DmitryScaletta](https://github.com/DmitryScaletta)
+- **dumpert**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/eedb38ce4093500e19279d50b708fb9c18bf4dbf) ([#9320](https://github.com/yt-dlp/yt-dlp/issues/9320)) by [rvsit](https://github.com/rvsit)
+- **elementorembed**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6171b050d70435008e64fa06aa6f19c4e5bec75f) ([#8948](https://github.com/yt-dlp/yt-dlp/issues/8948)) by [pompos02](https://github.com/pompos02), [seproDev](https://github.com/seproDev)
+- **eporner**: [Extract AV1 formats](https://github.com/yt-dlp/yt-dlp/commit/96d0f8c1cb8aec250c5614bfde6b5fb95f10819b) ([#9028](https://github.com/yt-dlp/yt-dlp/issues/9028)) by [michal-repo](https://github.com/michal-repo)
+- **errjupiter**
+    - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a514cc2feb1c3b265b19acab11487acad8bb3ab0) ([#8549](https://github.com/yt-dlp/yt-dlp/issues/8549)) by [glensc](https://github.com/glensc)
+    - [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/80ed8bdeba5a945f127ef9ab055a4823329a1210) ([#9218](https://github.com/yt-dlp/yt-dlp/issues/9218)) by [glensc](https://github.com/glensc)
+- **facebook**
+    - [Add new ID format](https://github.com/yt-dlp/yt-dlp/commit/cf9af2c7f1fedd881a157b3fbe725e5494b00924) ([#3824](https://github.com/yt-dlp/yt-dlp/issues/3824)) by [kclauhk](https://github.com/kclauhk), [Wikidepia](https://github.com/Wikidepia)
+    - [Improve extraction](https://github.com/yt-dlp/yt-dlp/commit/2e30b5567b5c6113d46b39163db5b044aea8667e) by [jingtra](https://github.com/jingtra), [ringus1](https://github.com/ringus1)
+    - [Improve thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/3c4d3ee491b0ec22ed3cade51d943d3d27141ba7) ([#9060](https://github.com/yt-dlp/yt-dlp/issues/9060)) by [kclauhk](https://github.com/kclauhk)
+    - [Set format HTTP chunk size](https://github.com/yt-dlp/yt-dlp/commit/5b68c478fb0b93ea6b8fac23f50e12217fa063db) ([#9058](https://github.com/yt-dlp/yt-dlp/issues/9058)) by [bashonly](https://github.com/bashonly), [kclauhk](https://github.com/kclauhk)
+    - [Support events](https://github.com/yt-dlp/yt-dlp/commit/9b5efaf86b99a2664fff9fc725d275f766c3221d) ([#9055](https://github.com/yt-dlp/yt-dlp/issues/9055)) by [kclauhk](https://github.com/kclauhk)
+    - [Support permalink URLs](https://github.com/yt-dlp/yt-dlp/commit/87286e93af949c4e6a0f8ba34af6a1ab5aa102b6) ([#9061](https://github.com/yt-dlp/yt-dlp/issues/9061)) by [kclauhk](https://github.com/kclauhk)
+    - ads: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a40b0070c2a00d3ed839897462171a82323aa875)
([#8870](https://github.com/yt-dlp/yt-dlp/issues/8870)) by [kclauhk](https://github.com/kclauhk) +- **flextv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/4f043479090dc8a7e06e0bb53691e5414320dfb2) ([#9178](https://github.com/yt-dlp/yt-dlp/issues/9178)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **floatplane**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/9cd90447907a59c8a2727583f4a755fb23ed8cd3) ([#8934](https://github.com/yt-dlp/yt-dlp/issues/8934)) by [chtk](https://github.com/chtk) +- **francetv** + - [Fix DAI livestreams](https://github.com/yt-dlp/yt-dlp/commit/e4fbe5f886a6693f2466877c12e99c30c5442ace) ([#9380](https://github.com/yt-dlp/yt-dlp/issues/9380)) by [bashonly](https://github.com/bashonly) + - [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/9749ac7fecbfda391afbadf2870797ce0e382622) ([#9333](https://github.com/yt-dlp/yt-dlp/issues/9333)) by [bashonly](https://github.com/bashonly) + - [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/ede624d1db649f5a4b61f8abbb746f365322de27) ([#9347](https://github.com/yt-dlp/yt-dlp/issues/9347)) by [bashonly](https://github.com/bashonly) +- **funk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/cd0443fb14e2ed805abb02792473457553a123d1) ([#9194](https://github.com/yt-dlp/yt-dlp/issues/9194)) by [seproDev](https://github.com/seproDev) +- **generic**: [Follow https redirects properly](https://github.com/yt-dlp/yt-dlp/commit/c8c9039e640495700f76a13496e3418bdd4382ba) ([#9121](https://github.com/yt-dlp/yt-dlp/issues/9121)) by [seproDev](https://github.com/seproDev) +- **getcourseru**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/4310b6650eeb5630295f4591b37720877878c57a) ([#8873](https://github.com/yt-dlp/yt-dlp/issues/8873)) by [divStar](https://github.com/divStar), [seproDev](https://github.com/seproDev) +- **gofile**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/77c2472ca1ef9050a66aa68bc5fa1bee88706c66) ([#9074](https://github.com/yt-dlp/yt-dlp/issues/9074)) by [jazz1611](https://github.com/jazz1611) +- **googledrive**: [Fix source file extraction](https://github.com/yt-dlp/yt-dlp/commit/5498729c59b03a9511c64552da3ba2f802166f8d) ([#8990](https://github.com/yt-dlp/yt-dlp/issues/8990)) by [jazz1611](https://github.com/jazz1611) +- **goplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7e90e34fa4617b53f8c8a9e69f460508cb1f51b0) ([#6654](https://github.com/yt-dlp/yt-dlp/issues/6654)) by [alard](https://github.com/alard) +- **gopro**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4a07a455bbf7acf87550053bbba949c828e350ba) ([#9019](https://github.com/yt-dlp/yt-dlp/issues/9019)) by [stilor](https://github.com/stilor) +- **ilpost**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/aa5dcc4ee65916a36cbe1b1b5b29b9110c3163ed) ([#9001](https://github.com/yt-dlp/yt-dlp/issues/9001)) by [CapacitorSet](https://github.com/CapacitorSet) +- **jiosaavnsong**: [Support more bitrates](https://github.com/yt-dlp/yt-dlp/commit/5154dc0a687528f995cde22b5ff63f82c740e98a) ([#8834](https://github.com/yt-dlp/yt-dlp/issues/8834)) by [alien-developers](https://github.com/alien-developers), [bashonly](https://github.com/bashonly) +- **kukululive**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/20cdad5a2c0499d5a6746f5466a2ab0c97b75884) ([#8877](https://github.com/yt-dlp/yt-dlp/issues/8877)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **lefigarovideoembed**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/9401736fd08767c58af45a1e36ff5929c5fa1ac9) ([#9198](https://github.com/yt-dlp/yt-dlp/issues/9198)) by [seproDev](https://github.com/seproDev) +- **linkedin**: [Fix metadata and extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/017adb28e7fe7b8c8fc472332d86740f31141519) ([#9056](https://github.com/yt-dlp/yt-dlp/issues/9056)) by [barsnick](https://github.com/barsnick) +- **magellantv**: [Support episodes](https://github.com/yt-dlp/yt-dlp/commit/3dc9232e1aa58fe3c2d8cafb50e8162d6f0e891e) ([#9199](https://github.com/yt-dlp/yt-dlp/issues/9199)) by [seproDev](https://github.com/seproDev) +- **magentamusik**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5e2e24b2c5795756d81785b06b10723ddb6db7b2) ([#7790](https://github.com/yt-dlp/yt-dlp/issues/7790)) by [pwaldhauer](https://github.com/pwaldhauer), [seproDev](https://github.com/seproDev) +- **medaltv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/02e343f6ef6d7b3f9087ff69e4a1db0b4b4a5c5d) ([#9098](https://github.com/yt-dlp/yt-dlp/issues/9098)) by [Danish-H](https://github.com/Danish-H) +- **mlbarticle**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/50e06e21a68e336198198bda332b8e7d2314f201) ([#9021](https://github.com/yt-dlp/yt-dlp/issues/9021)) by [HobbyistDev](https://github.com/HobbyistDev) +- **motherless**: [Support uploader playlists](https://github.com/yt-dlp/yt-dlp/commit/9f1e9dab21bbe651544c8f4663b0e615dc450e4d) ([#8994](https://github.com/yt-dlp/yt-dlp/issues/8994)) by [dasidiot](https://github.com/dasidiot) +- **mujrozhlas**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/4170b3d7120e06db3391eef39c5add18a1ddf2c3) ([#9306](https://github.com/yt-dlp/yt-dlp/issues/9306)) by [bashonly](https://github.com/bashonly) +- **mx3**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/5a63454b3637b3603434026cddfeac509218b90e) ([#8736](https://github.com/yt-dlp/yt-dlp/issues/8736)) by [martinxyz](https://github.com/martinxyz) +- **naver**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/a281beba8d8f007cf220f96dd1d9412bb070c7d8) ([#8883](https://github.com/yt-dlp/yt-dlp/issues/8883)) by [seproDev](https://github.com/seproDev) +- **nebula**: [Support podcasts](https://github.com/yt-dlp/yt-dlp/commit/0de09c5b9ed619d4a93d7c451c6ddff0381de808) ([#9140](https://github.com/yt-dlp/yt-dlp/issues/9140)) by [c-basalt](https://github.com/c-basalt), [seproDev](https://github.com/seproDev) +- **nerdcubedfeed**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/29a74a6126101aabaa1726ae41b1ca55cf26e7a7) ([#9269](https://github.com/yt-dlp/yt-dlp/issues/9269)) by [seproDev](https://github.com/seproDev) +- **newgrounds** + - [Fix login and clean up extraction](https://github.com/yt-dlp/yt-dlp/commit/0fcefb92f3ebfc5cada19c1e85a715f020d0f333) ([#9356](https://github.com/yt-dlp/yt-dlp/issues/9356)) by [Grub4K](https://github.com/Grub4K), [mrmedieval](https://github.com/mrmedieval) + - user: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3e083191cdc34dd8c482da9a9b4bc682f824cb9d) ([#9046](https://github.com/yt-dlp/yt-dlp/issues/9046)) by [u-spec-png](https://github.com/u-spec-png) +- **nfb**: [Add support for onf.ca and series](https://github.com/yt-dlp/yt-dlp/commit/4b8b0dded8c65cd5b2ab2e858058ba98c9bf49ff) ([#8997](https://github.com/yt-dlp/yt-dlp/issues/8997)) by [bashonly](https://github.com/bashonly), [rrgomes](https://github.com/rrgomes) +- **nhkradiru**: [Extract extended 
description](https://github.com/yt-dlp/yt-dlp/commit/4392447d9404e3c25cfeb8f5bdfff31b0448da39) ([#9162](https://github.com/yt-dlp/yt-dlp/issues/9162)) by [garret1317](https://github.com/garret1317) +- **nhkradirulive**: [Make metadata extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/5af1f19787f7d652fce72dd3ab9536cdd980fe85) ([#8956](https://github.com/yt-dlp/yt-dlp/issues/8956)) by [garret1317](https://github.com/garret1317) +- **niconico** + - [Remove legacy danmaku extraction](https://github.com/yt-dlp/yt-dlp/commit/974d444039c8bbffb57265c6792cd52d169fe1b9) ([#9209](https://github.com/yt-dlp/yt-dlp/issues/9209)) by [pzhlkj6612](https://github.com/pzhlkj6612) + - [Support DMS formats](https://github.com/yt-dlp/yt-dlp/commit/aa13a8e3dd3b698cc40ec438988b1ad834e11a41) ([#9282](https://github.com/yt-dlp/yt-dlp/issues/9282)) by [pzhlkj6612](https://github.com/pzhlkj6612), [xpadev-net](https://github.com/xpadev-net) (With fixes in [40966e8](https://github.com/yt-dlp/yt-dlp/commit/40966e8da27bbf770dacf9be9363fcc3ad72cc9f) by [pzhlkj6612](https://github.com/pzhlkj6612)) +- **ninaprotocol**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/62c65bfaf81e04e6746f6fdbafe384eb3edddfbc) ([#8946](https://github.com/yt-dlp/yt-dlp/issues/8946)) by [RaduManole](https://github.com/RaduManole), [seproDev](https://github.com/seproDev) +- **ninenews**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/43694ce13c5a9f1afca8b02b8b2b9b1576d6503d) ([#8840](https://github.com/yt-dlp/yt-dlp/issues/8840)) by [SirElderling](https://github.com/SirElderling) +- **nova**: [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/c168d8791d0974a8a8fcb3b4a4bc2d830df51622) ([#9221](https://github.com/yt-dlp/yt-dlp/issues/9221)) by [seproDev](https://github.com/seproDev) +- **ntvru**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7a29cbbd5fd7363e7e8535ee1506b7052465d13f) ([#9276](https://github.com/yt-dlp/yt-dlp/issues/9276)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf) +- **nuum**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/acaf806c15f0a802ba286c23af02a10cf4bd4731) ([#8868](https://github.com/yt-dlp/yt-dlp/issues/8868)) by [DmitryScaletta](https://github.com/DmitryScaletta), [seproDev](https://github.com/seproDev) +- **nytimes** + - [Extract timestamp](https://github.com/yt-dlp/yt-dlp/commit/05420227aaab60a39c0f9ade069c5862be36b1fa) ([#9142](https://github.com/yt-dlp/yt-dlp/issues/9142)) by [SirElderling](https://github.com/SirElderling) + - [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/07256b9fee23960799024b95d5972abc7174aa81) ([#9075](https://github.com/yt-dlp/yt-dlp/issues/9075)) by [SirElderling](https://github.com/SirElderling) +- **onefootball**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/644738ddaa45428cb0babd41ead22454e5a2545e) ([#9222](https://github.com/yt-dlp/yt-dlp/issues/9222)) by [seproDev](https://github.com/seproDev) +- **openrec**: [Pass referer for m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/f591e605dfee4085ec007d6d056c943cbcacc429) ([#9253](https://github.com/yt-dlp/yt-dlp/issues/9253)) by [fireattack](https://github.com/fireattack) +- **orf**: on: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a0d50aabc5462aee302bd3f2663d3a3554875789) ([#9113](https://github.com/yt-dlp/yt-dlp/issues/9113)) by [HobbyistDev](https://github.com/HobbyistDev) +- **patreon**: [Fix embedded HLS extraction](https://github.com/yt-dlp/yt-dlp/commit/f0e8bc7c60b61fe18b63116c975609d76b904771) 
([#8993](https://github.com/yt-dlp/yt-dlp/issues/8993)) by [johnvictorfs](https://github.com/johnvictorfs) +- **peertube**: [Update instances](https://github.com/yt-dlp/yt-dlp/commit/35d96982f1033e36215d323317981ee17e8ab0d5) ([#9070](https://github.com/yt-dlp/yt-dlp/issues/9070)) by [Chocobozzz](https://github.com/Chocobozzz) +- **piapro**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/8e6e3651727b0b85764857fc6329fe5e0a3f00de) ([#8999](https://github.com/yt-dlp/yt-dlp/issues/8999)) by [FinnRG](https://github.com/FinnRG) +- **playsuisse**: [Add login support](https://github.com/yt-dlp/yt-dlp/commit/cae6e461073fb7c32fd32052a3e6721447c469bc) ([#9077](https://github.com/yt-dlp/yt-dlp/issues/9077)) by [chkuendig](https://github.com/chkuendig) +- **pornhub**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/de954c1b4d3a6db8a6525507e65303c7bb03f39f) ([#9227](https://github.com/yt-dlp/yt-dlp/issues/9227)) by [feederbox826](https://github.com/feederbox826) +- **pr0gramm**: [Enable POL filter and provide tags without login](https://github.com/yt-dlp/yt-dlp/commit/5f25f348f9eb5db842b1ec6799f95bebb7ba35a7) ([#9051](https://github.com/yt-dlp/yt-dlp/issues/9051)) by [Grub4K](https://github.com/Grub4K) +- **prankcastpost**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a2bac6b7adb7b0e955125838e20bb39eece630ce) ([#8933](https://github.com/yt-dlp/yt-dlp/issues/8933)) by [columndeeply](https://github.com/columndeeply) +- **radiko**: [Extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/e3ce2b385ec1f03fac9d4210c57fda77134495fc) ([#9115](https://github.com/yt-dlp/yt-dlp/issues/9115)) by [YoshichikaAAA](https://github.com/YoshichikaAAA) +- **rai** + - [Filter unavailable formats](https://github.com/yt-dlp/yt-dlp/commit/f78814923748277e7067b796f25870686fb46205) ([#9189](https://github.com/yt-dlp/yt-dlp/issues/9189)) by [nixxo](https://github.com/nixxo) + - [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/8f423cf8051fbfeedd57cca00d106012e6e86a97) ([#9291](https://github.com/yt-dlp/yt-dlp/issues/9291)) by [nixxo](https://github.com/nixxo) +- **redcdnlivx, sejm**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/fcaa2e735b00b15a2b0d9f55f4187c654b4b5b39) ([#8676](https://github.com/yt-dlp/yt-dlp/issues/8676)) by [selfisekai](https://github.com/selfisekai) +- **redtube** + - [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/c91d8b1899403daff6fc15206ad32de8db17fb8f) ([#9076](https://github.com/yt-dlp/yt-dlp/issues/9076)) by [jazz1611](https://github.com/jazz1611) + - [Support redtube.com.br URLs](https://github.com/yt-dlp/yt-dlp/commit/4a6ff0b47a700dee3ee5c54804c31965308479ae) ([#9103](https://github.com/yt-dlp/yt-dlp/issues/9103)) by [jazz1611](https://github.com/jazz1611) +- **ridehome**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/cd7086c0d54ec1d7e02a30bd5bd934bdb2c54642) ([#8875](https://github.com/yt-dlp/yt-dlp/issues/8875)) by [SirElderling](https://github.com/SirElderling) +- **rinsefmartistplaylist**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1a36dbad712d359ec1c5b73d9bbbe562c03e9660) ([#8794](https://github.com/yt-dlp/yt-dlp/issues/8794)) by [SirElderling](https://github.com/SirElderling) +- **roosterteeth** + - [Add Brightcove fallback](https://github.com/yt-dlp/yt-dlp/commit/b2cc150ad83ba20ceb2d6e73d09854eed3c2d05c) ([#9403](https://github.com/yt-dlp/yt-dlp/issues/9403)) by [bashonly](https://github.com/bashonly) + - [Extract ad-free 
streams](https://github.com/yt-dlp/yt-dlp/commit/dd29e6e5fdf0f3758cb0829e73749832768f1a4e) ([#9355](https://github.com/yt-dlp/yt-dlp/issues/9355)) by [jkmartindale](https://github.com/jkmartindale) + - [Extract release date and timestamp](https://github.com/yt-dlp/yt-dlp/commit/dfd8c0b69683b1c11beea039a96dd2949026c1d7) ([#9393](https://github.com/yt-dlp/yt-dlp/issues/9393)) by [bashonly](https://github.com/bashonly) + - [Support bonus features](https://github.com/yt-dlp/yt-dlp/commit/8993721ecb34867b52b79f6e92b233008d1cbe78) ([#9406](https://github.com/yt-dlp/yt-dlp/issues/9406)) by [Bl4Cc4t](https://github.com/Bl4Cc4t) +- **rule34video** + - [Extract `creators`](https://github.com/yt-dlp/yt-dlp/commit/3d9dc2f3590e10abf1561ebdaed96734a740587c) ([#9258](https://github.com/yt-dlp/yt-dlp/issues/9258)) by [gmes78](https://github.com/gmes78) + - [Extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/fee2d8d9c38f9b5f0a8df347c1e698983339c34d) ([#7416](https://github.com/yt-dlp/yt-dlp/issues/7416)) by [gmes78](https://github.com/gmes78) + - [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c0ecceeefe6ebd27452d9d8f20658f83ae121d04) ([#9044](https://github.com/yt-dlp/yt-dlp/issues/9044)) by [gmes78](https://github.com/gmes78) +- **rumblechannel**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0023af81fbce01984f35b34ecaf8562739831227) ([#9092](https://github.com/yt-dlp/yt-dlp/issues/9092)) by [Pranaxcau](https://github.com/Pranaxcau), [vista-narvas](https://github.com/vista-narvas) +- **screencastify**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/0bee29493ca8f91a0055a3706c7c94f5860188df) ([#9232](https://github.com/yt-dlp/yt-dlp/issues/9232)) by [seproDev](https://github.com/seproDev) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ddd4b5e10a653bee78e656107710021c1b82934c) ([#8938](https://github.com/yt-dlp/yt-dlp/issues/8938)) by [diman8](https://github.com/diman8) +- **swearnet**: [Raise for login required](https://github.com/yt-dlp/yt-dlp/commit/b05640d532c43a52c0a0da096bb2dbd51e105ec0) ([#9281](https://github.com/yt-dlp/yt-dlp/issues/9281)) by [bashonly](https://github.com/bashonly) +- **tiktok**: [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/d9b4154cbcb979d7e30af3a73b1bee422aae5aa3) ([#9327](https://github.com/yt-dlp/yt-dlp/issues/9327)) by [bashonly](https://github.com/bashonly) +- **trtworld**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/8ab84650837e58046430c9f4b615c56a8886e071) ([#8701](https://github.com/yt-dlp/yt-dlp/issues/8701)) by [ufukk](https://github.com/ufukk) +- **tvp**: [Support livestreams](https://github.com/yt-dlp/yt-dlp/commit/882e3b753c79c7799ce135c3a5edb72494b576af) ([#8860](https://github.com/yt-dlp/yt-dlp/issues/8860)) by [selfisekai](https://github.com/selfisekai) +- **twitch**: [Fix m3u8 extraction](https://github.com/yt-dlp/yt-dlp/commit/5b8c69ae04444a4c80a5a99917e40f75a116c3b8) ([#8960](https://github.com/yt-dlp/yt-dlp/issues/8960)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **twitter** + - [Extract bitrate for HLS audio formats](https://github.com/yt-dlp/yt-dlp/commit/28e53d60df9b8aadd52a93504e30e885c9c35262) ([#9257](https://github.com/yt-dlp/yt-dlp/issues/9257)) by [bashonly](https://github.com/bashonly) + - [Extract numeric `channel_id`](https://github.com/yt-dlp/yt-dlp/commit/55f1833376505ed1e4be0516b09bb3ea4425e8a4) ([#9263](https://github.com/yt-dlp/yt-dlp/issues/9263)) by [bashonly](https://github.com/bashonly) +- **txxx**: [Extract 
thumbnails](https://github.com/yt-dlp/yt-dlp/commit/d79c7e9937c388c68b722ab7450960e43ef776d6) ([#9063](https://github.com/yt-dlp/yt-dlp/issues/9063)) by [shmohawk](https://github.com/shmohawk) +- **utreon**: [Support playeur.com](https://github.com/yt-dlp/yt-dlp/commit/41d6b61e9852a5b97f47cc8a7718b31fb23f0aea) ([#9182](https://github.com/yt-dlp/yt-dlp/issues/9182)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **vbox7**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/67bb70cd700c8d4c3149cd9e0539a5f32c3d1ce6) ([#9100](https://github.com/yt-dlp/yt-dlp/issues/9100)) by [seproDev](https://github.com/seproDev) +- **viewlift**: [Add support for chorki.com](https://github.com/yt-dlp/yt-dlp/commit/41b6cdb4197aaf7ad82bdad6885eb5d5c64acd74) ([#9095](https://github.com/yt-dlp/yt-dlp/issues/9095)) by [NurTasin](https://github.com/NurTasin) +- **vimeo** + - [Extract `live_status` and `release_timestamp`](https://github.com/yt-dlp/yt-dlp/commit/f0426e9ca57dd14b82e6c13afc17947614f1e8eb) ([#9290](https://github.com/yt-dlp/yt-dlp/issues/9290)) by [pzhlkj6612](https://github.com/pzhlkj6612) + - [Fix API headers](https://github.com/yt-dlp/yt-dlp/commit/8e765755f7f4909e1b535e61b7376b2d66e1ba6a) ([#9125](https://github.com/yt-dlp/yt-dlp/issues/9125)) by [bashonly](https://github.com/bashonly) + - [Fix login](https://github.com/yt-dlp/yt-dlp/commit/2e8de097ad82da378e97005e8f1ff7e5aebca585) ([#9274](https://github.com/yt-dlp/yt-dlp/issues/9274)) by [bashonly](https://github.com/bashonly) +- **viously**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/95e82347b398d8bb160767cdd975edecd62cbabd) ([#8927](https://github.com/yt-dlp/yt-dlp/issues/8927)) by [nbr23](https://github.com/nbr23), [seproDev](https://github.com/seproDev) +- **youtube** + - [Better error when all player responses are skipped](https://github.com/yt-dlp/yt-dlp/commit/5eedc208ec89d6284777060c94aadd06502338b9) ([#9083](https://github.com/yt-dlp/yt-dlp/issues/9083)) by [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) + - [Bump Android and iOS client versions](https://github.com/yt-dlp/yt-dlp/commit/413d3675804599bc8fe419c19e36490fd8f0b30f) ([#9317](https://github.com/yt-dlp/yt-dlp/issues/9317)) by [bashonly](https://github.com/bashonly) + - [Further bump client versions](https://github.com/yt-dlp/yt-dlp/commit/7aad06541e543fa3452d3d2513e6f079aad1f99b) ([#9395](https://github.com/yt-dlp/yt-dlp/issues/9395)) by [bashonly](https://github.com/bashonly) + - tab: [Fix `tags` extraction](https://github.com/yt-dlp/yt-dlp/commit/8828f4576bd862438d4fbf634f1d6ab18a217b0e) ([#9413](https://github.com/yt-dlp/yt-dlp/issues/9413)) by [x11x](https://github.com/x11x) +- **zenporn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f00c0def7434fac3c88503c2a77c4b2419b8e5ca) ([#8509](https://github.com/yt-dlp/yt-dlp/issues/8509)) by [SirElderling](https://github.com/SirElderling) +- **zetland**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2f4b57594673035a59d72f7667588da848820034) ([#9116](https://github.com/yt-dlp/yt-dlp/issues/9116)) by [HobbyistDev](https://github.com/HobbyistDev) + +#### Downloader changes +- **http**: [Reset resume length to handle `FileNotFoundError`](https://github.com/yt-dlp/yt-dlp/commit/2d91b9845621639c53dca7ee9d3d954f3624ba18) ([#8399](https://github.com/yt-dlp/yt-dlp/issues/8399)) by [boredzo](https://github.com/boredzo) + +#### Networking changes +- [Remove `_CompatHTTPError`](https://github.com/yt-dlp/yt-dlp/commit/811d298b231cfa29e75c321b23a91d1c2b17602c) 
([#8871](https://github.com/yt-dlp/yt-dlp/issues/8871)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler** + - [Remove additional logging handlers on close](https://github.com/yt-dlp/yt-dlp/commit/0085e2bab8465ee7d46d16fcade3ed5e96cc8a48) ([#9032](https://github.com/yt-dlp/yt-dlp/issues/9032)) by [coletdjnz](https://github.com/coletdjnz) + - requests: [Apply `remove_dot_segments` to absolute redirect locations](https://github.com/yt-dlp/yt-dlp/commit/35f4f764a786685ea45d84abe1cf1ad3847f4c97) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Add `default` optional dependency group](https://github.com/yt-dlp/yt-dlp/commit/cf91400a1dd6cc99b11a6d163e1af73b64d618c9) ([#9295](https://github.com/yt-dlp/yt-dlp/issues/9295)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Add transitional `setup.py` and `pyinst.py`](https://github.com/yt-dlp/yt-dlp/commit/0abf2f1f153ab47990edbeee3477dc55f74c7f89) ([#9296](https://github.com/yt-dlp/yt-dlp/issues/9296)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) + - [Bump `actions/upload-artifact` to v4 and adjust workflows](https://github.com/yt-dlp/yt-dlp/commit/3876429d72afb35247f4b2531eb9b16cfc7e0968) by [bashonly](https://github.com/bashonly) + - [Bump `conda-incubator/setup-miniconda` to v3](https://github.com/yt-dlp/yt-dlp/commit/b0059f0413a6ba6ab0a3aec1f00188ce083cd8bf) by [bashonly](https://github.com/bashonly) + - [Fix `secretstorage` for ARM builds](https://github.com/yt-dlp/yt-dlp/commit/920397634d1e84e76d2cb897bd6d69ba0c6bd5ca) by [bashonly](https://github.com/bashonly) + - [Migrate to `pyproject.toml` and `hatchling`](https://github.com/yt-dlp/yt-dlp/commit/775cde82dc5b1dc64ab0539a92dd8c7ba6c0ad33) by [bashonly](https://github.com/bashonly) (With fixes in [43cfd46](https://github.com/yt-dlp/yt-dlp/commit/43cfd462c0d01eff22c1d4290aeb96eb1ea2c0e1)) + - [Move bundle scripts into `bundle` submodule](https://github.com/yt-dlp/yt-dlp/commit/a1b778428991b1779203bac243ef4e9b6baea90c) by [bashonly](https://github.com/bashonly) + - [Support failed build job re-runs](https://github.com/yt-dlp/yt-dlp/commit/eabbccc439720fba381919a88be4fe4d96464cbd) ([#9277](https://github.com/yt-dlp/yt-dlp/issues/9277)) by [bashonly](https://github.com/bashonly) + - Makefile + - [Add automated `CODE_FOLDERS` and `CODE_FILES`](https://github.com/yt-dlp/yt-dlp/commit/868d2f60a7cb59b410c8cbfb452cbdb072687b81) by [bashonly](https://github.com/bashonly) + - [Ensure compatibility with BSD `make`](https://github.com/yt-dlp/yt-dlp/commit/beaa1a44554d04d9fe63a743a5bb4431ca778f28) ([#9210](https://github.com/yt-dlp/yt-dlp/issues/9210)) by [bashonly](https://github.com/bashonly) (With fixes in [73fcfa3](https://github.com/yt-dlp/yt-dlp/commit/73fcfa39f59113a8728249de2c4cee3025f17dc2)) + - [Fix man pages generated by `pandoc>=3`](https://github.com/yt-dlp/yt-dlp/commit/fb44020fa98e47620b3aa1dab94b4c5b7bfb40bd) ([#7047](https://github.com/yt-dlp/yt-dlp/issues/7047)) by [t-nil](https://github.com/t-nil) +- **ci**: [Bump `actions/setup-python` to v5](https://github.com/yt-dlp/yt-dlp/commit/b14e818b37f62e3224da157b3ad768b3f0815fcd) by [bashonly](https://github.com/bashonly) +- **cleanup** + - [Build files cleanup](https://github.com/yt-dlp/yt-dlp/commit/867f637b95b342e1cb9f1dc3c6cf0ffe727187ce) by [bashonly](https://github.com/bashonly) + - [Fix infodict returned 
fields](https://github.com/yt-dlp/yt-dlp/commit/f4f9f6d00edcac6d4eb2b3fb78bf81326235d492) ([#8906](https://github.com/yt-dlp/yt-dlp/issues/8906)) by [seproDev](https://github.com/seproDev) + - [Fix typo in README.md](https://github.com/yt-dlp/yt-dlp/commit/292d60b1ed3b9fe5bcb2775a894cca99b0f9473e) ([#8894](https://github.com/yt-dlp/yt-dlp/issues/8894)) by [antonkesy](https://github.com/antonkesy) + - [Mark broken and remove dead extractors](https://github.com/yt-dlp/yt-dlp/commit/df773c3d5d1cc1f877cf8582f0072e386fc49318) ([#9238](https://github.com/yt-dlp/yt-dlp/issues/9238)) by [seproDev](https://github.com/seproDev) + - [Match both `http` and `https` in `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/a687226b48f71b874fa18b0165ec528d591f53fb) ([#8968](https://github.com/yt-dlp/yt-dlp/issues/8968)) by [seproDev](https://github.com/seproDev) + - [Remove unused code](https://github.com/yt-dlp/yt-dlp/commit/ed3bb2b0a12c44334e0d09481752dabf2ca1dc13) ([#8968](https://github.com/yt-dlp/yt-dlp/issues/8968)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) + - Miscellaneous + - [93240fc](https://github.com/yt-dlp/yt-dlp/commit/93240fc1848de4a94f25844c96e0dcd282ef1d3b) by [bashonly](https://github.com/bashonly), [Grub4k](https://github.com/Grub4k), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) + - [615a844](https://github.com/yt-dlp/yt-dlp/commit/615a84447e8322720be77a0e64298d7f42848693) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- **devscripts** + - `install_deps`: [Add script and migrate to it](https://github.com/yt-dlp/yt-dlp/commit/b8a433aaca86b15cb9f1a451b0f69371d2fc22a9) by [bashonly](https://github.com/bashonly) + - `tomlparse`: [Add makeshift toml parser](https://github.com/yt-dlp/yt-dlp/commit/fd647775e27e030ab17387c249e2ebeba68f8ff0) by [Grub4K](https://github.com/Grub4K) +- **docs**: [Misc Cleanup](https://github.com/yt-dlp/yt-dlp/commit/47ab66db0f083a76c7fba0f6e136b21dd5a93e3b) ([#8977](https://github.com/yt-dlp/yt-dlp/issues/8977)) by [Arthurszzz](https://github.com/Arthurszzz), [bashonly](https://github.com/bashonly), [Grub4k](https://github.com/Grub4k), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- **test** + - [Skip source address tests if the address cannot be bound to](https://github.com/yt-dlp/yt-dlp/commit/69d31914952dd33082ac7019c6f76b43c45b9d06) ([#8900](https://github.com/yt-dlp/yt-dlp/issues/8900)) by [coletdjnz](https://github.com/coletdjnz) + - websockets: [Fix timeout test on Windows](https://github.com/yt-dlp/yt-dlp/commit/ac340d0745a9de5d494033e3507ef624ba25add3) ([#9344](https://github.com/yt-dlp/yt-dlp/issues/9344)) by [seproDev](https://github.com/seproDev) + +### 2023.12.30 + +#### Core changes +- [Fix format selection parse error for CPython 3.12](https://github.com/yt-dlp/yt-dlp/commit/00cdda4f6fe18712ced13dbc64b7ea10f323e268) ([#8797](https://github.com/yt-dlp/yt-dlp/issues/8797)) by [Grub4K](https://github.com/Grub4K) +- [Let `read_stdin` obey `--quiet`](https://github.com/yt-dlp/yt-dlp/commit/a174c453ee1e853c584ceadeac17eef2bd433dc5) by [pukkandan](https://github.com/pukkandan) +- [Merged with youtube-dl be008e6](https://github.com/yt-dlp/yt-dlp/commit/65de7d204ce88c0225df1321060304baab85dbd8) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf), [Grub4K](https://github.com/Grub4K) +- [Parse `release_year` from 
`release_date`](https://github.com/yt-dlp/yt-dlp/commit/1732eccc0a40256e076bf0435a29f0f1d8419280) ([#8524](https://github.com/yt-dlp/yt-dlp/issues/8524)) by [seproDev](https://github.com/seproDev) +- [Release workflow and Updater cleanup](https://github.com/yt-dlp/yt-dlp/commit/632b8ee54eb2df8ac6e20746a0bd95b7ebb053aa) ([#8640](https://github.com/yt-dlp/yt-dlp/issues/8640)) by [bashonly](https://github.com/bashonly) +- [Remove Python 3.7 support](https://github.com/yt-dlp/yt-dlp/commit/f4b95acafcd69a50040730dfdf732e797278fdcc) ([#8361](https://github.com/yt-dlp/yt-dlp/issues/8361)) by [bashonly](https://github.com/bashonly) +- [Support `NO_COLOR` environment variable](https://github.com/yt-dlp/yt-dlp/commit/a0b19d319a6ce8b7059318fa17a34b144fde1785) ([#8385](https://github.com/yt-dlp/yt-dlp/issues/8385)) by [Grub4K](https://github.com/Grub4K), [prettykool](https://github.com/prettykool) +- **outtmpl**: [Support multiplication](https://github.com/yt-dlp/yt-dlp/commit/993edd3f6e17e966c763bc86dc34125445cec6b6) by [pukkandan](https://github.com/pukkandan) +- **utils**: `traverse_obj`: [Move `is_user_input` into output template](https://github.com/yt-dlp/yt-dlp/commit/0b6f829b1dfda15d3c1d7d1fbe4ea6102c26dd24) ([#8673](https://github.com/yt-dlp/yt-dlp/issues/8673)) by [Grub4K](https://github.com/Grub4K) +- **webvtt**: [Allow spaces before newlines for CueBlock](https://github.com/yt-dlp/yt-dlp/commit/15f22b4880b6b3f71f350c64d70976ae65b9f1ca) ([#7681](https://github.com/yt-dlp/yt-dlp/issues/7681)) by [TSRBerry](https://github.com/TSRBerry) (With fixes in [298230e](https://github.com/yt-dlp/yt-dlp/commit/298230e550886b746c266724dd701d842ca2696e) by [pukkandan](https://github.com/pukkandan)) + +#### Extractor changes +- [Add `media_type` field](https://github.com/yt-dlp/yt-dlp/commit/e370f9ec36972d06100a3db893b397bfc1b07b4d) by [trainman261](https://github.com/trainman261) +- [Extract from `media` elements in SMIL manifests](https://github.com/yt-dlp/yt-dlp/commit/ddb2d7588bea48bae965dbfabe6df6550c9d3d43) ([#8504](https://github.com/yt-dlp/yt-dlp/issues/8504)) by [seproDev](https://github.com/seproDev) +- **abematv**: [Fix season metadata](https://github.com/yt-dlp/yt-dlp/commit/cc07f5cc85d9e2a6cd0bedb9d961665eea0d6047) ([#8607](https://github.com/yt-dlp/yt-dlp/issues/8607)) by [middlingphys](https://github.com/middlingphys) +- **allstar**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/3237f8ba29fe13bf95ff42b1e48b5b5109715feb) ([#8274](https://github.com/yt-dlp/yt-dlp/issues/8274)) by [S-Aarab](https://github.com/S-Aarab) +- **altcensored**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3f90813f0617e0d21302398010de7496c9ae36aa) ([#8291](https://github.com/yt-dlp/yt-dlp/issues/8291)) by [drzraf](https://github.com/drzraf) +- **ard**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/5f009a094f0e8450792b097c4c8273622778052d) ([#8878](https://github.com/yt-dlp/yt-dlp/issues/8878)) by [seproDev](https://github.com/seproDev) +- **ardbetamediathek**: [Fix series extraction](https://github.com/yt-dlp/yt-dlp/commit/1f8bd8eba82ba10ddb49ee7cc0be4540dab103d5) ([#8687](https://github.com/yt-dlp/yt-dlp/issues/8687)) by [lstrojny](https://github.com/lstrojny) +- **bbc** + - [Extract more formats](https://github.com/yt-dlp/yt-dlp/commit/c919b68f7e79ea5010f75f648d3c9e45405a8011) ([#8321](https://github.com/yt-dlp/yt-dlp/issues/8321)) by [barsnick](https://github.com/barsnick), [dirkf](https://github.com/dirkf) + - [Fix JSON parsing 
bug](https://github.com/yt-dlp/yt-dlp/commit/19741ab8a401ec64d5e84fdbfcfb141d105e7bc8) by [bashonly](https://github.com/bashonly) +- **bfmtv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/4903f452b68efb62dadf22e81be8c7934fc743e7) ([#8651](https://github.com/yt-dlp/yt-dlp/issues/8651)) by [bashonly](https://github.com/bashonly) +- **bilibili**: [Support courses and interactive videos](https://github.com/yt-dlp/yt-dlp/commit/9f09bdcfcb8e2b4b2decdc30d35d34b993bc7a94) ([#8343](https://github.com/yt-dlp/yt-dlp/issues/8343)) by [c-basalt](https://github.com/c-basalt) +- **bitchute**: [Fix and improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/b1a1ec1540605d2ea7abdb63336ffb1c56bf6316) ([#8507](https://github.com/yt-dlp/yt-dlp/issues/8507)) by [SirElderling](https://github.com/SirElderling) +- **box**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/5a230233d6fce06f4abd1fce0dc92b948e6f780b) ([#8649](https://github.com/yt-dlp/yt-dlp/issues/8649)) by [bashonly](https://github.com/bashonly) +- **bundestag**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/00a3e47bf5440c96025a76e08337ff2a475ed83e) ([#8783](https://github.com/yt-dlp/yt-dlp/issues/8783)) by [Grub4K](https://github.com/Grub4K) +- **drtv**: [Set default ext for m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/f96ab86cd837b1b5823baa87d144e15322ee9298) ([#8590](https://github.com/yt-dlp/yt-dlp/issues/8590)) by [seproDev](https://github.com/seproDev) +- **duoplay**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/66a0127d45033c698bdbedf162cddc55d9e7b906) ([#8542](https://github.com/yt-dlp/yt-dlp/issues/8542)) by [glensc](https://github.com/glensc) +- **eplus**: [Add login support and DRM detection](https://github.com/yt-dlp/yt-dlp/commit/d5d1517e7d838500800d193ac3234b06e89654cd) ([#8661](https://github.com/yt-dlp/yt-dlp/issues/8661)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **facebook** + - [Fix Memories extraction](https://github.com/yt-dlp/yt-dlp/commit/c39358a54bc6675ae0c50b81024e5a086e41656a) ([#8681](https://github.com/yt-dlp/yt-dlp/issues/8681)) by [kclauhk](https://github.com/kclauhk) + - [Improve subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/9cafb9ff17e14475a35c9a58b5bb010c86c9db4b) ([#8296](https://github.com/yt-dlp/yt-dlp/issues/8296)) by [kclauhk](https://github.com/kclauhk) +- **floatplane**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/628fa244bbce2ad39775a5959e99588f30cac152) ([#8639](https://github.com/yt-dlp/yt-dlp/issues/8639)) by [seproDev](https://github.com/seproDev) +- **francetv**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/71f28097fec1c9e029f74b68a4eadc8915399840) ([#8409](https://github.com/yt-dlp/yt-dlp/issues/8409)) by [Fymyte](https://github.com/Fymyte) +- **instagram**: [Fix stories extraction](https://github.com/yt-dlp/yt-dlp/commit/50eaea9fd7787546b53660e736325fa31c77765d) ([#8843](https://github.com/yt-dlp/yt-dlp/issues/8843)) by [bashonly](https://github.com/bashonly) +- **joqrag**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/db8b4edc7d0bd27da462f6fe82ff6e13e3d68a04) ([#8384](https://github.com/yt-dlp/yt-dlp/issues/8384)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **litv**: [Fix premium content extraction](https://github.com/yt-dlp/yt-dlp/commit/f45c4efcd928a173e1300a8f1ce4258e70c969b1) ([#8842](https://github.com/yt-dlp/yt-dlp/issues/8842)) by [bashonly](https://github.com/bashonly) +- **maariv**: [Add 
extractor](https://github.com/yt-dlp/yt-dlp/commit/c5f01bf7d4b9426c87c3f8248de23934a56579e0) ([#8331](https://github.com/yt-dlp/yt-dlp/issues/8331)) by [amir16yp](https://github.com/amir16yp) +- **mediastream**: [Fix authenticated format extraction](https://github.com/yt-dlp/yt-dlp/commit/b03c89309eb141be1a1eceeeb7475dd3b7529ad9) ([#8657](https://github.com/yt-dlp/yt-dlp/issues/8657)) by [NickCis](https://github.com/NickCis) +- **nebula**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/45d82be65f71bb05506bd55376c6fdb36bc54142) ([#8566](https://github.com/yt-dlp/yt-dlp/issues/8566)) by [elyse0](https://github.com/elyse0), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- **nintendo**: [Fix Nintendo Direct extraction](https://github.com/yt-dlp/yt-dlp/commit/1d24da6c899ef280d8b0a48a5e280ecd5d39cdf4) ([#8609](https://github.com/yt-dlp/yt-dlp/issues/8609)) by [Grub4K](https://github.com/Grub4K) +- **ondemandkorea**: [Fix upgraded format extraction](https://github.com/yt-dlp/yt-dlp/commit/04a5e06350e3ef7c03f94f2f3f90dd96c6411152) ([#8677](https://github.com/yt-dlp/yt-dlp/issues/8677)) by [seproDev](https://github.com/seproDev) +- **pr0gramm**: [Support variant formats and subtitles](https://github.com/yt-dlp/yt-dlp/commit/f98a3305eb124a0c375d03209d5c5a64fe1766c8) ([#8674](https://github.com/yt-dlp/yt-dlp/issues/8674)) by [Grub4K](https://github.com/Grub4K) +- **rinsefm**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c91af948e43570025e4aa887e248fd025abae394) ([#8778](https://github.com/yt-dlp/yt-dlp/issues/8778)) by [hashFactory](https://github.com/hashFactory) +- **rudovideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/0d531c35eca4c2eb36e160530a7a333edbc727cc) ([#8664](https://github.com/yt-dlp/yt-dlp/issues/8664)) by [nicodato](https://github.com/nicodato) +- **theguardian**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/1fa3f24d4b5d22176b11d78420f1f4b64a5af0a8) ([#8535](https://github.com/yt-dlp/yt-dlp/issues/8535)) by [SirElderling](https://github.com/SirElderling) +- **theplatform**: [Extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/7e09c147fdccb44806bbf601573adc4b77210a89) ([#8635](https://github.com/yt-dlp/yt-dlp/issues/8635)) by [trainman261](https://github.com/trainman261) +- **twitcasting**: [Detect livestreams via API and `show` page](https://github.com/yt-dlp/yt-dlp/commit/585d0ed9abcfcb957f2b2684b8ad43c3af160383) ([#8601](https://github.com/yt-dlp/yt-dlp/issues/8601)) by [bashonly](https://github.com/bashonly), [JC-Chung](https://github.com/JC-Chung) +- **twitcastinguser**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/ff2fde1b8f922fd34bae6172602008cd67c07c93) ([#8650](https://github.com/yt-dlp/yt-dlp/issues/8650)) by [bashonly](https://github.com/bashonly) +- **twitter** + - [Extract stale tweets](https://github.com/yt-dlp/yt-dlp/commit/1c54a98e19d047e7c15184237b6ef8ad50af489c) ([#8724](https://github.com/yt-dlp/yt-dlp/issues/8724)) by [bashonly](https://github.com/bashonly) + - [Prioritize m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/e7d22348e77367740da78a3db27167ecf894b7c9) ([#8826](https://github.com/yt-dlp/yt-dlp/issues/8826)) by [bashonly](https://github.com/bashonly) + - [Work around API rate-limit](https://github.com/yt-dlp/yt-dlp/commit/116c268438ea4d3738f6fa502c169081ca8f0ee7) ([#8825](https://github.com/yt-dlp/yt-dlp/issues/8825)) by [bashonly](https://github.com/bashonly) + - broadcast: [Extract 
`concurrent_view_count`](https://github.com/yt-dlp/yt-dlp/commit/6fe82491ed622b948c512cf4aab46ac3a234ae0a) ([#8600](https://github.com/yt-dlp/yt-dlp/issues/8600)) by [sonmezberkay](https://github.com/sonmezberkay) +- **vidly**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/34df1c1f60fa652c0a6a5c712b06c10e45daf6b7) ([#8612](https://github.com/yt-dlp/yt-dlp/issues/8612)) by [seproDev](https://github.com/seproDev) +- **vocaroo**: [Do not use deprecated `getheader`](https://github.com/yt-dlp/yt-dlp/commit/f223b1b0789f65e06619dcc9fc9e74f50d259379) ([#8606](https://github.com/yt-dlp/yt-dlp/issues/8606)) by [qbnu](https://github.com/qbnu) +- **vvvvid**: [Set user-agent to fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1725e943b0e8a8b585305660d4611e684374409c) ([#8615](https://github.com/yt-dlp/yt-dlp/issues/8615)) by [Kyraminol](https://github.com/Kyraminol) +- **youtube** + - [Fix `like_count` extraction](https://github.com/yt-dlp/yt-dlp/commit/6b5d93b0b0240e287389d1d43b2d5293e18aa4cc) ([#8763](https://github.com/yt-dlp/yt-dlp/issues/8763)) by [Ganesh910](https://github.com/Ganesh910) + - [Improve detection of faulty HLS formats](https://github.com/yt-dlp/yt-dlp/commit/bb5a54e6db2422bbd155d93a0e105b6616c09467) ([#8646](https://github.com/yt-dlp/yt-dlp/issues/8646)) by [bashonly](https://github.com/bashonly) + - [Return empty playlist when channel/tab has no videos](https://github.com/yt-dlp/yt-dlp/commit/044886c220620a7679109e92352890e18b6079e3) by [pukkandan](https://github.com/pukkandan) + - [Support cf.piped.video](https://github.com/yt-dlp/yt-dlp/commit/6a9c7a2b52655bacfa7ab2da24fd0d14a6fff495) ([#8514](https://github.com/yt-dlp/yt-dlp/issues/8514)) by [OIRNOIR](https://github.com/OIRNOIR) +- **zingmp3**: [Add support for radio and podcasts](https://github.com/yt-dlp/yt-dlp/commit/64de1a4c25bada90374b88d7353754fe8fbfcc51) ([#7189](https://github.com/yt-dlp/yt-dlp/issues/7189)) by [hatienl0i261299](https://github.com/hatienl0i261299) + +#### Postprocessor changes +- **ffmpegmetadata**: [Embed stream metadata in single format downloads](https://github.com/yt-dlp/yt-dlp/commit/deeb13eae82e60f82a2c0c5861f460399a997528) ([#8647](https://github.com/yt-dlp/yt-dlp/issues/8647)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- [Strip whitespace around header values](https://github.com/yt-dlp/yt-dlp/commit/196eb0fe77b78e2e5ca02c506c3837c2b1a7964c) ([#8802](https://github.com/yt-dlp/yt-dlp/issues/8802)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler**: websockets: [Migrate websockets to networking framework](https://github.com/yt-dlp/yt-dlp/commit/ccfd70f4c24b579c72123ca76ab50164f8f122b7) ([#7720](https://github.com/yt-dlp/yt-dlp/issues/7720)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **ci** + - [Concurrency optimizations](https://github.com/yt-dlp/yt-dlp/commit/f124fa458826308afc86cf364c509f857686ecfd) ([#8614](https://github.com/yt-dlp/yt-dlp/issues/8614)) by [Grub4K](https://github.com/Grub4K) + - [Run core tests only for core changes](https://github.com/yt-dlp/yt-dlp/commit/13b3cb3c2b7169a1e17d6fc62593bf744170521c) ([#8841](https://github.com/yt-dlp/yt-dlp/issues/8841)) by [Grub4K](https://github.com/Grub4K) +- **cleanup** + - [Fix spelling of `IE_NAME`](https://github.com/yt-dlp/yt-dlp/commit/bc4ab17b38f01000d99c5c2bedec89721fee65ec) ([#8810](https://github.com/yt-dlp/yt-dlp/issues/8810)) by [barsnick](https://github.com/barsnick) + - [Remove dead extractors](https://github.com/yt-dlp/yt-dlp/commit/9751a457cfdb18bf99d9ee0d10e4e6a594502bbf) ([#8604](https://github.com/yt-dlp/yt-dlp/issues/8604)) by [seproDev](https://github.com/seproDev) + - Miscellaneous: [f9fb3ce](https://github.com/yt-dlp/yt-dlp/commit/f9fb3ce86e3c6a0c3c33b45392b8d7288bceba76) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- **devscripts**: `run_tests`: [Create Python script](https://github.com/yt-dlp/yt-dlp/commit/2d1d683a541d71f3d3bb999dfe8eeb1976fb91ce) ([#8720](https://github.com/yt-dlp/yt-dlp/issues/8720)) by [Grub4K](https://github.com/Grub4K) (With fixes in [225cf2b](https://github.com/yt-dlp/yt-dlp/commit/225cf2b830a1de2c5eacd257edd2a01aed1e1114)) +- **docs**: [Update youtube-dl merge commit in `README.md`](https://github.com/yt-dlp/yt-dlp/commit/f10589e3453009bb523f55849bba144c9b91cf2a) by [bashonly](https://github.com/bashonly) +- **test**: networking: [Update tests for OpenSSL 3.2](https://github.com/yt-dlp/yt-dlp/commit/37755a037e612bfc608c3d4722e8ef2ce6a022ee) ([#8814](https://github.com/yt-dlp/yt-dlp/issues/8814)) by [bashonly](https://github.com/bashonly) + +### 2023.11.16 + +#### Extractor changes +- **abc.net.au**: iview, showseries: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/15cb3528cbda7b6198f49a6b5953c226d701696b) ([#8586](https://github.com/yt-dlp/yt-dlp/issues/8586)) by [bashonly](https://github.com/bashonly) +- **beatbump**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/21dc069bea2d4d99345dd969e098f4535c751d45) ([#8576](https://github.com/yt-dlp/yt-dlp/issues/8576)) by [seproDev](https://github.com/seproDev) +- **dailymotion**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/a489f071508ec5caf5f32052d142afe86c28df7a) ([#7692](https://github.com/yt-dlp/yt-dlp/issues/7692)) by [TravisDupes](https://github.com/TravisDupes) +- **drtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0783fd558ed0d3a8bc754beb75a406256f8b97b2) ([#8484](https://github.com/yt-dlp/yt-dlp/issues/8484)) by [almx](https://github.com/almx), [seproDev](https://github.com/seproDev) +- **eltrecetv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/dcfad52812aa8ce007cefbfbe63f58b49f6b1046) ([#8216](https://github.com/yt-dlp/yt-dlp/issues/8216)) by [elivinsky](https://github.com/elivinsky) +- **jiosaavn**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b530118e7f48232cacf8050d79a6b20bdfcf5468) ([#8307](https://github.com/yt-dlp/yt-dlp/issues/8307)) by [awalgarg](https://github.com/awalgarg) +- **njpwworld**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/e569c2d1f4b665795a2b64f0aaf7f76930664233) ([#8570](https://github.com/yt-dlp/yt-dlp/issues/8570)) by [aarubui](https://github.com/aarubui) +- **tv5mondeplus**: 
[Extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/0f634dba3afdc429ece8839b02f6d56c27b7973a) ([#4209](https://github.com/yt-dlp/yt-dlp/issues/4209)) by [FrankZ85](https://github.com/FrankZ85) +- **twitcasting**: [Fix livestream detection](https://github.com/yt-dlp/yt-dlp/commit/2325d03aa7bb80f56ba52cd6992258e44727b424) ([#8574](https://github.com/yt-dlp/yt-dlp/issues/8574)) by [JC-Chung](https://github.com/JC-Chung) +- **zenyandex**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5efe68b73cbf6e907c2e6a3aa338664385084184) ([#8454](https://github.com/yt-dlp/yt-dlp/issues/8454)) by [starius](https://github.com/starius) + +#### Misc. changes +- **build**: [Make `secretstorage` an optional dependency](https://github.com/yt-dlp/yt-dlp/commit/24f827875c6ba513f12ed09a3aef2bbed223760d) ([#8585](https://github.com/yt-dlp/yt-dlp/issues/8585)) by [bashonly](https://github.com/bashonly) + +### 2023.11.14 + +#### Important changes +- **The release channels have been adjusted!** + * [`master`](https://github.com/yt-dlp/yt-dlp-master-builds) builds are made after each push, containing the latest fixes (but also possibly bugs). This was previously the `nightly` channel. + * [`nightly`](https://github.com/yt-dlp/yt-dlp-nightly-builds) builds are now made once a day, if there were any changes. +- Security: [[CVE-2023-46121](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-46121)] Patch [Generic Extractor MITM Vulnerability via Arbitrary Proxy Injection](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3ch3-jhc6-5r8x) + - Disallow smuggling of arbitrary `http_headers`; extractors now only use specific headers + +#### Core changes +- [Add `--compat-option manifest-filesize-approx`](https://github.com/yt-dlp/yt-dlp/commit/10025b715ea01489557eb2c5a3cc04d361fcdb52) ([#8356](https://github.com/yt-dlp/yt-dlp/issues/8356)) by [bashonly](https://github.com/bashonly) +- [Fix format sorting with `--load-info-json`](https://github.com/yt-dlp/yt-dlp/commit/595ea4a99b726b8fe9463e7853b7053978d0544e) ([#8521](https://github.com/yt-dlp/yt-dlp/issues/8521)) by [bashonly](https://github.com/bashonly) +- [Include build origin in verbose output](https://github.com/yt-dlp/yt-dlp/commit/20314dd46f25e0e0a7e985a7804049aefa8b909f) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- [Only ensure playlist thumbnail dir if writing thumbs](https://github.com/yt-dlp/yt-dlp/commit/a40e0b37dfc8c26916b0e01aa3f29f3bc42250b6) ([#8373](https://github.com/yt-dlp/yt-dlp/issues/8373)) by [bashonly](https://github.com/bashonly) +- **update**: [Overhaul self-updater](https://github.com/yt-dlp/yt-dlp/commit/0b6ad22e6a432006a75df968f0283e6c6b3cfae6) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Do not smuggle `http_headers`](https://github.com/yt-dlp/yt-dlp/commit/f04b5bedad7b281bee9814686bba1762bae092eb) by [coletdjnz](https://github.com/coletdjnz) +- [Do not test truth value of `xml.etree.ElementTree.Element`](https://github.com/yt-dlp/yt-dlp/commit/d4f14a72dc1dd79396e0e80980268aee902b61e4) ([#8582](https://github.com/yt-dlp/yt-dlp/issues/8582)) by [bashonly](https://github.com/bashonly) +- **brilliantpala**: [Fix cookies support](https://github.com/yt-dlp/yt-dlp/commit/9b5bedf13a3323074daceb0ec6ebb3cc6e0b9684) ([#8352](https://github.com/yt-dlp/yt-dlp/issues/8352)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **generic**: [Improve direct video link ext 
detection](https://github.com/yt-dlp/yt-dlp/commit/4ce2f29a50fcfb9920e6f2ffe42192945a2bad7e) ([#8340](https://github.com/yt-dlp/yt-dlp/issues/8340)) by [bashonly](https://github.com/bashonly) +- **laxarxames**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/312a2d1e8bc247264f9d85c5ec764e33aa0133b5) ([#8412](https://github.com/yt-dlp/yt-dlp/issues/8412)) by [aniolpages](https://github.com/aniolpages) +- **n-tv.de**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/8afd9468b0c822843bc480d366d1c86698daabfb) ([#8414](https://github.com/yt-dlp/yt-dlp/issues/8414)) by [1100101](https://github.com/1100101) +- **neteasemusic**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/46acc418a53470b7f32581b3309c3cb87aa8488d) ([#8531](https://github.com/yt-dlp/yt-dlp/issues/8531)) by [LoserFox](https://github.com/LoserFox) +- **nhk**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/54579be4364e148277c32e20a5c3efc2c3f52f5b) ([#8388](https://github.com/yt-dlp/yt-dlp/issues/8388)) by [garret1317](https://github.com/garret1317) +- **novaembed**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3ff494f6f41c27549420fa88be27555bd449ffdc) ([#8368](https://github.com/yt-dlp/yt-dlp/issues/8368)) by [peci1](https://github.com/peci1) +- **npo**: [Send `POST` request to streams API endpoint](https://github.com/yt-dlp/yt-dlp/commit/8e02a4dcc800f9444e9d461edc41edd7b662f435) ([#8413](https://github.com/yt-dlp/yt-dlp/issues/8413)) by [bartbroere](https://github.com/bartbroere) +- **ondemandkorea**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/05adfd883a4f2ecae0267e670a62a2e45c351aeb) ([#8386](https://github.com/yt-dlp/yt-dlp/issues/8386)) by [seproDev](https://github.com/seproDev) +- **orf**: podcast: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6ba3085616652cbf05d1858efc321fdbfc4c6119) ([#8486](https://github.com/yt-dlp/yt-dlp/issues/8486)) by [Esokrates](https://github.com/Esokrates) +- **polskieradio**: audition: [Fix playlist extraction](https://github.com/yt-dlp/yt-dlp/commit/464327acdb353ceb91d2115163a5a9621b22fe0d) ([#8459](https://github.com/yt-dlp/yt-dlp/issues/8459)) by [shubhexists](https://github.com/shubhexists) +- **qdance**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/177f0d963e4b9db749805c482e6f288354c8be84) ([#8426](https://github.com/yt-dlp/yt-dlp/issues/8426)) by [bashonly](https://github.com/bashonly) +- **radiocomercial**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/ef12dbdcd3e7264bd3d744c1e3107597bd23ad35) ([#8508](https://github.com/yt-dlp/yt-dlp/issues/8508)) by [SirElderling](https://github.com/SirElderling) +- **sbs.co.kr**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/25a4bd345a0dcfece6fef752d4537eb403da94d9) ([#8326](https://github.com/yt-dlp/yt-dlp/issues/8326)) by [seproDev](https://github.com/seproDev) +- **theatercomplextown**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/2863fcf2b6876d0c7965ff7d6d9242eea653dc6b) ([#8560](https://github.com/yt-dlp/yt-dlp/issues/8560)) by [bashonly](https://github.com/bashonly) +- **thisav**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/cb480e390d85fb3a598c1b6d5eef3438ce729fc9) ([#8346](https://github.com/yt-dlp/yt-dlp/issues/8346)) by [bashonly](https://github.com/bashonly) +- **thisoldhouse**: [Add login support](https://github.com/yt-dlp/yt-dlp/commit/c76c96677ff6a056f5844a568ef05ee22c46d6f4) ([#8561](https://github.com/yt-dlp/yt-dlp/issues/8561)) by [bashonly](https://github.com/bashonly) +- 
+- **twitcasting**: [Fix livestream extraction](https://github.com/yt-dlp/yt-dlp/commit/7b8b1cf5eb8bf44ce70bc24e1f56f0dba2737e98) ([#8427](https://github.com/yt-dlp/yt-dlp/issues/8427)) by [JC-Chung](https://github.com/JC-Chung), [saintliao](https://github.com/saintliao)
+- **twitter**
+    - broadcast
+        - [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/7d337ca977d73a0a6c07ab481ed8faa8f6ff8726) ([#8383](https://github.com/yt-dlp/yt-dlp/issues/8383)) by [HitomaruKonpaku](https://github.com/HitomaruKonpaku)
+        - [Support `--wait-for-video`](https://github.com/yt-dlp/yt-dlp/commit/f6e97090d2ed9e05441ab0f4bec3559b816d7a00) ([#8475](https://github.com/yt-dlp/yt-dlp/issues/8475)) by [bashonly](https://github.com/bashonly)
+- **weibo**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/15b252dfd2c6807fe57afc5a95e59abadb32ccd2) ([#8463](https://github.com/yt-dlp/yt-dlp/issues/8463)) by [c-basalt](https://github.com/c-basalt)
+- **weverse**: [Fix login error handling](https://github.com/yt-dlp/yt-dlp/commit/4a601c9eff9fb42e24a4c8da3fa03628e035b35b) ([#8458](https://github.com/yt-dlp/yt-dlp/issues/8458)) by [seproDev](https://github.com/seproDev)
+- **youtube**: [Check newly uploaded iOS HLS formats](https://github.com/yt-dlp/yt-dlp/commit/ef79d20dc9d27ac002a7196f073b37f2f2721aed) ([#8336](https://github.com/yt-dlp/yt-dlp/issues/8336)) by [bashonly](https://github.com/bashonly)
+- **zoom**: [Extract combined view formats](https://github.com/yt-dlp/yt-dlp/commit/3906de07551fedb00b789345bf24cc27d6ddf128) ([#7847](https://github.com/yt-dlp/yt-dlp/issues/7847)) by [Mipsters](https://github.com/Mipsters)
+
+#### Downloader changes
+- **aria2c**: [Remove duplicate `--file-allocation=none`](https://github.com/yt-dlp/yt-dlp/commit/21b25281c51523620706b11bfc1c4a889858e1f2) ([#8332](https://github.com/yt-dlp/yt-dlp/issues/8332)) by [CrendKing](https://github.com/CrendKing)
+- **dash**: [Force native downloader for `--live-from-start`](https://github.com/yt-dlp/yt-dlp/commit/2622c804d1a5accc3045db398e0fc52074f4bdb3) ([#8339](https://github.com/yt-dlp/yt-dlp/issues/8339)) by [bashonly](https://github.com/bashonly)
+
+#### Networking changes
+- **Request Handler**: requests: [Add handler for `requests` HTTP library (#3668)](https://github.com/yt-dlp/yt-dlp/commit/8a8b54523addf46dfd50ef599761a81bc22362e6) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K) (With fixes in [4e38e2a](https://github.com/yt-dlp/yt-dlp/commit/4e38e2ae9d7380015349e6aee59c78bb3938befd))
+
+    Adds support for HTTPS proxies and persistent connections (keep-alive)
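+
+A minimal sketch of what the new handler enables, assuming the `requests` package is installed; the proxy URL is a placeholder:
+
+```python
+import yt_dlp
+
+opts = {
+    # An https:// proxy scheme becomes usable with the requests handler
+    "proxy": "https://127.0.0.1:3128",
+    # Verbose output lists the active handlers, e.g. "Request Handlers: urllib, requests"
+    "verbose": True,
+}
+with yt_dlp.YoutubeDL(opts) as ydl:
+    # Connections to the same host are reused (keep-alive) across requests
+    ydl.extract_info("https://www.youtube.com/watch?v=BaW_jenozKc", download=False)
+```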
+
+#### Misc. changes
+- **build**
+    - [Include secretstorage in Linux builds](https://github.com/yt-dlp/yt-dlp/commit/9970d74c8383432c6c8779aa47d3253dcf412b14) by [bashonly](https://github.com/bashonly)
+    - [Overhaul and unify release workflow](https://github.com/yt-dlp/yt-dlp/commit/1d03633c5a1621b9f3a756f0a4f9dc61fab3aeaa) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+- **ci**
+    - [Bump `actions/checkout` to v4](https://github.com/yt-dlp/yt-dlp/commit/5438593a35b7b042fc48fe29cad0b9039f07c9bb) by [bashonly](https://github.com/bashonly)
+    - [Run core tests with dependencies](https://github.com/yt-dlp/yt-dlp/commit/700444c23ddb65f618c2abd942acdc0c58c650b1) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz)
+- **cleanup**
+    - [Fix changelog typo](https://github.com/yt-dlp/yt-dlp/commit/a9d3f4b20a3533d2a40104c85bc2cc6c2564c800) by [bashonly](https://github.com/bashonly)
+    - [Update documentation for master and nightly channels](https://github.com/yt-dlp/yt-dlp/commit/a00af29853b8c7350ce086f4cab8c2c9cf2fcf1d) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+    - Miscellaneous: [b012271](https://github.com/yt-dlp/yt-dlp/commit/b012271d01b59759e4eefeab0308698cd9e7224c) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev)
+- **test**: update: [Implement simple updater unit tests](https://github.com/yt-dlp/yt-dlp/commit/87264d4fdadcddd91289b968dd0e4bf58d449267) by [bashonly](https://github.com/bashonly)
+
+### 2023.10.13
+
+#### Core changes
+- [Ensure thumbnail output directory exists](https://github.com/yt-dlp/yt-dlp/commit/2acd1d555ef89851c73773776715d3de9a0e30b9) ([#7985](https://github.com/yt-dlp/yt-dlp/issues/7985)) by [Riteo](https://github.com/Riteo)
+- **utils**
+    - `js_to_json`: [Fix `Date` constructor parsing](https://github.com/yt-dlp/yt-dlp/commit/9d7ded6419089c1bf252496073f73ad90ed71004) ([#8295](https://github.com/yt-dlp/yt-dlp/issues/8295)) by [awalgarg](https://github.com/awalgarg), [Grub4K](https://github.com/Grub4K)
+    - `write_xattr`: [Use `os.setxattr` if available](https://github.com/yt-dlp/yt-dlp/commit/84e26038d4002e763ea51ca1bdce4f7e63c540bf) ([#8205](https://github.com/yt-dlp/yt-dlp/issues/8205)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K)
+
+#### Extractor changes
+- **artetv**: [Support age-restricted content](https://github.com/yt-dlp/yt-dlp/commit/09f815ad52843219a7ee3f2a0dddf6c250c91f0c) ([#8301](https://github.com/yt-dlp/yt-dlp/issues/8301)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier)
+- **jtbc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b286ec68f1f28798b3e371f888a2ed97d399cf77) ([#8314](https://github.com/yt-dlp/yt-dlp/issues/8314)) by [seproDev](https://github.com/seproDev)
+- **mbn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e030b6b6fba7b2f4614ad2ab9f7649d40a2dd305) ([#8312](https://github.com/yt-dlp/yt-dlp/issues/8312)) by [seproDev](https://github.com/seproDev)
+- **nhk**: [Fix Japanese-language VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/4de94b9e165bfd6421a692f5f2eabcdb08edcb71) ([#8309](https://github.com/yt-dlp/yt-dlp/issues/8309)) by [garret1317](https://github.com/garret1317)
+- **radiko**: [Fix bug with `downloader_options`](https://github.com/yt-dlp/yt-dlp/commit/b9316642313bbc9e209ac0d2276d37ba60bceb49) by [bashonly](https://github.com/bashonly)
+- **tenplay**: [Add support for seasons](https://github.com/yt-dlp/yt-dlp/commit/88a99c87b680ae59002534a517e191f46c42cbd4) ([#7939](https://github.com/yt-dlp/yt-dlp/issues/7939)) by [midnightveil](https://github.com/midnightveil)
+- **youku**: [Improve tudou.com support](https://github.com/yt-dlp/yt-dlp/commit/b7098d46b552a9322c6cea39ba80be5229f922de) ([#8160](https://github.com/yt-dlp/yt-dlp/issues/8160)) by [naginatana](https://github.com/naginatana)
+- **youtube**: [Fix bug with `--extractor-retries inf`](https://github.com/yt-dlp/yt-dlp/commit/feebf6d02fc9651331eee2af5e08e6112288163b) ([#8328](https://github.com/yt-dlp/yt-dlp/issues/8328)) by [Grub4K](https://github.com/Grub4K)
+
+#### Downloader changes
+- **fragment**: [Improve progress calculation](https://github.com/yt-dlp/yt-dlp/commit/1c51c520f7b511ebd9e4eb7322285a8c31eedbbd) ([#8241](https://github.com/yt-dlp/yt-dlp/issues/8241)) by [Grub4K](https://github.com/Grub4K)
+
+#### Misc. changes
+- **cleanup**: Miscellaneous: [b634ba7](https://github.com/yt-dlp/yt-dlp/commit/b634ba742d8f38ce9ecfa0546485728b0c6c59d1) by [bashonly](https://github.com/bashonly), [gamer191](https://github.com/gamer191)
+
+### 2023.10.07
+
+#### Extractor changes
+- **abc.net.au**: iview: [Improve `episode` extraction](https://github.com/yt-dlp/yt-dlp/commit/a9efb4b8d74f3583450ffda0ee57259a47d39c70) ([#8201](https://github.com/yt-dlp/yt-dlp/issues/8201)) by [xofe](https://github.com/xofe)
+- **erocast**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/47c598783c98c179e04dd12c2a3fee0f3dc53087) ([#8264](https://github.com/yt-dlp/yt-dlp/issues/8264)) by [madewokherd](https://github.com/madewokherd)
+- **gofile**: [Fix token cookie bug](https://github.com/yt-dlp/yt-dlp/commit/0730d5a966fa8a937d84bfb7f68be5198acb039b) by [bashonly](https://github.com/bashonly)
+- **iq.com**: [Fix extraction and subtitles](https://github.com/yt-dlp/yt-dlp/commit/35d9cbaf9638ccc9daf8a863063b2e7c135bc664) ([#8260](https://github.com/yt-dlp/yt-dlp/issues/8260)) by [AS6939](https://github.com/AS6939)
+- **lbry**
+    - [Add playlist support](https://github.com/yt-dlp/yt-dlp/commit/48cceec1ddb8649b5e771df8df79eb9c39c82b90) ([#8213](https://github.com/yt-dlp/yt-dlp/issues/8213)) by [bashonly](https://github.com/bashonly), [drzraf](https://github.com/drzraf), [Grub4K](https://github.com/Grub4K)
+    - [Extract `uploader_id`](https://github.com/yt-dlp/yt-dlp/commit/0e722f2f3ca42e634fd7b06ee70b16bf833ce132) ([#8244](https://github.com/yt-dlp/yt-dlp/issues/8244)) by [drzraf](https://github.com/drzraf)
+- **litv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/91a670a4f7babe9c8aa2018f57d8c8952a6f49d8) ([#7785](https://github.com/yt-dlp/yt-dlp/issues/7785)) by [jiru](https://github.com/jiru)
+- **neteasemusic**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/f980df734cf5c0eaded2f7b38c6c60bccfeebb48) ([#8181](https://github.com/yt-dlp/yt-dlp/issues/8181)) by [c-basalt](https://github.com/c-basalt)
+- **nhk**: [Fix VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/e831c80e8b2fc025b3b67d82974cc59e3526fdc8) ([#8249](https://github.com/yt-dlp/yt-dlp/issues/8249)) by [garret1317](https://github.com/garret1317)
+- **radiko**: [Improve extraction](https://github.com/yt-dlp/yt-dlp/commit/2ad3873f0dfa9285c91d2160e36c039e69d597c7) ([#8221](https://github.com/yt-dlp/yt-dlp/issues/8221)) by [garret1317](https://github.com/garret1317)
+- **substack**
+    - [Fix download cookies bug](https://github.com/yt-dlp/yt-dlp/commit/2f2dda3a7e85148773da3cdbc03ac9949ec1bc45) ([#8219](https://github.com/yt-dlp/yt-dlp/issues/8219)) by [handlerug](https://github.com/handlerug)
+    - [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/fbcc299bd8a19cf8b3c8805d6c268a9110230973) ([#8218](https://github.com/yt-dlp/yt-dlp/issues/8218)) by [handlerug](https://github.com/handlerug)
+- **theta**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/792f1e64f6a2beac51e85408d142b3118115c4fd) ([#8251](https://github.com/yt-dlp/yt-dlp/issues/8251)) by [alerikaisattera](https://github.com/alerikaisattera)
+- **wrestleuniversevod**: [Call API with device ID](https://github.com/yt-dlp/yt-dlp/commit/b095fd3fa9d58a65dc9b830bd63b9d909422aa86) ([#8272](https://github.com/yt-dlp/yt-dlp/issues/8272)) by [bashonly](https://github.com/bashonly)
+- **xhamster**: user: [Support creator URLs](https://github.com/yt-dlp/yt-dlp/commit/cc8d8441524ec3442d7c0d3f8f33f15b66aa06f3) ([#8232](https://github.com/yt-dlp/yt-dlp/issues/8232)) by [Grub4K](https://github.com/Grub4K)
+- **youtube**
+    - [Fix `heatmap` extraction](https://github.com/yt-dlp/yt-dlp/commit/03e85ea99db76a2fddb65bf46f8819bda780aaf3) ([#8299](https://github.com/yt-dlp/yt-dlp/issues/8299)) by [bashonly](https://github.com/bashonly)
+    - [Raise a warning for `Incomplete Data` instead of an error](https://github.com/yt-dlp/yt-dlp/commit/eb5bdbfa70126c7d5355cc0954b63720522e462c) ([#8238](https://github.com/yt-dlp/yt-dlp/issues/8238)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Misc. changes
+- **cleanup**
+    - [Update extractor tests](https://github.com/yt-dlp/yt-dlp/commit/19c90e405b4137c06dfe6f9aaa02396df0da93e5) ([#7718](https://github.com/yt-dlp/yt-dlp/issues/7718)) by [trainman261](https://github.com/trainman261)
+    - Miscellaneous: [377e85a](https://github.com/yt-dlp/yt-dlp/commit/377e85a1797db9e98b78b38203ed9d4ded229991) by [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K)
+
+### 2023.09.24
+
+#### Important changes
+- **The minimum *recommended* Python version has been raised to 3.8**
+Since Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803)
+- Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg)
+    - The shell escape function now uses `""` instead of `\"`.
+    - `utils.Popen` has been patched to properly quote commands.
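+
+For context on the `""` vs `\"` change, a standalone illustration (not yt-dlp's actual code): cmd.exe does not treat backslash as an escape character, so a `\"`-escaped quote can terminate the quoted argument early and let hostile metadata inject commands, while a doubled quote keeps cmd.exe in quoted mode:
+
+```python
+def escape_old(s: str) -> str:
+    # Vulnerable style: backslash-escaped quotes
+    return '"' + s.replace('"', '\\"') + '"'
+
+def escape_new(s: str) -> str:
+    # Fixed style: doubled quotes, as the advisory describes
+    return '"' + s.replace('"', '""') + '"'
+
+hostile_title = 'x" & calc.exe & "'  # crafted video title
+print(escape_old(hostile_title))  # "x\" & calc.exe & \"" -> cmd.exe may run calc.exe
+print(escape_new(hostile_title))  # "x"" & calc.exe & """ -> stays a single argument
+```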
+
+#### Core changes
+- [Fix HTTP headers and cookie handling](https://github.com/yt-dlp/yt-dlp/commit/6c5211cebeacfc53ad5d5ddf4a659be76039656f) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+- [Fix `--check-formats`](https://github.com/yt-dlp/yt-dlp/commit/8cb7fc44db010e965d808ee679ef0725cb6e147c) by [pukkandan](https://github.com/pukkandan)
+- [Fix support for upcoming Python 3.12](https://github.com/yt-dlp/yt-dlp/commit/836e06d246512f286f30c1371b2c54b72c9ecd93) ([#8130](https://github.com/yt-dlp/yt-dlp/issues/8130)) by [Grub4K](https://github.com/Grub4K)
+- [Merged with youtube-dl 66ab08](https://github.com/yt-dlp/yt-dlp/commit/9d6254069c75877bc88bc3584f4326fb1853a543) by [coletdjnz](https://github.com/coletdjnz)
+- [Prevent RCE when using `--exec` with `%q` (CVE-2023-40581)](https://github.com/yt-dlp/yt-dlp/commit/de015e930747165dbb8fcd360f8775fd973b7d6e) by [Grub4K](https://github.com/Grub4K)
+- [Raise minimum recommended Python version to 3.8](https://github.com/yt-dlp/yt-dlp/commit/61bdf15fc7400601c3da1aa7a43917310a5bf391) ([#8183](https://github.com/yt-dlp/yt-dlp/issues/8183)) by [Grub4K](https://github.com/Grub4K)
+- [`FFmpegFixupM3u8PP` may need to run with ffmpeg](https://github.com/yt-dlp/yt-dlp/commit/f73c11803579889dc8e1c99e25dba9a22fef39d8) by [pukkandan](https://github.com/pukkandan)
+- **compat**
+    - [Add `types.NoneType`](https://github.com/yt-dlp/yt-dlp/commit/e0c4db04dc82a699bdabd9821ddc239ebe17d30a) by [pukkandan](https://github.com/pukkandan) (With fixes in [25b6e8f](https://github.com/yt-dlp/yt-dlp/commit/25b6e8f94679b4458550702b46e61249b875a4fd))
+    - [Deprecate old functions](https://github.com/yt-dlp/yt-dlp/commit/3d2623a898196640f7cc0fc8b70118ff19e6925d) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan)
+    - [Ensure submodules are imported correctly](https://github.com/yt-dlp/yt-dlp/commit/a250b247334ce9f641e709cbb64974da6034a2b3) by [pukkandan](https://github.com/pukkandan)
+- **cookies**: [Containers JSON should be opened as utf-8](https://github.com/yt-dlp/yt-dlp/commit/dab87ca23650fd87184ff5286b53e6985b59f71d) ([#7800](https://github.com/yt-dlp/yt-dlp/issues/7800)) by [bashonly](https://github.com/bashonly)
+- **dependencies**: [Handle deprecation of `sqlite3.version`](https://github.com/yt-dlp/yt-dlp/commit/35f9a306e6934793cff100200cd03f288ec33f11) ([#8167](https://github.com/yt-dlp/yt-dlp/issues/8167)) by [bashonly](https://github.com/bashonly)
+- **outtmpl**: [Fix replacement for `playlist_index`](https://github.com/yt-dlp/yt-dlp/commit/a264433c9fba147ecae2420091614186cfeeb895) by [pukkandan](https://github.com/pukkandan)
+- **utils**
+    - [Add temporary shim for logging](https://github.com/yt-dlp/yt-dlp/commit/1b392f905d20ef1f1b300b180f867d43c9ce49b8) by [pukkandan](https://github.com/pukkandan)
+    - [Improve `parse_duration`](https://github.com/yt-dlp/yt-dlp/commit/af86873218c24c3859ccf575a87f2b00a73b49d0) by [bashonly](https://github.com/bashonly)
+    - HTTPHeaderDict: [Handle byte values](https://github.com/yt-dlp/yt-dlp/commit/3f7965105d8d2048359e67c1e8b8ebd51588143b) by [pukkandan](https://github.com/pukkandan)
+    - `clean_podcast_url`: [Handle more trackers](https://github.com/yt-dlp/yt-dlp/commit/2af4eeb77246b8183aae75a0a8d19f18c08115b2) ([#7556](https://github.com/yt-dlp/yt-dlp/issues/7556)) by [bashonly](https://github.com/bashonly), [mabdelfattah](https://github.com/mabdelfattah)
+    - `js_to_json`: [Handle `Array` objects](https://github.com/yt-dlp/yt-dlp/commit/52414d64ca7b92d3f83964cdd68247989b0c4625) by [Grub4K](https://github.com/Grub4K), [std-move](https://github.com/std-move)
+
+#### Extractor changes
+- [Extract subtitles from SMIL manifests](https://github.com/yt-dlp/yt-dlp/commit/550e65410a7a1b105923494ac44460a4dc1a15d9) ([#7667](https://github.com/yt-dlp/yt-dlp/issues/7667)) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan)
+- [Fix `--load-pages`](https://github.com/yt-dlp/yt-dlp/commit/81b4712bca608b9015aa68a4d96661d56e9cb894) by [pukkandan](https://github.com/pukkandan)
+- [Make `_search_nuxt_data` more lenient](https://github.com/yt-dlp/yt-dlp/commit/904a19ee93195ce0bd4b08bd22b186120afb5b17) by [std-move](https://github.com/std-move)
+- **abematv**
+    - [Fix proxy handling](https://github.com/yt-dlp/yt-dlp/commit/497bbbbd7328cb705f70eced94dbd90993819a46) ([#8046](https://github.com/yt-dlp/yt-dlp/issues/8046)) by [SevenLives](https://github.com/SevenLives)
+    - [Temporary fix for protocol handler](https://github.com/yt-dlp/yt-dlp/commit/9f66247289b9f8ecf931833b3f5f127274dd2161) by [pukkandan](https://github.com/pukkandan)
+- **amazonminitv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/538d37671a17e0782d17f08df17800e2e3bd57c8) by [bashonly](https://github.com/bashonly), [GautamMKGarg](https://github.com/GautamMKGarg)
+- **antenna**: [Support antenna.gr](https://github.com/yt-dlp/yt-dlp/commit/665876034c8d3c031443f6b4958bed02ccdf4164) ([#7584](https://github.com/yt-dlp/yt-dlp/issues/7584)) by [stdedos](https://github.com/stdedos)
+- **artetv**: [Fix HLS formats extraction](https://github.com/yt-dlp/yt-dlp/commit/c2da0b5ea215298135f76e3dc14b972a3c4afacb) by [bashonly](https://github.com/bashonly)
+- **axs**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/aee6b9b88c0bcccf27fd23b7e00fc0b7b168928f) ([#8094](https://github.com/yt-dlp/yt-dlp/issues/8094)) by [barsnick](https://github.com/barsnick)
+- **banbye**: [Support video IDs containing a hyphen](https://github.com/yt-dlp/yt-dlp/commit/578a82e497502b951036ce9da6fe0dac6937ac27) ([#8059](https://github.com/yt-dlp/yt-dlp/issues/8059)) by [kshitiz305](https://github.com/kshitiz305)
+- **bbc**: [Extract tracklist as chapters](https://github.com/yt-dlp/yt-dlp/commit/eda0e415d26eb084e570cf5372d38ee1f616b70f) ([#7788](https://github.com/yt-dlp/yt-dlp/issues/7788)) by [garret1317](https://github.com/garret1317)
+- **bild.de**: [Extract HLS formats](https://github.com/yt-dlp/yt-dlp/commit/b4c1c408c63724339eb12b16c91b253a7ee62cfa) ([#8032](https://github.com/yt-dlp/yt-dlp/issues/8032)) by [barsnick](https://github.com/barsnick)
+- **bilibili**
+    - [Add support for series, favorites and watch later](https://github.com/yt-dlp/yt-dlp/commit/9e68747f9607f05e92bb7d9b6e79d678b50070e1) ([#7518](https://github.com/yt-dlp/yt-dlp/issues/7518)) by [c-basalt](https://github.com/c-basalt)
+    - [Extract Dolby audio formats](https://github.com/yt-dlp/yt-dlp/commit/b84fda7388dd20d38921e23b469147f3957c1812) ([#8142](https://github.com/yt-dlp/yt-dlp/issues/8142)) by [ClosedPort22](https://github.com/ClosedPort22)
+    - [Extract `format_id`](https://github.com/yt-dlp/yt-dlp/commit/5336bf57a7061e0955a37f0542fc8ebf50d55b17) ([#7555](https://github.com/yt-dlp/yt-dlp/issues/7555)) by [c-basalt](https://github.com/c-basalt)
+- **bilibilibangumi**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/bdd0b75e3f41ff35440eda6d395008beef19ef2f) ([#7337](https://github.com/yt-dlp/yt-dlp/issues/7337)) by [GD-Slime](https://github.com/GD-Slime)
+- **bpb**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/f659e6439444ac64305b5c80688cd82f59d2279c) ([#8119](https://github.com/yt-dlp/yt-dlp/issues/8119)) by [Grub4K](https://github.com/Grub4K)
+- **brilliantpala**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/92feb5654c5a4c81ba872904a618700fcbb3e546) ([#6680](https://github.com/yt-dlp/yt-dlp/issues/6680)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **canal1, caracoltvplay**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b3febedbeb662dfdf9b5c1d5799039ad4fc969de) ([#7151](https://github.com/yt-dlp/yt-dlp/issues/7151)) by [elyse0](https://github.com/elyse0)
+- **cbc**: [Ignore any 426 from API](https://github.com/yt-dlp/yt-dlp/commit/9bf14be775289bd88cc1f5c89fd761ae51879484) ([#7689](https://github.com/yt-dlp/yt-dlp/issues/7689)) by [makew0rld](https://github.com/makew0rld)
+- **cbcplayer**: [Extract HLS formats and subtitles](https://github.com/yt-dlp/yt-dlp/commit/339c339fec095ff4141b20e6aa83629117fb26df) ([#7484](https://github.com/yt-dlp/yt-dlp/issues/7484)) by [trainman261](https://github.com/trainman261)
+- **cbcplayerplaylist**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ed711897814f3ee0b1822e4205e74133467e8f1c) ([#7870](https://github.com/yt-dlp/yt-dlp/issues/7870)) by [trainman261](https://github.com/trainman261)
+- **cineverse**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/15591940ff102d1ae337d603a46d8f238c83a61f) ([#8146](https://github.com/yt-dlp/yt-dlp/issues/8146)) by [garret1317](https://github.com/garret1317)
+- **crunchyroll**: [Remove initial state extraction](https://github.com/yt-dlp/yt-dlp/commit/9b16762f48914de9ac914601769c76668e433325) ([#7632](https://github.com/yt-dlp/yt-dlp/issues/7632)) by [Grub4K](https://github.com/Grub4K)
+- **douyutv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/21f40e75dfc0055ea9cdbd7fe2c46c6f9b561afd) ([#7652](https://github.com/yt-dlp/yt-dlp/issues/7652)) by [c-basalt](https://github.com/c-basalt)
+- **dropbox**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/b9f2bc2dbed2323734a0d18e65e1e2e23dc833d8) ([#7926](https://github.com/yt-dlp/yt-dlp/issues/7926)) by [bashonly](https://github.com/bashonly), [denhotte](https://github.com/denhotte), [nathantouze](https://github.com/nathantouze) (With fixes in [099fb1b](https://github.com/yt-dlp/yt-dlp/commit/099fb1b35cf835303306549f5113d1802d79c9c7) by [bashonly](https://github.com/bashonly))
+- **eplus**: inbound: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/295fbb3ae3a7d0dd50e286be5c487cf145ed5778) ([#5782](https://github.com/yt-dlp/yt-dlp/issues/5782)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **expressen**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/a5e264d74b4bd60c6e7ec4e38f1a23af4e420531) ([#8153](https://github.com/yt-dlp/yt-dlp/issues/8153)) by [kylegustavo](https://github.com/kylegustavo)
+- **facebook**
+    - [Add dash manifest URL](https://github.com/yt-dlp/yt-dlp/commit/a854fbec56d5004f5147116a41d1dd050632a579) ([#7743](https://github.com/yt-dlp/yt-dlp/issues/7743)) by [ringus1](https://github.com/ringus1)
+    - [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/d3d81cc98f554d0adb87d24bfd6fabaaa803944d) ([#7890](https://github.com/yt-dlp/yt-dlp/issues/7890)) by [ringus1](https://github.com/ringus1)
+    - [Improve format sorting](https://github.com/yt-dlp/yt-dlp/commit/308936619c8a4f3a52d73c829c2006ff6c55fea2) ([#8074](https://github.com/yt-dlp/yt-dlp/issues/8074)) by [fireattack](https://github.com/fireattack)
+    - reel: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/bb5d84c9d2f1e978c3eddfb5ccbe138036682a36) ([#7564](https://github.com/yt-dlp/yt-dlp/issues/7564)) by [bashonly](https://github.com/bashonly), [demon071](https://github.com/demon071)
+- **fox**: [Support foxsports.com](https://github.com/yt-dlp/yt-dlp/commit/30b29f37159e9226e2f2d5434c9a4096ac4efa2e) ([#7724](https://github.com/yt-dlp/yt-dlp/issues/7724)) by [ischmidt20](https://github.com/ischmidt20)
+- **funker530**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/0ce1f48bf1cb78d40d734ce73ee1c90eccf92274) ([#8040](https://github.com/yt-dlp/yt-dlp/issues/8040)) by [04-pasha-04](https://github.com/04-pasha-04)
+- **generic**
+    - [Fix KVS thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/53675852195d8dd859555d4789944a6887171ff8) by [bashonly](https://github.com/bashonly)
+    - [Fix generic title for embeds](https://github.com/yt-dlp/yt-dlp/commit/994f7ef8e6003f4b7b258528755d0b6adcc31714) by [pukkandan](https://github.com/pukkandan)
+- **gofile**: [Update token](https://github.com/yt-dlp/yt-dlp/commit/99c99c7185f5d8e9b3699a6fc7f86ec663d7b97e) by [bashonly](https://github.com/bashonly)
+- **hotstar**
+    - [Extract `release_year`](https://github.com/yt-dlp/yt-dlp/commit/7237c8dca0590aa7438ade93f927df88c9381ec7) ([#7869](https://github.com/yt-dlp/yt-dlp/issues/7869)) by [Rajeshwaran2001](https://github.com/Rajeshwaran2001)
+    - [Make metadata extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/30ea88591b728cca0896018dbf67c2298070c669) by [bashonly](https://github.com/bashonly)
+    - [Support `/clips/` URLs](https://github.com/yt-dlp/yt-dlp/commit/86eeb044c2342d68c6ef177577f87852e6badd85) ([#7710](https://github.com/yt-dlp/yt-dlp/issues/7710)) by [bashonly](https://github.com/bashonly)
+- **hungama**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/4b3a6ef1b3e235ba9a45142830b6edb357c71696) ([#7757](https://github.com/yt-dlp/yt-dlp/issues/7757)) by [bashonly](https://github.com/bashonly), [Yalab7](https://github.com/Yalab7)
+- **indavideoembed**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/63e0c5748c0eb461a2ccca4181616eb930b4b750) ([#8129](https://github.com/yt-dlp/yt-dlp/issues/8129)) by [aky-01](https://github.com/aky-01)
+- **iprima**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/568f08051841aedea968258889539741e26009e9) ([#7216](https://github.com/yt-dlp/yt-dlp/issues/7216)) by [std-move](https://github.com/std-move)
+- **lbry**: [Fix original format extraction](https://github.com/yt-dlp/yt-dlp/commit/127a22460658ac39cbe5c4b3fb88d578363e0dfa) ([#7711](https://github.com/yt-dlp/yt-dlp/issues/7711)) by [bashonly](https://github.com/bashonly)
+- **lecturio**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/efa2339502a37cf13ae7f143bd8b2c28f452d1cd) ([#7649](https://github.com/yt-dlp/yt-dlp/issues/7649)) by [simon300000](https://github.com/simon300000)
+- **magellantv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f4ea501551526ebcb54d19b84cf0ebe798583a85) ([#7616](https://github.com/yt-dlp/yt-dlp/issues/7616)) by [bashonly](https://github.com/bashonly)
+- **massengeschmack.tv**: [Fix title extraction](https://github.com/yt-dlp/yt-dlp/commit/81f46ac573dc443ad48560f308582a26784d3015) ([#7813](https://github.com/yt-dlp/yt-dlp/issues/7813)) by [sb0stn](https://github.com/sb0stn)
+- **media.ccc.de**: lists: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/cf11b40ac40e3d23a6352753296f3a732886efb9) ([#8144](https://github.com/yt-dlp/yt-dlp/issues/8144)) by [Rohxn16](https://github.com/Rohxn16)
+- **mediaite**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/630a55df8de7747e79aa680959d785dfff2c4b76) ([#7923](https://github.com/yt-dlp/yt-dlp/issues/7923)) by [Grabien](https://github.com/Grabien)
+- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6e07e4bc7e59f5bdb60e93c011e57b18b009f2b5) ([#8086](https://github.com/yt-dlp/yt-dlp/issues/8086)) by [bashonly](https://github.com/bashonly), [zhallgato](https://github.com/zhallgato)
+- **mediastream**: [Make embed extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/635ae31f68a3ac7f6393d59657ed711e34ee3552) by [bashonly](https://github.com/bashonly)
+- **mixcloud**: [Update API URL](https://github.com/yt-dlp/yt-dlp/commit/7b71643cc986de9a3768dac4ac9b64f4d05e7f5e) ([#8114](https://github.com/yt-dlp/yt-dlp/issues/8114)) by [garret1317](https://github.com/garret1317)
+- **monstercat**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/eaee21bf71889d495076037cbe590c8c0b21ef3a) ([#8133](https://github.com/yt-dlp/yt-dlp/issues/8133)) by [garret1317](https://github.com/garret1317)
+- **motortrendondemand**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c03a58ec9933e4a42c2d8fa80b8a0ddb2cde64e6) ([#7683](https://github.com/yt-dlp/yt-dlp/issues/7683)) by [AmirAflak](https://github.com/AmirAflak)
+- **museai**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/65cfa2b057d7946fbe322155a778fe206556d0c6) ([#7614](https://github.com/yt-dlp/yt-dlp/issues/7614)) by [bashonly](https://github.com/bashonly)
+- **mzaalo**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/d7aee8e310b2c4f21d50aac0b420e1b3abde21a4) by [bashonly](https://github.com/bashonly)
+- **n1info**: article: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/8ac5b6d96ae5c60cd5ae2495949e0068a6754c45) ([#7373](https://github.com/yt-dlp/yt-dlp/issues/7373)) by [u-spec-png](https://github.com/u-spec-png)
+- **nfl.com**: plus, replay: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1eaca74bc2ca0f5b1ec532f24c61de44f2e8cb2d) ([#7838](https://github.com/yt-dlp/yt-dlp/issues/7838)) by [bashonly](https://github.com/bashonly)
+- **niconicochannelplus**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/698beb9a497f51693e64d167e572ff9efa4bc25f) ([#5686](https://github.com/yt-dlp/yt-dlp/issues/5686)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **nitter**: [Fix title extraction fallback](https://github.com/yt-dlp/yt-dlp/commit/a83da3717d30697102e76f63a6f29d77f9373c2a) ([#8102](https://github.com/yt-dlp/yt-dlp/issues/8102)) by [ApoorvShah111](https://github.com/ApoorvShah111)
+- **noodlemagazine**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/bae4834245a708fff97219849ec880c319c88bc6) ([#7830](https://github.com/yt-dlp/yt-dlp/issues/7830)) by [RedDeffender](https://github.com/RedDeffender) (With fixes in [69dbfe0](https://github.com/yt-dlp/yt-dlp/commit/69dbfe01c47cd078682a87f179f5846e2679e927) by [bashonly](https://github.com/bashonly))
+- **novaembed**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2269065ad60cb0ab62408ae6a7b20283e5252232) ([#7910](https://github.com/yt-dlp/yt-dlp/issues/7910)) by [std-move](https://github.com/std-move)
+- **patreoncampaign**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/11de6fec9c9b8d34d1f90c8e6218ec58a3471b58) ([#7664](https://github.com/yt-dlp/yt-dlp/issues/7664)) by [bashonly](https://github.com/bashonly)
+- **pbs**: [Add extractor `PBSKidsIE`](https://github.com/yt-dlp/yt-dlp/commit/6d6081dda1290a85bdab6717f239289e3aa74c8e) ([#7602](https://github.com/yt-dlp/yt-dlp/issues/7602)) by [snixon](https://github.com/snixon)
+- **piapro**: [Support `/content` URL](https://github.com/yt-dlp/yt-dlp/commit/1bcb9fe8715b1f288efc322be3de409ee0597080) ([#7592](https://github.com/yt-dlp/yt-dlp/issues/7592)) by [FinnRG](https://github.com/FinnRG)
+- **piaulizaportal**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6636021206dad17c7745ae6bce6cb73d6f2ef319) ([#7903](https://github.com/yt-dlp/yt-dlp/issues/7903)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **picartovod**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/db9743894071760f994f640a4c24358f749a78c0) ([#7727](https://github.com/yt-dlp/yt-dlp/issues/7727)) by [Frankgoji](https://github.com/Frankgoji)
+- **pornbox**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/40999467f72db074a3f13057da9bf82a857530fe) ([#7386](https://github.com/yt-dlp/yt-dlp/issues/7386)) by [niemands](https://github.com/niemands)
+- **pornhub**: [Update access cookies for UK](https://github.com/yt-dlp/yt-dlp/commit/1d3d579c2142f69831b6ae140e1d8e824e07fa0e) ([#7591](https://github.com/yt-dlp/yt-dlp/issues/7591)) by [zhong-yiyu](https://github.com/zhong-yiyu)
+- **pr0gramm**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/b532556d0a85e7d76f8f0880861232fb706ddbc5) ([#8151](https://github.com/yt-dlp/yt-dlp/issues/8151)) by [Grub4K](https://github.com/Grub4K)
+- **radiofrance**: [Add support for livestreams, podcasts, playlists](https://github.com/yt-dlp/yt-dlp/commit/ba8e9eb2c8bbb699f314169fab8e544437ad731e) ([#7006](https://github.com/yt-dlp/yt-dlp/issues/7006)) by [elyse0](https://github.com/elyse0)
+- **rbgtum**: [Fix extraction and support new URL format](https://github.com/yt-dlp/yt-dlp/commit/5fccabac27ca3c1165ade1b0df6fbadc24258dc2) ([#7690](https://github.com/yt-dlp/yt-dlp/issues/7690)) by [simon300000](https://github.com/simon300000)
+- **reddit**
+    - [Extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/20c3c9b433dd47faf0dbde6b46e4e34eb76109a5) by [bashonly](https://github.com/bashonly)
+    - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/9a04113dfbb69b904e4e2bea736da293505786b8) by [bashonly](https://github.com/bashonly)
+- **rtvslo**: [Fix format extraction](https://github.com/yt-dlp/yt-dlp/commit/94389b225d9bcf29aa7ba8afaf1bbd7c62204eae) ([#8131](https://github.com/yt-dlp/yt-dlp/issues/8131)) by [bashonly](https://github.com/bashonly)
+- **rule34video**: [Extract tags](https://github.com/yt-dlp/yt-dlp/commit/58493923e9b6f774947a2131e5258e9f3cf816be) ([#7117](https://github.com/yt-dlp/yt-dlp/issues/7117)) by [soundchaser128](https://github.com/soundchaser128)
+- **rumble**: [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/23d829a3420450bcfb0788e6fb2cf4f6acdbe596) ([#8035](https://github.com/yt-dlp/yt-dlp/issues/8035)) by [trislee](https://github.com/trislee)
+- **s4c**
+    - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b9de629d78ce31699f2de886071dc257830f9676) ([#7730](https://github.com/yt-dlp/yt-dlp/issues/7730)) by [ifan-t](https://github.com/ifan-t)
+    - [Add series support and extract subs/thumbs](https://github.com/yt-dlp/yt-dlp/commit/fe371dcf0ba5ce8d42480eade54eeeac99ab3cb0) ([#7776](https://github.com/yt-dlp/yt-dlp/issues/7776)) by [ifan-t](https://github.com/ifan-t)
+- **sohu**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5be7e978867b5f66ad6786c674d79d40e950ae16) ([#7628](https://github.com/yt-dlp/yt-dlp/issues/7628)) by [bashonly](https://github.com/bashonly), [c-basalt](https://github.com/c-basalt)
+- **stageplus**: [Fix m3u8 extraction](https://github.com/yt-dlp/yt-dlp/commit/56b3dc03354b75be995759d8441d2754c0442b9a) ([#7929](https://github.com/yt-dlp/yt-dlp/issues/7929)) by [bashonly](https://github.com/bashonly)
+- **streamanity**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/2cfe221fbbe46faa3f46552c08d947a51f424903) ([#7571](https://github.com/yt-dlp/yt-dlp/issues/7571)) by [alerikaisattera](https://github.com/alerikaisattera)
+- **svtplay**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/2301b5c1b77a65abbb46b72f91e1e4666fd5d985) ([#7789](https://github.com/yt-dlp/yt-dlp/issues/7789)) by [dirkf](https://github.com/dirkf), [wader](https://github.com/wader)
+- **tbsjp**: [Add episode, program, playlist extractors](https://github.com/yt-dlp/yt-dlp/commit/876b70c8edf4c0147f180bd981fbc4d625cbfb9c) ([#7765](https://github.com/yt-dlp/yt-dlp/issues/7765)) by [garret1317](https://github.com/garret1317)
+- **tiktok**
+    - [Fix audio-only format extraction](https://github.com/yt-dlp/yt-dlp/commit/b09bd0c19648f60c59fb980cd454cb0069959fb9) ([#7712](https://github.com/yt-dlp/yt-dlp/issues/7712)) by [bashonly](https://github.com/bashonly)
+    - [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/069cbece9dba6384f1cc5fcfc7ce562a31af42fc) by [bashonly](https://github.com/bashonly)
+- **triller**: [Fix unlisted video extraction](https://github.com/yt-dlp/yt-dlp/commit/39837ae3199aa934299badbd0d63243ed639e6c8) ([#7670](https://github.com/yt-dlp/yt-dlp/issues/7670)) by [bashonly](https://github.com/bashonly)
+- **tv5mondeplus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7d3d658f4c558ee7d72b1c01b46f2126948681cd) ([#7952](https://github.com/yt-dlp/yt-dlp/issues/7952)) by [dirkf](https://github.com/dirkf), [korli](https://github.com/korli)
+- **twitcasting**
+    - [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/cebbd33b1c678149fc8f0e254db6fc0da317ea80) ([#8120](https://github.com/yt-dlp/yt-dlp/issues/8120)) by [c-basalt](https://github.com/c-basalt)
+    - [Support `--wait-for-video`](https://github.com/yt-dlp/yt-dlp/commit/c1d71d0d9f41db5e4306c86af232f5f6220a130b) ([#7975](https://github.com/yt-dlp/yt-dlp/issues/7975)) by [at-wat](https://github.com/at-wat)
+- **twitter**
+    - [Add fallback, improve error handling](https://github.com/yt-dlp/yt-dlp/commit/6014355c6142f68e20c8374e3787e5b5820f19e2) ([#7621](https://github.com/yt-dlp/yt-dlp/issues/7621)) by [bashonly](https://github.com/bashonly)
+    - [Fix GraphQL and legacy API](https://github.com/yt-dlp/yt-dlp/commit/92315c03774cfabb3a921884326beb4b981f786b) ([#7516](https://github.com/yt-dlp/yt-dlp/issues/7516)) by [bashonly](https://github.com/bashonly)
+    - [Fix retweet extraction and syndication API](https://github.com/yt-dlp/yt-dlp/commit/a006ce2b27357c15792eb5c18f06765e640b801c) ([#8016](https://github.com/yt-dlp/yt-dlp/issues/8016)) by [bashonly](https://github.com/bashonly)
+    - [Revert 92315c03774cfabb3a921884326beb4b981f786b](https://github.com/yt-dlp/yt-dlp/commit/b03fa7834579a01cc5fba48c0e73488a16683d48) by [pukkandan](https://github.com/pukkandan)
+    - spaces
+        - [Fix format protocol](https://github.com/yt-dlp/yt-dlp/commit/613dbce177d34ffc31053e8e01acf4bb107bcd1e) ([#7550](https://github.com/yt-dlp/yt-dlp/issues/7550)) by [bashonly](https://github.com/bashonly)
+        - [Pass referer header to downloader](https://github.com/yt-dlp/yt-dlp/commit/c6ef553792ed48462f9fd0e78143bef6b1a71c2e) by [bashonly](https://github.com/bashonly)
+- **unsupported**: [List more sites with DRM](https://github.com/yt-dlp/yt-dlp/commit/e7057383380d7d53815f8feaf90ca3dcbde88983) by [pukkandan](https://github.com/pukkandan)
+- **videa**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/98eac0e6ba0e510ae7dfdfd249d42ee71fb272b1) ([#8003](https://github.com/yt-dlp/yt-dlp/issues/8003)) by [aky-01](https://github.com/aky-01), [hatsomatt](https://github.com/hatsomatt)
+- **vrt**: [Update token signing key](https://github.com/yt-dlp/yt-dlp/commit/325191d0c9bf3fe257b8a7c2eb95080f44f6ddfc) ([#7519](https://github.com/yt-dlp/yt-dlp/issues/7519)) by [Zprokkel](https://github.com/Zprokkel)
+- **wat.tv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7cccab79e7d00ed965b48b8cefce1da8a0513409) ([#7898](https://github.com/yt-dlp/yt-dlp/issues/7898)) by [davinkevin](https://github.com/davinkevin)
+- **wdr**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0395498d7065aa5e55bac85fa9354b4b0d48eb) ([#7979](https://github.com/yt-dlp/yt-dlp/issues/7979)) by [szabyg](https://github.com/szabyg)
+- **web.archive**: vlive: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/9652bca1bd02f6bc1b8cb1e186f2ccbf32225561) ([#8132](https://github.com/yt-dlp/yt-dlp/issues/8132)) by [bashonly](https://github.com/bashonly)
+- **weibo**: [Fix extractor and support user extraction](https://github.com/yt-dlp/yt-dlp/commit/69b03f84f8378b0b5a2fbae56f9b7d860b2f529e) ([#7657](https://github.com/yt-dlp/yt-dlp/issues/7657)) by [c-basalt](https://github.com/c-basalt)
+- **weverse**: [Support extraction without auth](https://github.com/yt-dlp/yt-dlp/commit/c2d8ee0000302aba63476b7d5bd8793e57b6c8c6) ([#7924](https://github.com/yt-dlp/yt-dlp/issues/7924)) by [seproDev](https://github.com/seproDev)
+- **wimbledon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a15fcd299e767a510debd8dc1646fe863b96ce0e) ([#7551](https://github.com/yt-dlp/yt-dlp/issues/7551)) by [nnoboa](https://github.com/nnoboa)
+- **wrestleuniverseppv**: [Fix HLS AES key extraction](https://github.com/yt-dlp/yt-dlp/commit/dae349da97cafe7357106a8f3187fd48a2ad1210) by [bashonly](https://github.com/bashonly)
+- **youtube**
+    - [Add `player_params` extractor arg](https://github.com/yt-dlp/yt-dlp/commit/ba06d77a316650ff057347d224b5afa8b203ad65) ([#7719](https://github.com/yt-dlp/yt-dlp/issues/7719)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Fix `player_params` arg being converted to lowercase](https://github.com/yt-dlp/yt-dlp/commit/546b2c28a106cf8101d481b215b676d1b091d276) by [coletdjnz](https://github.com/coletdjnz)
+    - [Fix consent cookie](https://github.com/yt-dlp/yt-dlp/commit/378ae9f9fb8e8c86e6ac89c4c5b815b48ce93620) ([#7774](https://github.com/yt-dlp/yt-dlp/issues/7774)) by [coletdjnz](https://github.com/coletdjnz)
+    - tab: [Detect looping feeds](https://github.com/yt-dlp/yt-dlp/commit/1ba6fe9db5f660d5538588315c23ad6cf0371c5f) ([#6621](https://github.com/yt-dlp/yt-dlp/issues/6621)) by [coletdjnz](https://github.com/coletdjnz)
+- **zaiko**: [Improve thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/ecef42c3adbcb6a84405139047923c4967316f28) ([#8054](https://github.com/yt-dlp/yt-dlp/issues/8054)) by [pzhlkj6612](https://github.com/pzhlkj6612)
+- **zee5**: [Update access token endpoint](https://github.com/yt-dlp/yt-dlp/commit/a0de8bb8601146b8f87bf7cd562eef8bfb4690be) ([#7914](https://github.com/yt-dlp/yt-dlp/issues/7914)) by [bashonly](https://github.com/bashonly)
+- **zoom**: [Extract duration](https://github.com/yt-dlp/yt-dlp/commit/66cc64ff6696f9921ff112a278542f8d999ffea4) by [bashonly](https://github.com/bashonly)
+
+#### Downloader changes
+- **external**
+    - [Fix ffmpeg input from stdin](https://github.com/yt-dlp/yt-dlp/commit/e57eb98222d29cc4c09ee975d3c492274a6e5be3) ([#7655](https://github.com/yt-dlp/yt-dlp/issues/7655)) by [bashonly](https://github.com/bashonly)
+    - [Fixes to cookie handling](https://github.com/yt-dlp/yt-dlp/commit/42ded0a429c20ec13dc006825e1508d9a02f0ad4) by [bashonly](https://github.com/bashonly)
+
+#### Postprocessor changes
+- **embedthumbnail**: [Support `m4v`](https://github.com/yt-dlp/yt-dlp/commit/8a4cd12c8f8e93292e3e95200b9d17a3af39624c) ([#7583](https://github.com/yt-dlp/yt-dlp/issues/7583)) by [Neurognostic](https://github.com/Neurognostic)
+
+#### Networking changes
+- [Add module](https://github.com/yt-dlp/yt-dlp/commit/c365dba8430ee33abda85d31f95128605bf240eb) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [pukkandan](https://github.com/pukkandan)
+- [Add request handler preference framework](https://github.com/yt-dlp/yt-dlp/commit/db7b054a6111ca387220d0eb87bf342f9c130eb8) ([#7603](https://github.com/yt-dlp/yt-dlp/issues/7603)) by [coletdjnz](https://github.com/coletdjnz)
+- [Add strict Request extension checking](https://github.com/yt-dlp/yt-dlp/commit/86aea0d3a213da3be1da638b9b828e6f0ee1d59f) ([#7604](https://github.com/yt-dlp/yt-dlp/issues/7604)) by [coletdjnz](https://github.com/coletdjnz)
+- [Fix POST requests with zero-length payloads](https://github.com/yt-dlp/yt-dlp/commit/71baa490ebd3655746430f208a9b605d120cd315) ([#7648](https://github.com/yt-dlp/yt-dlp/issues/7648)) by [bashonly](https://github.com/bashonly)
+- [Fix `--legacy-server-connect`](https://github.com/yt-dlp/yt-dlp/commit/75dc8e673b481a82d0688aeec30f6c65d82bb359) ([#7645](https://github.com/yt-dlp/yt-dlp/issues/7645)) by [bashonly](https://github.com/bashonly)
+- [Fix various socks proxy bugs](https://github.com/yt-dlp/yt-dlp/commit/20fbbd9249a2f26c7ae579bde5ba5d69aa8fac69) ([#8065](https://github.com/yt-dlp/yt-dlp/issues/8065)) by [coletdjnz](https://github.com/coletdjnz)
+- [Ignore invalid proxies in env](https://github.com/yt-dlp/yt-dlp/commit/bbeacff7fcaa3b521066088a5ccbf34ef5070d1d) ([#7704](https://github.com/yt-dlp/yt-dlp/issues/7704)) by [coletdjnz](https://github.com/coletdjnz)
+- [Rewrite architecture](https://github.com/yt-dlp/yt-dlp/commit/227bf1a33be7b89cd7d44ad046844c4ccba104f4) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [coletdjnz](https://github.com/coletdjnz)
+- **Request Handler**
+    - urllib
+        - [Remove dot segments during URL normalization](https://github.com/yt-dlp/yt-dlp/commit/4bf912282a34b58b6b35d8f7e6be535770c89c76) ([#7662](https://github.com/yt-dlp/yt-dlp/issues/7662)) by [coletdjnz](https://github.com/coletdjnz) (see the sketch below)
+        - [Simplify gzip decoding](https://github.com/yt-dlp/yt-dlp/commit/59e92b1f1833440bb2190f847eb735cf0f90bc85) ([#7611](https://github.com/yt-dlp/yt-dlp/issues/7611)) by [Grub4K](https://github.com/Grub4K) (With fixes in [77bff23](https://github.com/yt-dlp/yt-dlp/commit/77bff23ee97565bab2e0d75b893a21bf7983219a))
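+
+The dot-segment normalization referenced above follows RFC 3986 section 5.2.4; a self-contained sketch of that algorithm (not the handler's actual code):
+
+```python
+def remove_dot_segments(path: str) -> str:
+    """RFC 3986 section 5.2.4: resolve "." and ".." segments in a URL path."""
+    output = []
+    while path:
+        if path.startswith("../"):
+            path = path[3:]
+        elif path.startswith("./"):
+            path = path[2:]
+        elif path.startswith("/./"):
+            path = "/" + path[3:]
+        elif path == "/.":
+            path = "/"
+        elif path.startswith("/../"):
+            path = "/" + path[4:]
+            if output:
+                output.pop()  # ".." removes the previous segment
+        elif path == "/..":
+            path = "/"
+            if output:
+                output.pop()
+        elif path in (".", ".."):
+            path = ""
+        else:
+            # move one complete segment to the output buffer
+            slash = path.find("/", 1)
+            seg, path = (path, "") if slash == -1 else (path[:slash], path[slash:])
+            output.append(seg)
+    return "".join(output)
+
+assert remove_dot_segments("/a/b/c/./../../g") == "/a/g"
+```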
+
+#### Misc. changes
+- **build**: [Make sure deprecated modules are added](https://github.com/yt-dlp/yt-dlp/commit/131d132da5c98c6c78bd7eed4b37f4458561b3d9) by [pukkandan](https://github.com/pukkandan)
+- **cleanup**
+    - [Add color to `download-archive` message](https://github.com/yt-dlp/yt-dlp/commit/2b029ca0a9f9105c4f7626993fa60e54c9782749) ([#5138](https://github.com/yt-dlp/yt-dlp/issues/5138)) by [aaruni96](https://github.com/aaruni96), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan)
+    - Miscellaneous
+        - [6148833](https://github.com/yt-dlp/yt-dlp/commit/6148833f5ceb7674142ddb8d761ffe03cee7df69), [62b5c94](https://github.com/yt-dlp/yt-dlp/commit/62b5c94cadaa5f596dc1a7083db9db12efe357be) by [pukkandan](https://github.com/pukkandan)
+        - [5ca095c](https://github.com/yt-dlp/yt-dlp/commit/5ca095cbcde3e32642a4fe5b2d69e8e3c785a021) by [barsnick](https://github.com/barsnick), [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [sqrtNOT](https://github.com/sqrtNOT)
+        - [088add9](https://github.com/yt-dlp/yt-dlp/commit/088add9567d39b758737e4299a0e619fd89d2e8f) by [Grub4K](https://github.com/Grub4K)
+- **devscripts**: `make_changelog`: [Fix changelog grouping and add networking group](https://github.com/yt-dlp/yt-dlp/commit/30ba233d4cee945756ed7344e7ddb3a90d2ae608) ([#8124](https://github.com/yt-dlp/yt-dlp/issues/8124)) by [Grub4K](https://github.com/Grub4K)
+- **docs**: [Update collaborators](https://github.com/yt-dlp/yt-dlp/commit/1be0a96a4d14f629097509fcc89d15f69a8243c7) by [Grub4K](https://github.com/Grub4K)
+- **test**
+    - [Add tests for socks proxies](https://github.com/yt-dlp/yt-dlp/commit/fcd6a76adc49d5cd8783985c7ce35384b72e545f) ([#7908](https://github.com/yt-dlp/yt-dlp/issues/7908)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Fix `httplib_validation_errors` test for old Python versions](https://github.com/yt-dlp/yt-dlp/commit/95abea9a03289da1384e5bda3d590223ccc0a238) ([#7677](https://github.com/yt-dlp/yt-dlp/issues/7677)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Fix `test_load_certifi`](https://github.com/yt-dlp/yt-dlp/commit/de20687ee6b742646128a7629b57096631a20619) by [pukkandan](https://github.com/pukkandan)
+    - download: [Test for `expected_exception`](https://github.com/yt-dlp/yt-dlp/commit/661c9a1d029296b28e0b2f8be8a72a43abaf6536) by [at-wat](https://github.com/at-wat)
+
+### 2023.07.06
+
+#### Important changes
+- Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)
+    - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains
+    - Cookies are scoped when passed to external downloaders
+    - Add `cookies` field to info.json and deprecate `http_headers.Cookie`
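+
+For downstream tools reading info.json, a hedged sketch of the migration implied above (the filename is hypothetical): prefer the new top-level `cookies` field and treat `http_headers.Cookie` only as a legacy fallback:
+
+```python
+import json
+
+with open("video.info.json", encoding="utf-8") as f:
+    info = json.load(f)
+
+# New, URL-scoped field added by this release
+cookies = info.get("cookies")
+if cookies is None:
+    # Deprecated location, present only in older dumps
+    cookies = info.get("http_headers", {}).get("Cookie")
+print(cookies)
+```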
+
+#### Core changes
+- [Allow extractors to mark formats as potentially DRM](https://github.com/yt-dlp/yt-dlp/commit/bc344cd456380999c1ee74554dfd432a38f32ec7) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan)
+- [Bugfix for b4e0d75848e9447cee2cd3646ce54d4744a7ff56](https://github.com/yt-dlp/yt-dlp/commit/e59e20744eb32ce4b6ea0dece7c673be8376a710) by [pukkandan](https://github.com/pukkandan)
+- [Change how `Cookie` headers are handled](https://github.com/yt-dlp/yt-dlp/commit/3121512228487c9c690d3d39bfd2579addf96e07) by [Grub4K](https://github.com/Grub4K)
+- [Prevent `Cookie` leaks on HTTP redirect](https://github.com/yt-dlp/yt-dlp/commit/f8b4bcc0a791274223723488bfbfc23ea3276641) by [coletdjnz](https://github.com/coletdjnz)
+- **formats**: [Fix best fallback for storyboards](https://github.com/yt-dlp/yt-dlp/commit/906c0bdcd8974340d619e99ccd613c163eb0d0c2) by [pukkandan](https://github.com/pukkandan)
+- **outtmpl**: [Pad `playlist_index` etc even when with internal formatting](https://github.com/yt-dlp/yt-dlp/commit/47bcd437247152e0af5b3ebc5592db7bb66855c2) by [pukkandan](https://github.com/pukkandan)
+- **utils**: clean_podcast_url: [Handle protocol in redirect URL](https://github.com/yt-dlp/yt-dlp/commit/91302ed349f34dc26cc1d661bb45a4b71f4417f7) by [pukkandan](https://github.com/pukkandan)
+
+#### Extractor changes
+- **abc**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/8f05fbae2a79ce0713077ccc68b354e63216bf20) ([#7434](https://github.com/yt-dlp/yt-dlp/issues/7434)) by [meliber](https://github.com/meliber)
+- **AdultSwim**: [Extract subtitles from m3u8](https://github.com/yt-dlp/yt-dlp/commit/5e16cf92eb496b7c1541a6b1d727cb87542984db) ([#7421](https://github.com/yt-dlp/yt-dlp/issues/7421)) by [nnoboa](https://github.com/nnoboa)
+- **crunchyroll**: music: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/5b4b92769afcc398475e481bfa839f1158902fe9) ([#7439](https://github.com/yt-dlp/yt-dlp/issues/7439)) by [AmanSal1](https://github.com/AmanSal1), [rdamas](https://github.com/rdamas)
+- **Douyin**: [Fix extraction from webpage](https://github.com/yt-dlp/yt-dlp/commit/a2be9781fbf4d7e4db245c277ca2ecc41cf3a7b2) by [bashonly](https://github.com/bashonly)
+- **googledrive**: [Fix source format extraction](https://github.com/yt-dlp/yt-dlp/commit/3b7f5300c577fef40464d46d4e4037a69d51fe82) ([#7395](https://github.com/yt-dlp/yt-dlp/issues/7395)) by [RfadnjdExt](https://github.com/RfadnjdExt)
+- **kick**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/ef8509c300ea50da86aea447eb214d3d6f6db6bb) by [bashonly](https://github.com/bashonly)
+- **qdance**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f0a1ff118145b6449982ba401f9a9f656ecd8062) ([#7420](https://github.com/yt-dlp/yt-dlp/issues/7420)) by [bashonly](https://github.com/bashonly)
+- **sbs**: [Python 3.7 compat](https://github.com/yt-dlp/yt-dlp/commit/f393bbe724b1fc6c7f754a5da507e807b2b40ad2) by [pukkandan](https://github.com/pukkandan)
+- **stacommu**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/af1fd12f675220df6793fc019dff320bc76e8080) ([#7432](https://github.com/yt-dlp/yt-dlp/issues/7432)) by [urectanc](https://github.com/urectanc)
+- **twitter**
+    - [Fix unauthenticated extraction](https://github.com/yt-dlp/yt-dlp/commit/49296437a8e5fa91dacb5446e51ab588474c85d3) ([#7476](https://github.com/yt-dlp/yt-dlp/issues/7476)) by [bashonly](https://github.com/bashonly)
+    - spaces: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1cffd621cb371f1563563cfb2fe37d137e8a7bee) ([#7512](https://github.com/yt-dlp/yt-dlp/issues/7512)) by [bashonly](https://github.com/bashonly)
+- **vidlii**: [Handle relative URLs](https://github.com/yt-dlp/yt-dlp/commit/ad8902f616ad2541f9b9626738f1393fad89a64c) by [pukkandan](https://github.com/pukkandan)
+- **vk**: VKPlay, VKPlayLive: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/8776349ef6b1f644584a92dfa00a05208a48edc4) ([#7358](https://github.com/yt-dlp/yt-dlp/issues/7358)) by [c-basalt](https://github.com/c-basalt)
+- **youtube**
+    - [Add extractor-arg `formats`](https://github.com/yt-dlp/yt-dlp/commit/58786a10f212bd63f9ad1d0b4d9e4d31c3b385e2) by [pukkandan](https://github.com/pukkandan)
+    - [Avoid false DRM detection](https://github.com/yt-dlp/yt-dlp/commit/94ed638a437fc766699d440e978982e24ce6a30a) ([#7396](https://github.com/yt-dlp/yt-dlp/issues/7396)) by [pukkandan](https://github.com/pukkandan)
+    - [Fix comments' `is_favorited`](https://github.com/yt-dlp/yt-dlp/commit/89bed013741a776506f60380b7fd89d27d0710b4) ([#7390](https://github.com/yt-dlp/yt-dlp/issues/7390)) by [bbilly1](https://github.com/bbilly1)
+    - [Ignore incomplete data for comment threads by default](https://github.com/yt-dlp/yt-dlp/commit/4dc4d8473c085900edc841c87c20041233d25b1f) ([#7475](https://github.com/yt-dlp/yt-dlp/issues/7475)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Process `post_live` over 2 hours](https://github.com/yt-dlp/yt-dlp/commit/d949c10c45bfc359bdacd52e6a180169b8128958) by [pukkandan](https://github.com/pukkandan)
+    - stories: [Remove](https://github.com/yt-dlp/yt-dlp/commit/90db9a3c00ca80492c6a58c542e4cbf4c2710866) ([#7459](https://github.com/yt-dlp/yt-dlp/issues/7459)) by [pukkandan](https://github.com/pukkandan)
+    - tab: [Support shorts-only playlists](https://github.com/yt-dlp/yt-dlp/commit/fcbc9ed760be6e3455bbadfaf277b4504b06f068) ([#7425](https://github.com/yt-dlp/yt-dlp/issues/7425)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Downloader changes
+- **aria2c**: [Add `--no-conf`](https://github.com/yt-dlp/yt-dlp/commit/8a8af356e3bba98a7f7d333aff0777d5d92130c8) by [pukkandan](https://github.com/pukkandan)
+- **external**: [Scope cookies](https://github.com/yt-dlp/yt-dlp/commit/1ceb657bdd254ad961489e5060f2ccc7d556b729) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz)
+- **http**: [Avoid infinite loop when no data is received](https://github.com/yt-dlp/yt-dlp/commit/662ef1e910b72e57957f06589925b2332ba52821) by [pukkandan](https://github.com/pukkandan)
+
+#### Misc. changes
+- [Add CodeQL workflow](https://github.com/yt-dlp/yt-dlp/commit/6355b5f1e1e8e7f4ef866d71d51e03baf0e82f17) ([#7497](https://github.com/yt-dlp/yt-dlp/issues/7497)) by [jorgectf](https://github.com/jorgectf)
+- **cleanup**: Miscellaneous: [337734d](https://github.com/yt-dlp/yt-dlp/commit/337734d4a8a6500bc65434843db346b5cbd05e81) by [pukkandan](https://github.com/pukkandan)
+- **docs**: [Minor fixes](https://github.com/yt-dlp/yt-dlp/commit/b532a3481046e1eabb6232ee8196fb696c356ff6) by [pukkandan](https://github.com/pukkandan)
+- **make_changelog**: [Skip reverted commits](https://github.com/yt-dlp/yt-dlp/commit/fa44802809d189fca0f4782263d48d6533384503) by [pukkandan](https://github.com/pukkandan)
+
+### 2023.06.22
+
+#### Core changes
+- [Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb](https://github.com/yt-dlp/yt-dlp/commit/d7cd97e8d8d42b500fea9abb2aa4ac9b0f98b2ad) by [pukkandan](https://github.com/pukkandan)
+- [Improve `--download-sections`](https://github.com/yt-dlp/yt-dlp/commit/b4e0d75848e9447cee2cd3646ce54d4744a7ff56) by [pukkandan](https://github.com/pukkandan)
+    - Support negative time-ranges
+    - Add `*from-url` to obey time-ranges in URL
+- [Indicate `filesize` approximated from `tbr` better](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) by [pukkandan](https://github.com/pukkandan)
+
+#### Extractor changes
+- [Support multiple `_VALID_URL`s](https://github.com/yt-dlp/yt-dlp/commit/5fd8367496b42c7b900b896a0d5460561a2859de) ([#5812](https://github.com/yt-dlp/yt-dlp/issues/5812)) by [nixxo](https://github.com/nixxo)
+- **dplay**: GlobalCyclingNetworkPlus: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/774aa09dd6aa61ced9ec818d1f67e53414d22762) ([#7360](https://github.com/yt-dlp/yt-dlp/issues/7360)) by [bashonly](https://github.com/bashonly)
+- **dropout**: [Fix season extraction](https://github.com/yt-dlp/yt-dlp/commit/db22142f6f817ff673d417b4b78e8db497bf8ab3) ([#7304](https://github.com/yt-dlp/yt-dlp/issues/7304)) by [OverlordQ](https://github.com/OverlordQ)
+- **motherless**: [Add gallery support, fix groups](https://github.com/yt-dlp/yt-dlp/commit/f2ff0f6f1914b82d4a51681a72cc0828115dcb4a) ([#7211](https://github.com/yt-dlp/yt-dlp/issues/7211)) by [rexlambert22](https://github.com/rexlambert22), [Ti4eeT4e](https://github.com/Ti4eeT4e)
+- **nebula**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3f756c8c4095b942cf49788eb0862ceaf57847f2) ([#7156](https://github.com/yt-dlp/yt-dlp/issues/7156)) by [Lamieur](https://github.com/Lamieur), [rohieb](https://github.com/rohieb)
+- **rheinmaintv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/98cb1eda7a4cf67c96078980dbd63e6c06ad7f7c) ([#7311](https://github.com/yt-dlp/yt-dlp/issues/7311)) by [barthelmannk](https://github.com/barthelmannk)
+- **youtube**
+    - [Add `ios` to default clients used](https://github.com/yt-dlp/yt-dlp/commit/1e75d97db21152acc764b30a688e516f04b8a142) by [pukkandan](https://github.com/pukkandan) (see the sketch after this list)
+        - iOS is affected by neither 403 errors nor nsig, so it helps mitigate them preemptively
+        - iOS also has higher bit-rate 'premium' formats, though they are not labeled as such
+    - [Improve description parsing performance](https://github.com/yt-dlp/yt-dlp/commit/71dc18fa29263a1ff0472c23d81bfc8dd4422d48) ([#7315](https://github.com/yt-dlp/yt-dlp/issues/7315)) by [berkanteber](https://github.com/berkanteber), [pukkandan](https://github.com/pukkandan)
+    - [Improve nsig function name extraction](https://github.com/yt-dlp/yt-dlp/commit/cd810afe2ac5567c822b7424800fc470ef2d0045) by [pukkandan](https://github.com/pukkandan)
+    - [Workaround 403 for android formats](https://github.com/yt-dlp/yt-dlp/commit/81ca451480051d7ce1a31c017e005358345a9149) by [pukkandan](https://github.com/pukkandan)
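+
+As referenced in the `ios` entry above, client selection can also be forced explicitly through the documented `player_client` extractor-arg; a minimal sketch (the test URL is a placeholder):
+
+```python
+import yt_dlp
+
+opts = {
+    # Equivalent to: yt-dlp --extractor-args "youtube:player_client=ios"
+    "extractor_args": {"youtube": {"player_client": ["ios"]}},
+}
+with yt_dlp.YoutubeDL(opts) as ydl:
+    info = ydl.extract_info("https://www.youtube.com/watch?v=BaW_jenozKc", download=False)
+    for f in info["formats"]:
+        print(f["format_id"], f.get("tbr"))  # note the unlabeled higher-bitrate formats
+```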
+
+#### Misc. changes
+- [Revert "Add automatic duplicate issue detection"](https://github.com/yt-dlp/yt-dlp/commit/a4486bfc1dc7057efca9dd3fe70d7fa25c56f700) by [pukkandan](https://github.com/pukkandan)
+- **cleanup**
+    - Miscellaneous
+        - [7f9c6a6](https://github.com/yt-dlp/yt-dlp/commit/7f9c6a63b16e145495479e9f666f5b9e2ee69e2f) by [bashonly](https://github.com/bashonly)
+        - [812cdfa](https://github.com/yt-dlp/yt-dlp/commit/812cdfa06c33a40e73a8e04b3e6f42c084666a43) by [pukkandan](https://github.com/pukkandan)
+
+### 2023.06.21
+
+#### Important changes
+- YouTube: Improved throttling and signature fixes
+
+#### Core changes
+- [Add `--compat-option playlist-match-filter`](https://github.com/yt-dlp/yt-dlp/commit/93b39cdbd9dcf351bfa0c4ee252805b4617fdca9) by [pukkandan](https://github.com/pukkandan)
+- [Add `--no-quiet`](https://github.com/yt-dlp/yt-dlp/commit/d669772c65e8630162fd6555d0a578b246591921) by [pukkandan](https://github.com/pukkandan)
+- [Add option `--color`](https://github.com/yt-dlp/yt-dlp/commit/8417f26b8a819cd7ffcd4e000ca3e45033e670fb) ([#6904](https://github.com/yt-dlp/yt-dlp/issues/6904)) by [Grub4K](https://github.com/Grub4K)
+- [Add option `--netrc-cmd`](https://github.com/yt-dlp/yt-dlp/commit/db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb) ([#6682](https://github.com/yt-dlp/yt-dlp/issues/6682)) by [NDagestad](https://github.com/NDagestad), [pukkandan](https://github.com/pukkandan)
+- [Add option `--xff`](https://github.com/yt-dlp/yt-dlp/commit/c16644642b08e2bf4130a6c5fa01395d8718c990) by [pukkandan](https://github.com/pukkandan)
+- [Auto-select default format in `-f-`](https://github.com/yt-dlp/yt-dlp/commit/372a0f3b9dadd1e52234b498aa4c7040ef868c7d) ([#7101](https://github.com/yt-dlp/yt-dlp/issues/7101)) by [ivanskodje](https://github.com/ivanskodje), [pukkandan](https://github.com/pukkandan)
+- [Deprecate internal `Youtubedl-no-compression` header](https://github.com/yt-dlp/yt-dlp/commit/955c89584b66fcd0fcfab3e611f1edeb1ca63886) ([#6876](https://github.com/yt-dlp/yt-dlp/issues/6876)) by [coletdjnz](https://github.com/coletdjnz)
+- [Do not translate newlines in `--print-to-file`](https://github.com/yt-dlp/yt-dlp/commit/9874e82b5a61582169300bea561b3e8899ad1ef7) by [pukkandan](https://github.com/pukkandan)
+- [Ensure pre-processor errors do not block `--print`](https://github.com/yt-dlp/yt-dlp/commit/f005a35aa7e4f67a0c603a946c0dd714c151b2d6) by [pukkandan](https://github.com/pukkandan) (With fixes in [17ba434](https://github.com/yt-dlp/yt-dlp/commit/17ba4343cf99701692a7f4798fd42b50f644faba))
+- [Fix `filepath` being copied to underlying format dict](https://github.com/yt-dlp/yt-dlp/commit/84078a8b38f403495d00b46654c8750774d821de) by [pukkandan](https://github.com/pukkandan)
+- [Improve HTTP redirect handling](https://github.com/yt-dlp/yt-dlp/commit/08916a49c777cb6e000eec092881eb93ec22076c) ([#7094](https://github.com/yt-dlp/yt-dlp/issues/7094)) by [coletdjnz](https://github.com/coletdjnz)
+- [Populate `filename` and `urls` fields at all stages of `--print`](https://github.com/yt-dlp/yt-dlp/commit/170605840ea9d5ad75da6576485ea7d125b428ee) by [pukkandan](https://github.com/pukkandan) (With fixes in [b5f61b6](https://github.com/yt-dlp/yt-dlp/commit/b5f61b69d4561b81fc98c226b176f0c15493e688))
[b5f61b6](https://github.com/yt-dlp/yt-dlp/commit/b5f61b69d4561b81fc98c226b176f0c15493e688)) +- [Relaxed validation for numeric format filters](https://github.com/yt-dlp/yt-dlp/commit/c3f624ef0a5d7a6ae1c5ffeb243087e9fc7d79dc) by [pukkandan](https://github.com/pukkandan) +- [Support decoding multiple content encodings](https://github.com/yt-dlp/yt-dlp/commit/daafbf49b3482edae4d70dd37070be99742a926e) ([#7142](https://github.com/yt-dlp/yt-dlp/issues/7142)) by [coletdjnz](https://github.com/coletdjnz) +- [Support loading info.json with a list at its root](https://github.com/yt-dlp/yt-dlp/commit/ab1de9cb1e39cf421c2b7dc6756c6ff1955bb313) by [pukkandan](https://github.com/pukkandan) +- [Workaround erroneous urllib Windows proxy parsing](https://github.com/yt-dlp/yt-dlp/commit/3f66b6fe50f8d5b545712f8b19d5ae62f5373980) ([#7092](https://github.com/yt-dlp/yt-dlp/issues/7092)) by [coletdjnz](https://github.com/coletdjnz) +- **cookies** + - [Defer extraction of v11 key from keyring](https://github.com/yt-dlp/yt-dlp/commit/9b7a48abd1b187eae1e3f6c9839c47d43ccec00b) by [Grub4K](https://github.com/Grub4K) + - [Move `YoutubeDLCookieJar` to cookies module](https://github.com/yt-dlp/yt-dlp/commit/b87e01c123fd560b6a674ce00f45a9459d82d98a) ([#7091](https://github.com/yt-dlp/yt-dlp/issues/7091)) by [coletdjnz](https://github.com/coletdjnz) + - [Support custom Safari cookies path](https://github.com/yt-dlp/yt-dlp/commit/a58182b75a05fe0a10c5e94a536711d3ade19c20) ([#6783](https://github.com/yt-dlp/yt-dlp/issues/6783)) by [NextFire](https://github.com/NextFire) + - [Update for chromium changes](https://github.com/yt-dlp/yt-dlp/commit/b38d4c941d1993ab27e4c0f8e024e23c2ec0f8f8) ([#6897](https://github.com/yt-dlp/yt-dlp/issues/6897)) by [mbway](https://github.com/mbway) +- **Cryptodome**: [Fix `__bool__`](https://github.com/yt-dlp/yt-dlp/commit/98ac902c4979e4529b166e873473bef42baa2e3e) by [pukkandan](https://github.com/pukkandan) +- **jsinterp** + - [Do not compile regex](https://github.com/yt-dlp/yt-dlp/commit/7aeda6cc9e73ada0b0a0b6a6748c66bef63a20a8) by [pukkandan](https://github.com/pukkandan) + - [Fix division](https://github.com/yt-dlp/yt-dlp/commit/b4a252fba81f53631c07ca40ce7583f5d19a8a36) ([#7279](https://github.com/yt-dlp/yt-dlp/issues/7279)) by [bashonly](https://github.com/bashonly) + - [Fix global object extraction](https://github.com/yt-dlp/yt-dlp/commit/01aba2519a0884ef17d5f85608dbd2a455577147) by [pukkandan](https://github.com/pukkandan) + - [Handle `NaN` in bitwise operators](https://github.com/yt-dlp/yt-dlp/commit/1d7656184c6b8aa46b29149893894b3c24f1df00) by [pukkandan](https://github.com/pukkandan) + - [Handle negative numbers better](https://github.com/yt-dlp/yt-dlp/commit/7cf51f21916292cd80bdeceb37489f5322f166dd) by [pukkandan](https://github.com/pukkandan) +- **outtmpl** + - [Allow `\n` in replacements and default.](https://github.com/yt-dlp/yt-dlp/commit/78fde6e3398ff11e5d383a66b28664badeab5180) by [pukkandan](https://github.com/pukkandan) + - [Fix some minor bugs](https://github.com/yt-dlp/yt-dlp/commit/ebe1b4e34f43c3acad30e4bcb8484681a030c114) by [pukkandan](https://github.com/pukkandan) (With fixes in [1619ab3](https://github.com/yt-dlp/yt-dlp/commit/1619ab3e67d8dc4f86fc7ed292c79345bc0d91a0)) + - [Support `str.format` syntax inside replacements](https://github.com/yt-dlp/yt-dlp/commit/ec9311c41b111110bc52cfbd6ea682c6fb23f77a) by [pukkandan](https://github.com/pukkandan) +- **update** + - [Better error handling](https://github.com/yt-dlp/yt-dlp/commit/d2e84d5eb01c66fc5304e8566348d65a7be24ed7) by 
[pukkandan](https://github.com/pukkandan) + - [Do not restart into versions without `--update-to`](https://github.com/yt-dlp/yt-dlp/commit/02948a17d903f544363bb20b51a6d8baed7bba08) by [pukkandan](https://github.com/pukkandan) + - [Implement `--update-to` repo](https://github.com/yt-dlp/yt-dlp/commit/665472a7de3880578c0b7b3f95c71570c056368e) by [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- **upstream** + - [Merged with youtube-dl 07af47](https://github.com/yt-dlp/yt-dlp/commit/42f2d40b475db66486a4b4fe5b56751a640db5db) by [pukkandan](https://github.com/pukkandan) + - [Merged with youtube-dl d1c6c5](https://github.com/yt-dlp/yt-dlp/commit/4823ec9f461512daa1b8ab362893bb86a6320b26) by [pukkandan](https://github.com/pukkandan) (With fixes in [edbe5b5](https://github.com/yt-dlp/yt-dlp/commit/edbe5b589dd0860a67b4e03f58db3cd2539d91c2) by [bashonly](https://github.com/bashonly)) +- **utils** + - `FormatSorter`: [Improve `size` and `br`](https://github.com/yt-dlp/yt-dlp/commit/eedda5252c05327748dede204a8fccafa0288118) by [pukkandan](https://github.com/pukkandan), [u-spec-png](https://github.com/u-spec-png) + - `js_to_json`: [Implement template strings](https://github.com/yt-dlp/yt-dlp/commit/0898c5c8ccadfc404472456a7a7751b72afebadd) ([#6623](https://github.com/yt-dlp/yt-dlp/issues/6623)) by [Grub4K](https://github.com/Grub4K) + - `locked_file`: [Fix for virtiofs](https://github.com/yt-dlp/yt-dlp/commit/45998b3e371b819ce0dbe50da703809a048cc2fe) ([#6840](https://github.com/yt-dlp/yt-dlp/issues/6840)) by [brandon-dacrib](https://github.com/brandon-dacrib) + - `strftime_or_none`: [Handle negative timestamps](https://github.com/yt-dlp/yt-dlp/commit/a35af4306d24c56c6358f89cdf204860d1cd62b4) by [dirkf](https://github.com/dirkf), [pukkandan](https://github.com/pukkandan) + - `traverse_obj` + - [Allow iterables in traversal](https://github.com/yt-dlp/yt-dlp/commit/21b5ec86c2c37d10c5bb97edd7051d3aac16bb3e) ([#6902](https://github.com/yt-dlp/yt-dlp/issues/6902)) by [Grub4K](https://github.com/Grub4K) + - [More fixes](https://github.com/yt-dlp/yt-dlp/commit/b079c26f0af8085bccdadc72c61c8164ca5ab0f8) ([#6959](https://github.com/yt-dlp/yt-dlp/issues/6959)) by [Grub4K](https://github.com/Grub4K) + - `write_string`: [Fix noconsole behavior](https://github.com/yt-dlp/yt-dlp/commit/3b479100df02e20dd949e046003ae96ddbfced57) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Do not exit early for unsuitable `url_result`](https://github.com/yt-dlp/yt-dlp/commit/baa922b5c74b10e3b86ff5e6cf6529b3aae8efab) by [pukkandan](https://github.com/pukkandan) +- [Do not warn for invalid chapter data in description](https://github.com/yt-dlp/yt-dlp/commit/84ffeb7d5e72e3829319ba7720a8480fc4c7503b) by [pukkandan](https://github.com/pukkandan) +- [Extract more metadata from ISM](https://github.com/yt-dlp/yt-dlp/commit/f68434cc74cfd3db01b266476a2eac8329fbb267) by [pukkandan](https://github.com/pukkandan) +- **abematv**: [Add fallback for title and description extraction and extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/c449c0655d7c8549e6e1389c26b628053b253d39) ([#6994](https://github.com/yt-dlp/yt-dlp/issues/6994)) by [Lesmiscore](https://github.com/Lesmiscore) +- **acast**: [Support embeds](https://github.com/yt-dlp/yt-dlp/commit/c91ac833ea99b00506e470a44cf930e4e23378c9) ([#7212](https://github.com/yt-dlp/yt-dlp/issues/7212)) by [pabs3](https://github.com/pabs3) +- **adobepass**: [Handle `Charter_Direct` MSO as 
`Spectrum`](https://github.com/yt-dlp/yt-dlp/commit/ea0570820336a0fe9c3b530d1b0d1e59313274f4) ([#6824](https://github.com/yt-dlp/yt-dlp/issues/6824)) by [bashonly](https://github.com/bashonly) +- **aeonco**: [Support Youtube embeds](https://github.com/yt-dlp/yt-dlp/commit/ed81b74802b4247ee8d9dc0ef87eb52baefede1c) ([#6591](https://github.com/yt-dlp/yt-dlp/issues/6591)) by [alexklapheke](https://github.com/alexklapheke) +- **afreecatv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fdd69db38924c38194ef236b26325d66ac815c88) ([#6283](https://github.com/yt-dlp/yt-dlp/issues/6283)) by [blmarket](https://github.com/blmarket) +- **ARDBetaMediathek**: [Add thumbnail](https://github.com/yt-dlp/yt-dlp/commit/f78eb41e1c0f1dcdb10317358a26bf541dc7ee15) ([#6890](https://github.com/yt-dlp/yt-dlp/issues/6890)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **bibeltv**: [Fix extraction, support live streams and series](https://github.com/yt-dlp/yt-dlp/commit/4ad58667c102bd82a7c4cca8aa395ec1682e3b4c) ([#6505](https://github.com/yt-dlp/yt-dlp/issues/6505)) by [flashdagger](https://github.com/flashdagger) +- **bilibili** + - [Support festival videos](https://github.com/yt-dlp/yt-dlp/commit/ab29e47029e2f5b48abbbab78e82faf7cf6e9506) ([#6547](https://github.com/yt-dlp/yt-dlp/issues/6547)) by [qbnu](https://github.com/qbnu) + - SpaceVideo: [Extract signature](https://github.com/yt-dlp/yt-dlp/commit/6f10cdcf7eeaeae5b75e0a4428cd649c156a2d83) ([#7149](https://github.com/yt-dlp/yt-dlp/issues/7149)) by [elyse0](https://github.com/elyse0) +- **biliIntl**: [Add comment extraction](https://github.com/yt-dlp/yt-dlp/commit/b093c38cc9f26b59a8504211d792f053142c847d) ([#6079](https://github.com/yt-dlp/yt-dlp/issues/6079)) by [HobbyistDev](https://github.com/HobbyistDev) +- **bitchute**: [Add more fallback subdomains](https://github.com/yt-dlp/yt-dlp/commit/0c4e0fbcade0fc92d14c2a6d63e360fe067f6192) ([#6907](https://github.com/yt-dlp/yt-dlp/issues/6907)) by [Neurognostic](https://github.com/Neurognostic) +- **booyah**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/f7f7a877bf8e87fd4eb0ad2494ad948ca7691114) by [pukkandan](https://github.com/pukkandan) +- **BrainPOP**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/979568f26ece80bca72b48f0dd57d676e431059a) ([#6106](https://github.com/yt-dlp/yt-dlp/issues/6106)) by [MinePlayersPE](https://github.com/MinePlayersPE) +- **bravotv** + - [Detect DRM](https://github.com/yt-dlp/yt-dlp/commit/1fe5bf240e6ade487d18079a62aa36bcc440a27a) ([#7171](https://github.com/yt-dlp/yt-dlp/issues/7171)) by [bashonly](https://github.com/bashonly) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/06966cb8966b9aa4f60ab9c44c182a057d4ca3a3) ([#6568](https://github.com/yt-dlp/yt-dlp/issues/6568)) by [bashonly](https://github.com/bashonly) +- **camfm**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/4cbfa570a1b9bd65b0f48770693377e8d842dcb0) ([#7083](https://github.com/yt-dlp/yt-dlp/issues/7083)) by [garret1317](https://github.com/garret1317) +- **cbc** + - [Fix live extractor, playlist `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/7a7b1376fbce0067cf37566bb47131bc0022638d) ([#6625](https://github.com/yt-dlp/yt-dlp/issues/6625)) by [makew0rld](https://github.com/makew0rld) + - [Ignore 426 from API](https://github.com/yt-dlp/yt-dlp/commit/4afb208cf07b59291ae3b0c4efc83945ee5b8812) ([#6781](https://github.com/yt-dlp/yt-dlp/issues/6781)) by [jo-nike](https://github.com/jo-nike) + - gem: [Update 
`_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/871c907454693940cb56906ed9ea49fcb7154829) ([#6499](https://github.com/yt-dlp/yt-dlp/issues/6499)) by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +- **cbs**: [Add `ParamountPressExpress` extractor](https://github.com/yt-dlp/yt-dlp/commit/44369c9afa996e14e9f466754481d878811b5b4a) ([#6604](https://github.com/yt-dlp/yt-dlp/issues/6604)) by [bashonly](https://github.com/bashonly) +- **cbsnews**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/f6e43d6fa9804c24525e1fed0a87782754dab7ed) ([#6681](https://github.com/yt-dlp/yt-dlp/issues/6681)) by [bashonly](https://github.com/bashonly) +- **chilloutzone**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6f4fc5660f40f3458882a8f51601eae4af7be609) ([#6445](https://github.com/yt-dlp/yt-dlp/issues/6445)) by [bashonly](https://github.com/bashonly) +- **clipchamp**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2f07c4c1da4361af213e5791279b9d152d2e4ce3) ([#6978](https://github.com/yt-dlp/yt-dlp/issues/6978)) by [bashonly](https://github.com/bashonly) +- **comedycentral**: [Add support for movies](https://github.com/yt-dlp/yt-dlp/commit/66468bbf49562ff82670cbbd456c5e8448a6df34) ([#7108](https://github.com/yt-dlp/yt-dlp/issues/7108)) by [sqrtNOT](https://github.com/sqrtNOT) +- **crtvg**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/26c517b29c8727e47948d6fff749d5297f0efb60) ([#7168](https://github.com/yt-dlp/yt-dlp/issues/7168)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **crunchyroll**: [Rework with support for movies, music and artists](https://github.com/yt-dlp/yt-dlp/commit/032de83ea9ff2f4977d9c71a93bbc1775597b762) ([#6237](https://github.com/yt-dlp/yt-dlp/issues/6237)) by [Grub4K](https://github.com/Grub4K) +- **dacast**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/c25cac2f8e5fbac2737a426d7778fd2f0efc5381) ([#6896](https://github.com/yt-dlp/yt-dlp/issues/6896)) by [bashonly](https://github.com/bashonly) +- **daftsex**: [Update domain and embed player url](https://github.com/yt-dlp/yt-dlp/commit/fc5a7f9b27d2a89b1f3ca7d33a95301c21d832cd) ([#5966](https://github.com/yt-dlp/yt-dlp/issues/5966)) by [JChris246](https://github.com/JChris246) +- **DigitalConcertHall**: [Support films](https://github.com/yt-dlp/yt-dlp/commit/55ed4ff73487feb3177b037dfc2ea527e777da3e) ([#7202](https://github.com/yt-dlp/yt-dlp/issues/7202)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **discogs**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6daaf21092888beff11b807cd46f832f1f9c46a0) ([#6624](https://github.com/yt-dlp/yt-dlp/issues/6624)) by [rjy](https://github.com/rjy) +- **dlf**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b423b6a48e0b19260bc95ab7d72d2138d7f124dc) ([#6697](https://github.com/yt-dlp/yt-dlp/issues/6697)) by [nick-cd](https://github.com/nick-cd) +- **drtv**: [Fix radio page extraction](https://github.com/yt-dlp/yt-dlp/commit/9a06b7b1891b48cebbe275652ae8025a36d97d97) ([#6552](https://github.com/yt-dlp/yt-dlp/issues/6552)) by [viktor-enzell](https://github.com/viktor-enzell) +- **Dumpert**: [Fix m3u8 and support new URL pattern](https://github.com/yt-dlp/yt-dlp/commit/f8ae441501596733e2b967430471643a1d7cacb8) ([#6091](https://github.com/yt-dlp/yt-dlp/issues/6091)) by [DataGhost](https://github.com/DataGhost), [pukkandan](https://github.com/pukkandan) +- **elevensports**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ecfe47973f6603b5367fe2cc3c65274627d94516) 
([#7172](https://github.com/yt-dlp/yt-dlp/issues/7172)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **ettutv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/83465fc4100a2fb2c188898fbc2f3021f6a9b4dd) ([#6579](https://github.com/yt-dlp/yt-dlp/issues/6579)) by [elyse0](https://github.com/elyse0) +- **europarl**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/03789976d301eaed3e957dbc041573098f6af059) ([#7114](https://github.com/yt-dlp/yt-dlp/issues/7114)) by [HobbyistDev](https://github.com/HobbyistDev) +- **eurosport**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/45e87ea106ad37b2a002663fa30ee41ce97b16cd) ([#7076](https://github.com/yt-dlp/yt-dlp/issues/7076)) by [HobbyistDev](https://github.com/HobbyistDev) +- **facebook**: [Fix metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/3b52a606881e6adadc33444abdeacce562b79330) ([#6856](https://github.com/yt-dlp/yt-dlp/issues/6856)) by [ringus1](https://github.com/ringus1) +- **foxnews**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/97d60ad8cd6c99f01e463a9acfce8693aff2a609) ([#7222](https://github.com/yt-dlp/yt-dlp/issues/7222)) by [bashonly](https://github.com/bashonly) +- **funker530**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/cab94a0cd8b6d3fffed5a6faff030274adbed182) ([#7291](https://github.com/yt-dlp/yt-dlp/issues/7291)) by [Cyberes](https://github.com/Cyberes) +- **generic** + - [Accept values for `fragment_query`, `variant_query`](https://github.com/yt-dlp/yt-dlp/commit/5cc0a8fd2e9fec50026fb92170b57993af939e4a) ([#6600](https://github.com/yt-dlp/yt-dlp/issues/6600)) by [bashonly](https://github.com/bashonly) (With fixes in [9bfe0d1](https://github.com/yt-dlp/yt-dlp/commit/9bfe0d15bd7dbdc6b0e6378fa9f5e2e289b2373b)) + - [Add extractor-args `hls_key`, `variant_query`](https://github.com/yt-dlp/yt-dlp/commit/c2e0fc40a73dd85ab3920f977f579d475e66ef59) ([#6567](https://github.com/yt-dlp/yt-dlp/issues/6567)) by [bashonly](https://github.com/bashonly) + - [Attempt to detect live HLS](https://github.com/yt-dlp/yt-dlp/commit/93e7c6995e07dafb9dcc06c0d06acf6c5bdfecc5) ([#6775](https://github.com/yt-dlp/yt-dlp/issues/6775)) by [bashonly](https://github.com/bashonly) +- **genius**: [Add support for articles](https://github.com/yt-dlp/yt-dlp/commit/460da07439718d9af1e3661da2a23e05a913a2e6) ([#6474](https://github.com/yt-dlp/yt-dlp/issues/6474)) by [bashonly](https://github.com/bashonly) +- **globalplayer**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/30647668a92a0ca5cd108776804baac0996bd9f7) ([#6903](https://github.com/yt-dlp/yt-dlp/issues/6903)) by [garret1317](https://github.com/garret1317) +- **gmanetwork**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2d97d154fe4fb84fe2ed3a4e1ed5819e89b71e88) ([#5945](https://github.com/yt-dlp/yt-dlp/issues/5945)) by [HobbyistDev](https://github.com/HobbyistDev) +- **gronkh**: [Extract duration and chapters](https://github.com/yt-dlp/yt-dlp/commit/9c92b803fa24e48543ce969468d5404376e315b7) ([#6817](https://github.com/yt-dlp/yt-dlp/issues/6817)) by [satan1st](https://github.com/satan1st) +- **hentaistigma**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/04f8018a0544736a18494bc3899d06b05b78fae6) by [pukkandan](https://github.com/pukkandan) +- **hidive**: [Fix login](https://github.com/yt-dlp/yt-dlp/commit/e6ab678e36c40ded0aae305bbb866cdab554d417) by [pukkandan](https://github.com/pukkandan) +- **hollywoodreporter**: [Add 
extractors](https://github.com/yt-dlp/yt-dlp/commit/6bdb64e2a2a6d504d8ce1dc830fbfb8a7f199c63) ([#6614](https://github.com/yt-dlp/yt-dlp/issues/6614)) by [bashonly](https://github.com/bashonly) +- **hotstar**: [Support `/shows/` URLs](https://github.com/yt-dlp/yt-dlp/commit/7f8ddebbb51c9fd4a347306332a718ba41b371b8) ([#7225](https://github.com/yt-dlp/yt-dlp/issues/7225)) by [bashonly](https://github.com/bashonly) +- **hrefli**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/7e35526d5b970a034b9d76215ee3e4bd7631edcd) ([#6762](https://github.com/yt-dlp/yt-dlp/issues/6762)) by [selfisekai](https://github.com/selfisekai) +- **idolplus**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5c14b213679ed4401288bdc86ae696932e219222) ([#6732](https://github.com/yt-dlp/yt-dlp/issues/6732)) by [ping](https://github.com/ping) +- **iq**: [Set more language codes](https://github.com/yt-dlp/yt-dlp/commit/2d5cae9636714ff922d28c548c349d5f2b48f317) ([#6476](https://github.com/yt-dlp/yt-dlp/issues/6476)) by [D0LLYNH0](https://github.com/D0LLYNH0) +- **iwara** + - [Accept old URLs](https://github.com/yt-dlp/yt-dlp/commit/ab92d8651c48d247dfb7d3f0a824cc986e47c7ed) by [Lesmiscore](https://github.com/Lesmiscore) + - [Fix authentication](https://github.com/yt-dlp/yt-dlp/commit/0a5d7c39e17bb9bd50c9db42bcad40eb82d7f784) ([#7137](https://github.com/yt-dlp/yt-dlp/issues/7137)) by [toomyzoom](https://github.com/toomyzoom) + - [Fix format sorting](https://github.com/yt-dlp/yt-dlp/commit/56793f74c36899742d7abd52afb0deca97d469e1) ([#6651](https://github.com/yt-dlp/yt-dlp/issues/6651)) by [hasezoey](https://github.com/hasezoey) + - [Fix typo](https://github.com/yt-dlp/yt-dlp/commit/d1483ec693c79f0b4ddf493870bcb840aca4da08) by [Lesmiscore](https://github.com/Lesmiscore) + - [Implement login](https://github.com/yt-dlp/yt-dlp/commit/21b9413cf7dd4830b2ece57af21589dd4538fc52) ([#6721](https://github.com/yt-dlp/yt-dlp/issues/6721)) by [toomyzoom](https://github.com/toomyzoom) + - [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/c14af7a741931b364bab3d9546c0f4359f318f8c) ([#6557](https://github.com/yt-dlp/yt-dlp/issues/6557)) by [Lesmiscore](https://github.com/Lesmiscore) + - [Report private videos](https://github.com/yt-dlp/yt-dlp/commit/95a383be1b6fb00c92ee3fb091732c4f6009acb6) ([#6641](https://github.com/yt-dlp/yt-dlp/issues/6641)) by [Lesmiscore](https://github.com/Lesmiscore) +- **JStream**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3459d3c5af3b2572ed51e8ecfda6c11022a838c6) ([#6252](https://github.com/yt-dlp/yt-dlp/issues/6252)) by [Lesmiscore](https://github.com/Lesmiscore) +- **jwplatform**: [Update `_extract_embed_urls`](https://github.com/yt-dlp/yt-dlp/commit/cf9fd52fabe71d6e7c30d3ea525029ffa561fc9c) ([#6383](https://github.com/yt-dlp/yt-dlp/issues/6383)) by [carusocr](https://github.com/carusocr) +- **kick**: [Make initial request non-fatal](https://github.com/yt-dlp/yt-dlp/commit/0a6918a4a1431960181d8c50e0bbbcb0afbaff9a) by [bashonly](https://github.com/bashonly) +- **LastFM**: [Rewrite playlist extraction](https://github.com/yt-dlp/yt-dlp/commit/026435714cb7c39613a0d7d2acd15d3823b78d94) ([#6379](https://github.com/yt-dlp/yt-dlp/issues/6379)) by [hatienl0i261299](https://github.com/hatienl0i261299), [pukkandan](https://github.com/pukkandan) +- **lbry**: [Extract original quality formats](https://github.com/yt-dlp/yt-dlp/commit/44c0d66442b568d9e1359e669d8b029b08a77fa7) ([#7257](https://github.com/yt-dlp/yt-dlp/issues/7257)) by [bashonly](https://github.com/bashonly) +- **line**: 
[Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/faa0332ed69e070cf3bd31390589a596e962f392) ([#6734](https://github.com/yt-dlp/yt-dlp/issues/6734)) by [sian1468](https://github.com/sian1468) +- **livestream**: [Support videos with account id](https://github.com/yt-dlp/yt-dlp/commit/bfdf144c7e5d7a93fbfa9d8e65598c72bf2b542a) ([#6324](https://github.com/yt-dlp/yt-dlp/issues/6324)) by [theperfectpunk](https://github.com/theperfectpunk) +- **medaltv**: [Fix clips](https://github.com/yt-dlp/yt-dlp/commit/1e3c2b6ec28d7ab5e31341fa93c47b65be4fbff4) ([#6502](https://github.com/yt-dlp/yt-dlp/issues/6502)) by [xenova](https://github.com/xenova) +- **mediastream**: [Improve `WinSports` and embed extraction](https://github.com/yt-dlp/yt-dlp/commit/03025b6e105139d01cd415ddc51fd692957fd2ba) ([#6426](https://github.com/yt-dlp/yt-dlp/issues/6426)) by [bashonly](https://github.com/bashonly) +- **mgtv**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/59d9fe08312bbb76ee26238d207a8ca35410a48d) ([#7234](https://github.com/yt-dlp/yt-dlp/issues/7234)) by [bashonly](https://github.com/bashonly) +- **Mzaalo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/dc3c44f349ba85af320e706e2a27ad81a78b1c6e) ([#7163](https://github.com/yt-dlp/yt-dlp/issues/7163)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **nbc**: [Fix `NBCStations` direct mp4 formats](https://github.com/yt-dlp/yt-dlp/commit/9be0fe1fd967f62cbf3c60bd14e1021a70abc147) ([#6637](https://github.com/yt-dlp/yt-dlp/issues/6637)) by [bashonly](https://github.com/bashonly) +- **nebula**: [Add `beta.nebula.tv`](https://github.com/yt-dlp/yt-dlp/commit/cbfe2e5cbe0f4649a91e323a82b8f5f774f36662) ([#6516](https://github.com/yt-dlp/yt-dlp/issues/6516)) by [unbeatable-101](https://github.com/unbeatable-101) +- **nekohacker**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/489f51279d00318018478fd7461eddbe3b45297e) ([#7003](https://github.com/yt-dlp/yt-dlp/issues/7003)) by [hasezoey](https://github.com/hasezoey) +- **nhk** + - [Add `NhkRadiru` extractor](https://github.com/yt-dlp/yt-dlp/commit/8f0be90ecb3b8d862397177bb226f17b245ef933) ([#6819](https://github.com/yt-dlp/yt-dlp/issues/6819)) by [garret1317](https://github.com/garret1317) + - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/f41b949a2ef646fbc36375febbe3f0c19d742c0f) ([#7180](https://github.com/yt-dlp/yt-dlp/issues/7180)) by [menschel](https://github.com/menschel), [sjthespian](https://github.com/sjthespian) + - `NhkRadiruLive`: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/81c8b9bdd9841b72cbfc1bbff9dab5fb4aa038b0) ([#7332](https://github.com/yt-dlp/yt-dlp/issues/7332)) by [garret1317](https://github.com/garret1317) +- **niconico** + - [Download comments from the new endpoint](https://github.com/yt-dlp/yt-dlp/commit/52ecc33e221f7de7eb6fed6c22489f0c5fdd2c6d) ([#6773](https://github.com/yt-dlp/yt-dlp/issues/6773)) by [Lesmiscore](https://github.com/Lesmiscore) + - live: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f8f9250fe280d37f0988646cd5cc0072f4d33a6d) ([#5764](https://github.com/yt-dlp/yt-dlp/issues/5764)) by [Lesmiscore](https://github.com/Lesmiscore) + - series: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/c86e433c35fe5da6cb29f3539eef97497f84ed38) ([#6898](https://github.com/yt-dlp/yt-dlp/issues/6898)) by [sqrtNOT](https://github.com/sqrtNOT) +- **nubilesporn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/d4e6ef40772e0560a8ed33b844ef7549e86837be) ([#6231](https://github.com/yt-dlp/yt-dlp/issues/6231)) by 
[permunkle](https://github.com/permunkle) +- **odnoklassniki**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/1a2eb5bda51d8b7a78a65acebf72a0dcf9da196b) ([#7217](https://github.com/yt-dlp/yt-dlp/issues/7217)) by [bashonly](https://github.com/bashonly) +- **opencast** + - [Add ltitools to `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3588be59cee429a0ab5c4ceb2f162298bb44147d) ([#6371](https://github.com/yt-dlp/yt-dlp/issues/6371)) by [C0D3D3V](https://github.com/C0D3D3V) + - [Fix format bug](https://github.com/yt-dlp/yt-dlp/commit/89dbf0848370deaa55af88c3593a2a264124caf5) ([#6512](https://github.com/yt-dlp/yt-dlp/issues/6512)) by [C0D3D3V](https://github.com/C0D3D3V) +- **owncloud**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c6d4b82a8b8bce59b1c9ce5e6d349ea428dac0a7) ([#6533](https://github.com/yt-dlp/yt-dlp/issues/6533)) by [C0D3D3V](https://github.com/C0D3D3V) +- **Parler**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/80ea6d3dea8483cddd39fc89b5ee1fc06670c33c) ([#6446](https://github.com/yt-dlp/yt-dlp/issues/6446)) by [JChris246](https://github.com/JChris246) +- **pgatour**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3ae182ad89e1427ff7b1684d6a44ff93fa857a0c) ([#6613](https://github.com/yt-dlp/yt-dlp/issues/6613)) by [bashonly](https://github.com/bashonly) +- **playsuisse**: [Support new url format](https://github.com/yt-dlp/yt-dlp/commit/94627c5dde12a72766bdba36e056916c29c40ed1) ([#6528](https://github.com/yt-dlp/yt-dlp/issues/6528)) by [sbor23](https://github.com/sbor23) +- **polskieradio**: [Improve extractors](https://github.com/yt-dlp/yt-dlp/commit/738c90a463257634455ada3e5c18b714c531dede) ([#5948](https://github.com/yt-dlp/yt-dlp/issues/5948)) by [selfisekai](https://github.com/selfisekai) +- **pornez**: [Support new URL formats](https://github.com/yt-dlp/yt-dlp/commit/cbdf9408e6f1e35e98fd6477b3d6902df5b8a47f) ([#6792](https://github.com/yt-dlp/yt-dlp/issues/6792)) by [zhgwn](https://github.com/zhgwn) +- **pornhub**: [Set access cookies to fix extraction](https://github.com/yt-dlp/yt-dlp/commit/62beefa818c75c20b6941389bb197051554a5d41) ([#6685](https://github.com/yt-dlp/yt-dlp/issues/6685)) by [arobase-che](https://github.com/arobase-che), [Schmoaaaaah](https://github.com/Schmoaaaaah) +- **rai**: [Rewrite extractors](https://github.com/yt-dlp/yt-dlp/commit/c6d3f81a4077aaf9cffc6aa2d0dec92f38e74bb0) ([#5940](https://github.com/yt-dlp/yt-dlp/issues/5940)) by [danog](https://github.com/danog), [nixxo](https://github.com/nixxo) +- **recurbate**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c2502cfed91415c7ccfff925fd3404d230046484) ([#6297](https://github.com/yt-dlp/yt-dlp/issues/6297)) by [mrscrapy](https://github.com/mrscrapy) +- **reddit** + - [Add login support](https://github.com/yt-dlp/yt-dlp/commit/4d9280c9c853733534dda60486fa949bcca36c9e) ([#6950](https://github.com/yt-dlp/yt-dlp/issues/6950)) by [bashonly](https://github.com/bashonly) + - [Support cookies and short URLs](https://github.com/yt-dlp/yt-dlp/commit/7a6f6f24592a8065376f11a58e44878807732cf6) ([#6825](https://github.com/yt-dlp/yt-dlp/issues/6825)) by [bashonly](https://github.com/bashonly) +- **rokfin**: [Re-construct manifest url](https://github.com/yt-dlp/yt-dlp/commit/7a6c8a0807941dd24fbf0d6172e811884f98e027) ([#6507](https://github.com/yt-dlp/yt-dlp/issues/6507)) by [vampirefrog](https://github.com/vampirefrog) +- **rottentomatoes**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2d306c03d6f2697fcbabb7da35aa62cc078359d3) 
([#6844](https://github.com/yt-dlp/yt-dlp/issues/6844)) by [JChris246](https://github.com/JChris246) +- **rozhlas** + - [Extract manifest formats](https://github.com/yt-dlp/yt-dlp/commit/e4cf7741f9302b3faa092962f2895b55cb3d89bb) ([#6590](https://github.com/yt-dlp/yt-dlp/issues/6590)) by [bashonly](https://github.com/bashonly) + - `MujRozhlas`: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/c2b801fea59628d5c873e06a0727fbf2051bbd1f) ([#7129](https://github.com/yt-dlp/yt-dlp/issues/7129)) by [stanoarn](https://github.com/stanoarn) +- **rtvc**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/9b30cd3dfce83c2f0201b28a7a3ef44ab9722664) ([#6578](https://github.com/yt-dlp/yt-dlp/issues/6578)) by [elyse0](https://github.com/elyse0) +- **rumble** + - [Detect timeline format](https://github.com/yt-dlp/yt-dlp/commit/78bc1868ff3352108ab2911033d1ac67a55f151e) by [pukkandan](https://github.com/pukkandan) + - [Fix videos without quality selection](https://github.com/yt-dlp/yt-dlp/commit/6994afc030d2a786d8032075ed71a14d7eac5a4f) by [pukkandan](https://github.com/pukkandan) +- **sbs**: [Overhaul extractor for new API](https://github.com/yt-dlp/yt-dlp/commit/6a765f135ccb654861336ea27a2c1c24ea8e286f) ([#6839](https://github.com/yt-dlp/yt-dlp/issues/6839)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf), [vidiot720](https://github.com/vidiot720) +- **shemaroome**: [Pass `stream_key` header to downloader](https://github.com/yt-dlp/yt-dlp/commit/7bc92517463f5766e9d9b92c3823b5cf403c0e3d) ([#7224](https://github.com/yt-dlp/yt-dlp/issues/7224)) by [bashonly](https://github.com/bashonly) +- **sonyliv**: [Fix login with token](https://github.com/yt-dlp/yt-dlp/commit/4815d35c191e7d375b94492a6486dd2ba43a8954) ([#7223](https://github.com/yt-dlp/yt-dlp/issues/7223)) by [bashonly](https://github.com/bashonly) +- **stageplus**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e5265dc6517478e589ee3c1ff0cb19bdf4e35ce1) ([#6838](https://github.com/yt-dlp/yt-dlp/issues/6838)) by [bashonly](https://github.com/bashonly) +- **stripchat**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f9213f8a2d7ba46b912afe1dd3ce6bb700a33d72) ([#7306](https://github.com/yt-dlp/yt-dlp/issues/7306)) by [foreignBlade](https://github.com/foreignBlade) +- **substack**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/12037d8b0a578fcc78a5c8f98964e48ee6060e25) ([#7218](https://github.com/yt-dlp/yt-dlp/issues/7218)) by [bashonly](https://github.com/bashonly) +- **sverigesradio**: [Support slug URLs](https://github.com/yt-dlp/yt-dlp/commit/5ee9a7d6e18ceea956e831994cf11c423979354f) ([#7220](https://github.com/yt-dlp/yt-dlp/issues/7220)) by [bashonly](https://github.com/bashonly) +- **tagesschau**: [Fix single audio urls](https://github.com/yt-dlp/yt-dlp/commit/af7585c824a1e405bd8afa46d87b4be322edc93c) ([#6626](https://github.com/yt-dlp/yt-dlp/issues/6626)) by [flashdagger](https://github.com/flashdagger) +- **teamcoco**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c459d45dd4d417fb80a52e1a04e607776a44baa4) ([#6437](https://github.com/yt-dlp/yt-dlp/issues/6437)) by [bashonly](https://github.com/bashonly) +- **telecaribe**: [Expand livestream support](https://github.com/yt-dlp/yt-dlp/commit/69b2f838d3d3e37dc17367ef64d978db1bea45cf) ([#6601](https://github.com/yt-dlp/yt-dlp/issues/6601)) by [bashonly](https://github.com/bashonly) +- **tencent**: [Fix fatal metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/971d901d129403e875a04dd92109507a03fbc070) 
([#7219](https://github.com/yt-dlp/yt-dlp/issues/7219)) by [bashonly](https://github.com/bashonly) +- **thesun**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/0181b9a1b31db3fde943f7cd3fe9662f23bff292) ([#6522](https://github.com/yt-dlp/yt-dlp/issues/6522)) by [hatienl0i261299](https://github.com/hatienl0i261299) +- **tiktok** + - [Extract 1080p adaptive formats](https://github.com/yt-dlp/yt-dlp/commit/c2a1bdb00931969193f2a31ea27b9c66a07aaec2) ([#7228](https://github.com/yt-dlp/yt-dlp/issues/7228)) by [bashonly](https://github.com/bashonly) + - [Fix and improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/925936908a3c3ee0e508621db14696b9f6a8b563) ([#6777](https://github.com/yt-dlp/yt-dlp/issues/6777)) by [bashonly](https://github.com/bashonly) + - [Fix mp3 formats](https://github.com/yt-dlp/yt-dlp/commit/8ceb07e870424c219dced8f4348729553f05c5cc) ([#6615](https://github.com/yt-dlp/yt-dlp/issues/6615)) by [bashonly](https://github.com/bashonly) + - [Fix resolution extraction](https://github.com/yt-dlp/yt-dlp/commit/ab6057ec80aa75db6303b8206916d00c376c622c) ([#7237](https://github.com/yt-dlp/yt-dlp/issues/7237)) by [puc9](https://github.com/puc9) + - [Improve `TikTokLive` extractor](https://github.com/yt-dlp/yt-dlp/commit/216bcb66d7dce0762767d751dad10650cb57da9d) ([#6520](https://github.com/yt-dlp/yt-dlp/issues/6520)) by [bashonly](https://github.com/bashonly) +- **triller**: [Support short URLs, detect removed videos](https://github.com/yt-dlp/yt-dlp/commit/33b737bedf8383c0d00d4e1d06a5273dcdfdb756) ([#6636](https://github.com/yt-dlp/yt-dlp/issues/6636)) by [bashonly](https://github.com/bashonly) +- **tv4**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/125ffaa1737dd04716f2f6fbb0595ad3eb7a4b1c) ([#5649](https://github.com/yt-dlp/yt-dlp/issues/5649)) by [dirkf](https://github.com/dirkf), [TxI5](https://github.com/TxI5) +- **tvp**: [Use new API](https://github.com/yt-dlp/yt-dlp/commit/0c7ce146e4d2a84e656d78f6857952bfd25ab389) ([#6989](https://github.com/yt-dlp/yt-dlp/issues/6989)) by [selfisekai](https://github.com/selfisekai) +- **tvplay**: [Remove outdated domains](https://github.com/yt-dlp/yt-dlp/commit/937264419f9bf375d5656785ae6e53282587c15d) ([#7106](https://github.com/yt-dlp/yt-dlp/issues/7106)) by [ivanskodje](https://github.com/ivanskodje) +- **twitch** + - [Extract original size thumbnail](https://github.com/yt-dlp/yt-dlp/commit/80b732b7a9585b2a61e456dc0d2d014a439cbaee) ([#6629](https://github.com/yt-dlp/yt-dlp/issues/6629)) by [JC-Chung](https://github.com/JC-Chung) + - [Fix `is_live`](https://github.com/yt-dlp/yt-dlp/commit/0551511b45f7847f40e4314aa9e624e80d086539) ([#6500](https://github.com/yt-dlp/yt-dlp/issues/6500)) by [elyse0](https://github.com/elyse0) + - [Support mobile clips](https://github.com/yt-dlp/yt-dlp/commit/02312c03cf53eb1da24c9ad022ee79af26060733) ([#6699](https://github.com/yt-dlp/yt-dlp/issues/6699)) by [bepvte](https://github.com/bepvte) + - [Update `_CLIENT_ID` and add extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/01231feb142e80828985aabdec04ac608e3d43e2) ([#7200](https://github.com/yt-dlp/yt-dlp/issues/7200)) by [bashonly](https://github.com/bashonly) + - vod: [Support links from schedule tab](https://github.com/yt-dlp/yt-dlp/commit/dbce5afa6bb61f6272ade613f2e9a3d66b88c7ea) ([#7071](https://github.com/yt-dlp/yt-dlp/issues/7071)) by [falbrechtskirchinger](https://github.com/falbrechtskirchinger) +- **twitter** + - [Add login 
support](https://github.com/yt-dlp/yt-dlp/commit/d1795f4a6af99c976c9d3ea2dabe5cf4f8965d3c) ([#7258](https://github.com/yt-dlp/yt-dlp/issues/7258)) by [bashonly](https://github.com/bashonly) + - [Default to GraphQL, handle auth errors](https://github.com/yt-dlp/yt-dlp/commit/147e62fc584c3ea6fdb09bb7a47905df68553a22) ([#6957](https://github.com/yt-dlp/yt-dlp/issues/6957)) by [bashonly](https://github.com/bashonly) + - spaces: [Add `release_timestamp`](https://github.com/yt-dlp/yt-dlp/commit/1c16d9df5330819cc79ad588b24aa5b72765c168) ([#7186](https://github.com/yt-dlp/yt-dlp/issues/7186)) by [CeruleanSky](https://github.com/CeruleanSky) +- **urplay**: [Extract all subtitles](https://github.com/yt-dlp/yt-dlp/commit/7bcd4813215ac98daa4949af2ffc677c78307a38) ([#7309](https://github.com/yt-dlp/yt-dlp/issues/7309)) by [hoaluvn](https://github.com/hoaluvn) +- **voot**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4f7b11cc1c1cebf598107e00cd7295588ed484da) ([#7227](https://github.com/yt-dlp/yt-dlp/issues/7227)) by [bashonly](https://github.com/bashonly) +- **vrt**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/1a7dcca378e80a387923ee05c250d8ba122441c6) ([#6244](https://github.com/yt-dlp/yt-dlp/issues/6244)) by [bashonly](https://github.com/bashonly), [bergoid](https://github.com/bergoid), [jeroenj](https://github.com/jeroenj) +- **weverse**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b844a3f8b16500663e7ab6c6ec061cc9b30f71ac) ([#6711](https://github.com/yt-dlp/yt-dlp/issues/6711)) by [bashonly](https://github.com/bashonly) (With fixes in [fd5d93f](https://github.com/yt-dlp/yt-dlp/commit/fd5d93f7040f9776fd541f4e4079dad7d3b3fb4f)) +- **wevidi**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1ea15603d852971ed7d92f4de12808b27b3d9370) ([#6868](https://github.com/yt-dlp/yt-dlp/issues/6868)) by [truedread](https://github.com/truedread) +- **weyyak**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6dc00acf0f1f1107a626c21befd1691403e6aeeb) ([#7124](https://github.com/yt-dlp/yt-dlp/issues/7124)) by [ItzMaxTV](https://github.com/ItzMaxTV) +- **whyp**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/2c566ed14101673c651c08c306c30fa5b4010b85) ([#6803](https://github.com/yt-dlp/yt-dlp/issues/6803)) by [CoryTibbettsDev](https://github.com/CoryTibbettsDev) +- **wrestleuniverse** + - [Fix cookies support](https://github.com/yt-dlp/yt-dlp/commit/c8561c6d03f025268d6d3972abeb47987c8d7cbb) by [bashonly](https://github.com/bashonly) + - [Fix extraction, add login](https://github.com/yt-dlp/yt-dlp/commit/ef8fb7f029b816dfc95600727d84400591a3b5c5) ([#6982](https://github.com/yt-dlp/yt-dlp/issues/6982)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **wykop**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/aed945e1b9b7d3af2a907e1a12e6508cc81d6a20) ([#6140](https://github.com/yt-dlp/yt-dlp/issues/6140)) by [selfisekai](https://github.com/selfisekai) +- **ximalaya**: [Sort playlist entries](https://github.com/yt-dlp/yt-dlp/commit/8790ea7b2536332777bce68590386b1aa935fac7) ([#7292](https://github.com/yt-dlp/yt-dlp/issues/7292)) by [linsui](https://github.com/linsui) +- **YahooGyaOIE, YahooGyaOPlayerIE**: [Delete extractors due to website closure](https://github.com/yt-dlp/yt-dlp/commit/68be95bd0ca3f76aa63c9812935bd826b3a42e53) ([#6218](https://github.com/yt-dlp/yt-dlp/issues/6218)) by [Lesmiscore](https://github.com/Lesmiscore) +- **yappy**: YappyProfile: [Add 
extractor](https://github.com/yt-dlp/yt-dlp/commit/6f69101dc912690338d32e2aab085c32e44eba3f) ([#7346](https://github.com/yt-dlp/yt-dlp/issues/7346)) by [7vlad7](https://github.com/7vlad7) +- **youku**: [Improve error message](https://github.com/yt-dlp/yt-dlp/commit/ef0848abd425dfda6db62baa8d72897eefb0007f) ([#6690](https://github.com/yt-dlp/yt-dlp/issues/6690)) by [carusocr](https://github.com/carusocr) +- **youporn**: [Extract m3u8 formats](https://github.com/yt-dlp/yt-dlp/commit/ddae33754ae1f32dd9c64cf895c47d20f6b5f336) by [pukkandan](https://github.com/pukkandan) +- **youtube** + - [Add client name to `format_note` when `-v`](https://github.com/yt-dlp/yt-dlp/commit/c795c39f27244cbce846067891827e4847036441) ([#6254](https://github.com/yt-dlp/yt-dlp/issues/6254)) by [Lesmiscore](https://github.com/Lesmiscore), [pukkandan](https://github.com/pukkandan) + - [Add extractor-arg `include_duplicate_formats`](https://github.com/yt-dlp/yt-dlp/commit/86cb922118b236306310a72657f70426c20e28bb) by [pukkandan](https://github.com/pukkandan) + - [Bypass throttling for `-f17`](https://github.com/yt-dlp/yt-dlp/commit/c9abebb851e6188cb34b9eb744c1863dd46af919) by [pukkandan](https://github.com/pukkandan) + - [Construct fragment list lazily](https://github.com/yt-dlp/yt-dlp/commit/2a23d92d9ec44a0168079e38bcf3d383e5c4c7bb) by [pukkandan](https://github.com/pukkandan) (With fixes in [e389d17](https://github.com/yt-dlp/yt-dlp/commit/e389d172b6f42e4f332ae679dc48543fb7b9b61d)) + - [Define strict uploader metadata mapping](https://github.com/yt-dlp/yt-dlp/commit/7666b93604b97e9ada981c6b04ccf5605dd1bd44) ([#6384](https://github.com/yt-dlp/yt-dlp/issues/6384)) by [coletdjnz](https://github.com/coletdjnz) + - [Determine audio language using automatic captions](https://github.com/yt-dlp/yt-dlp/commit/ff9b0e071ffae5543cc309e6f9e647ac51e5846e) by [pukkandan](https://github.com/pukkandan) + - [Extract `channel_is_verified`](https://github.com/yt-dlp/yt-dlp/commit/8213ce28a485e200f6a7e1af1434a987c8e702bd) ([#7213](https://github.com/yt-dlp/yt-dlp/issues/7213)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract `heatmap` data](https://github.com/yt-dlp/yt-dlp/commit/5caf30dbc34f10b0be60676fece635b5c59f0d72) ([#7100](https://github.com/yt-dlp/yt-dlp/issues/7100)) by [tntmod54321](https://github.com/tntmod54321) + - [Extract more metadata for comments](https://github.com/yt-dlp/yt-dlp/commit/c35448b7b14113b35c4415dbfbf488c4731f006f) ([#7179](https://github.com/yt-dlp/yt-dlp/issues/7179)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract uploader metadata for feed/playlist items](https://github.com/yt-dlp/yt-dlp/commit/93e12ed76ef49252dc6869b59d21d0777e5e11af) by [coletdjnz](https://github.com/coletdjnz) + - [Fix comment loop detection for pinned comments](https://github.com/yt-dlp/yt-dlp/commit/141a8dff98874a426d7fbe772e0a8421bb42656f) ([#6714](https://github.com/yt-dlp/yt-dlp/issues/6714)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix continuation loop with no comments](https://github.com/yt-dlp/yt-dlp/commit/18f8fba7c89a87f99cc3313a1795848867e84fff) ([#7148](https://github.com/yt-dlp/yt-dlp/issues/7148)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix parsing `comment_count`](https://github.com/yt-dlp/yt-dlp/commit/071670cbeaa01ddf2cc20a95ae6da25f8f086431) ([#6523](https://github.com/yt-dlp/yt-dlp/issues/6523)) by [nick-cd](https://github.com/nick-cd) + - [Handle incomplete initial data from watch page](https://github.com/yt-dlp/yt-dlp/commit/607510b9f2f67bfe7d33d74031a5c1fe22a24862) 
([#6510](https://github.com/yt-dlp/yt-dlp/issues/6510)) by [coletdjnz](https://github.com/coletdjnz) + - [Ignore wrong fps of some formats](https://github.com/yt-dlp/yt-dlp/commit/97afb093d4cbe5df889145afa5f9ede4535e93e4) by [pukkandan](https://github.com/pukkandan) + - [Misc cleanup](https://github.com/yt-dlp/yt-dlp/commit/14a14335b280766fbf5a469ae26836d6c1fe450a) by [coletdjnz](https://github.com/coletdjnz) + - [Prioritize premium formats](https://github.com/yt-dlp/yt-dlp/commit/51a07b0dca4c079d58311c19b6d1c097c24bb021) by [pukkandan](https://github.com/pukkandan) + - [Revert default formats to `https`](https://github.com/yt-dlp/yt-dlp/commit/c6786ff3baaf72a5baa4d56d34058e54cbcf8ceb) by [pukkandan](https://github.com/pukkandan) + - [Support podcasts and releases tabs](https://github.com/yt-dlp/yt-dlp/commit/447afb9eaa65bc677e3245c83e53a8e69c174a3c) by [coletdjnz](https://github.com/coletdjnz) + - [Support shorter relative time format](https://github.com/yt-dlp/yt-dlp/commit/2fb35f6004c7625f0dd493da4a5abf0690f7777c) ([#7191](https://github.com/yt-dlp/yt-dlp/issues/7191)) by [coletdjnz](https://github.com/coletdjnz) + - music_search_url: [Extract title](https://github.com/yt-dlp/yt-dlp/commit/69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2) ([#7102](https://github.com/yt-dlp/yt-dlp/issues/7102)) by [kangalio](https://github.com/kangalio) +- **zaiko** + - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/345b4c0aedd9d19898ce00d5cef35fe0d277a052) ([#7254](https://github.com/yt-dlp/yt-dlp/issues/7254)) by [c-basalt](https://github.com/c-basalt) + - ZaikoETicket: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/5cc09c004bd5edbbada9b041c08a720cadc4f4df) ([#7347](https://github.com/yt-dlp/yt-dlp/issues/7347)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **zdf**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/ee0ed0338df328cd986f97315c8162b5a151476d) by [bashonly](https://github.com/bashonly) +- **zee5**: [Fix extraction of new content](https://github.com/yt-dlp/yt-dlp/commit/9d7fde89a40360396f0baa2ee8bf507f92108b32) ([#7280](https://github.com/yt-dlp/yt-dlp/issues/7280)) by [bashonly](https://github.com/bashonly) +- **zingmp3**: [Fix and improve extractors](https://github.com/yt-dlp/yt-dlp/commit/17d7ca84ea723c20668bd9bfa938be7ea0e64f6b) ([#6367](https://github.com/yt-dlp/yt-dlp/issues/6367)) by [hatienl0i261299](https://github.com/hatienl0i261299) +- **zoom** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/79c77e85b70ae3b9942d5a88c14d021a9bd24222) ([#6741](https://github.com/yt-dlp/yt-dlp/issues/6741)) by [shreyasminocha](https://github.com/shreyasminocha) + - [Fix share URL extraction](https://github.com/yt-dlp/yt-dlp/commit/90c1f5120694105496a6ad9e3ecfc6c25de6cae1) ([#6789](https://github.com/yt-dlp/yt-dlp/issues/6789)) by [bashonly](https://github.com/bashonly) + +#### Downloader changes +- **curl**: [Fix progress reporting](https://github.com/yt-dlp/yt-dlp/commit/66aeaac9aa30b5959069ba84e53a5508232deb38) by [pukkandan](https://github.com/pukkandan) +- **fragment**: [Do not sleep between fragments](https://github.com/yt-dlp/yt-dlp/commit/424f3bf03305088df6e01d62f7311be8601ad3f4) by [pukkandan](https://github.com/pukkandan) + +#### Postprocessor changes +- [Fix chapters if duration is not extracted](https://github.com/yt-dlp/yt-dlp/commit/01ddec7e661bf90dc4c34e6924eb9d7629886cef) ([#6037](https://github.com/yt-dlp/yt-dlp/issues/6037)) by [bashonly](https://github.com/bashonly) +- [Print newline for 
`--progress-template`](https://github.com/yt-dlp/yt-dlp/commit/13ff78095372fd98900a32572cf817994c07ccb5) by [pukkandan](https://github.com/pukkandan) +- **EmbedThumbnail, FFmpegMetadata**: [Fix error on attaching thumbnails and info json for mkv/mka](https://github.com/yt-dlp/yt-dlp/commit/0f0875ed555514f32522a0f30554fb08825d5124) ([#6647](https://github.com/yt-dlp/yt-dlp/issues/6647)) by [Lesmiscore](https://github.com/Lesmiscore) +- **FFmpegFixupM3u8PP**: [Check audio codec before fixup](https://github.com/yt-dlp/yt-dlp/commit/3f7e2bd80e3c5d8a1682f20a1b245fcd974f295d) ([#6778](https://github.com/yt-dlp/yt-dlp/issues/6778)) by [bashonly](https://github.com/bashonly) +- **FixupDuplicateMoov**: [Fix bug in triggering](https://github.com/yt-dlp/yt-dlp/commit/26010b5cec50193b98ad7845d1d77450f9f14c2b) by [pukkandan](https://github.com/pukkandan) + +#### Misc. changes +- [Add automatic duplicate issue detection](https://github.com/yt-dlp/yt-dlp/commit/15b2d3db1d40b0437fca79d8874d392aa54b3cdd) by [pukkandan](https://github.com/pukkandan) +- **build** + - [Fix macOS target](https://github.com/yt-dlp/yt-dlp/commit/44a79958f0b596ee71e1eb25f158610aada29d1b) by [Grub4K](https://github.com/Grub4K) + - [Implement build verification using `--update-to`](https://github.com/yt-dlp/yt-dlp/commit/b73193c99aa23b135732408a5fcf655c68d731c6) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Pin `pyinstaller` version for MacOS](https://github.com/yt-dlp/yt-dlp/commit/427a8fafbb0e18c28d0ed7960be838d7b26b88d3) by [pukkandan](https://github.com/pukkandan) + - [Various build workflow improvements](https://github.com/yt-dlp/yt-dlp/commit/c4efa0aefec8daef1de62fd1693f13edf3c8b03c) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **cleanup** + - Miscellaneous + - [6f2287c](https://github.com/yt-dlp/yt-dlp/commit/6f2287cb18cbfb27518f068d868fa9390fee78ad) by [pukkandan](https://github.com/pukkandan) + - [ad54c91](https://github.com/yt-dlp/yt-dlp/commit/ad54c9130e793ce433bf9da334fa80df9f3aee58) by [freezboltz](https://github.com/freezboltz), [mikf](https://github.com/mikf), [pukkandan](https://github.com/pukkandan) +- **cleanup, utils**: [Split into submodules](https://github.com/yt-dlp/yt-dlp/commit/69bec6730ec9d724bcedeab199d9d684d61423ba) ([#7090](https://github.com/yt-dlp/yt-dlp/issues/7090)) by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +- **cli_to_api**: [Add script](https://github.com/yt-dlp/yt-dlp/commit/46f1370e9af6f8af8762f67e27e5acb8f0c48a47) by [pukkandan](https://github.com/pukkandan) +- **devscripts**: `make_changelog`: [Various improvements](https://github.com/yt-dlp/yt-dlp/commit/23c39a4beadee382060bb47fdaa21316ca707d38) by [Grub4K](https://github.com/Grub4K) +- **docs**: [Misc improvements](https://github.com/yt-dlp/yt-dlp/commit/c8bc203fbf3bb09914e53f0833eed622ab7edbb9) by [pukkandan](https://github.com/pukkandan) + +### 2023.03.04 + +#### Extractor changes +- bilibili + - [Fix for downloading wrong subtitles](https://github.com/yt-dlp/yt-dlp/commit/8a83baaf218ab89e6e7faa76b7c7be3a2ec19e3a) ([#6358](https://github.com/yt-dlp/yt-dlp/issues/6358)) by [LXYan2333](https://github.com/LXYan2333) +- ESPNcricinfo + - [Handle new URL pattern](https://github.com/yt-dlp/yt-dlp/commit/640c934823fc2d1ec77ec932566078014058635f) ([#6321](https://github.com/yt-dlp/yt-dlp/issues/6321)) by [venkata-krishnas](https://github.com/venkata-krishnas) +- lefigaro + - [Add 
extractors](https://github.com/yt-dlp/yt-dlp/commit/eb8fd6d044e8926532772b72be0645c6b8ecb3aa) ([#6309](https://github.com/yt-dlp/yt-dlp/issues/6309)) by [elyse0](https://github.com/elyse0) +- lumni + - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/1f8489cccbdc6e96027ef527b88717458f0900e8) ([#6302](https://github.com/yt-dlp/yt-dlp/issues/6302)) by [carusocr](https://github.com/carusocr) +- Prankcast + - [Fix tags](https://github.com/yt-dlp/yt-dlp/commit/ed4cc4ea793314c50ae3f82e98248c1de1c25694) ([#6316](https://github.com/yt-dlp/yt-dlp/issues/6316)) by [columndeeply](https://github.com/columndeeply) +- rutube + - [Extract chapters from description](https://github.com/yt-dlp/yt-dlp/commit/22ccd5420b3eb0782776071f12cccd1fedaa1fd0) ([#6345](https://github.com/yt-dlp/yt-dlp/issues/6345)) by [mushbite](https://github.com/mushbite) +- SportDeutschland + - [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/45db357289b4e1eec09093c8bc5446520378f426) by [pukkandan](https://github.com/pukkandan) +- telecaribe + - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b40471282286bd2b09c485bf79afd271d229272c) ([#6311](https://github.com/yt-dlp/yt-dlp/issues/6311)) by [elyse0](https://github.com/elyse0) +- tubetugraz + - [Support `--twofactor` (#6424)](https://github.com/yt-dlp/yt-dlp/commit/f44cb4e77bb9be8be291d02ab6f79dc0b4c0d4a1) ([#6427](https://github.com/yt-dlp/yt-dlp/issues/6427)) by [Ferdi265](https://github.com/Ferdi265) +- tunein + - [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/46580ced56c90b559885aded6aa8f46f20a9cdce) ([#6310](https://github.com/yt-dlp/yt-dlp/issues/6310)) by [elyse0](https://github.com/elyse0) +- twitch + - [Update for GraphQL API changes](https://github.com/yt-dlp/yt-dlp/commit/4a6272c6d1bff89969b67cd22b26ebe6d7e72279) ([#6318](https://github.com/yt-dlp/yt-dlp/issues/6318)) by [elyse0](https://github.com/elyse0) +- twitter + - [Fix retweet extraction](https://github.com/yt-dlp/yt-dlp/commit/cf605226521e99c89fc8dff26a319025810e63a0) ([#6422](https://github.com/yt-dlp/yt-dlp/issues/6422)) by [selfisekai](https://github.com/selfisekai) +- xvideos + - quickies: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/283a0b5bc511f3b350eead4488158f50c20ec526) ([#6414](https://github.com/yt-dlp/yt-dlp/issues/6414)) by [Yakabuff](https://github.com/Yakabuff) + +#### Misc. changes +- build + - [Fix publishing to PyPI and homebrew](https://github.com/yt-dlp/yt-dlp/commit/55676fe498345a389a2539d8baaba958d6d61c3e) by [bashonly](https://github.com/bashonly) + - [Only archive if `vars.ARCHIVE_REPO` is set](https://github.com/yt-dlp/yt-dlp/commit/08ff6d59f97b5f5f0128f6bf6fbef56fd836cc52) by [Grub4K](https://github.com/Grub4K) +- cleanup + - Miscellaneous: [392389b](https://github.com/yt-dlp/yt-dlp/commit/392389b7df7b818f794b231f14dc396d4875fbad) by [pukkandan](https://github.com/pukkandan) +- devscripts + - `make_changelog`: [Stop at `Release ...` commit](https://github.com/yt-dlp/yt-dlp/commit/7accdd9845fe7ce9d0aa5a9d16faaa489c1294eb) by [pukkandan](https://github.com/pukkandan) + +### 2023.03.03 + +#### Important changes +- **A new release type has been added!** + * [`nightly`](https://github.com/yt-dlp/yt-dlp/releases/tag/nightly) builds will be made after each push, containing the latest fixes (but also possibly bugs). + * When using `--update`/`-U`, a release binary will only update to its current channel (either `stable` or `nightly`). + * The `--update-to` option has been added allowing the user more control over program upgrades (or downgrades). 
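+ * `--update-to` can change the release channel (`stable`, `nightly`) and also upgrade or downgrade to specific tags.
+ * **Usage**: `--update-to CHANNEL`, `--update-to TAG`, `--update-to CHANNEL@TAG` (see the sketch below)
+- **YouTube throttling fixes!**
+
+A quick sketch of the three usage forms above (the tag shown is a release tag from this changelog, chosen purely for illustration):
+
+```shell
+yt-dlp --update-to nightly            # switch to, and update within, the nightly channel
+yt-dlp --update-to 2023.03.04         # upgrade or downgrade to a specific tag
+yt-dlp --update-to stable@2023.03.04  # pin a specific tag on a specific channel
+```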
+ +#### Core changes +- [Add option `--break-match-filters`](https://github.com/yt-dlp/yt-dlp/commit/fe2ce85aff0aa03735fc0152bb8cb9c3d4ef0753) by [pukkandan](https://github.com/pukkandan) +- [Fix `--break-on-existing` with `--lazy-playlist`](https://github.com/yt-dlp/yt-dlp/commit/d21056f4cf0a1623daa107f9181074f5725ac436) by [pukkandan](https://github.com/pukkandan) +- dependencies + - [Simplify `Cryptodome`](https://github.com/yt-dlp/yt-dlp/commit/65f6e807804d2af5e00f2aecd72bfc43af19324a) by [pukkandan](https://github.com/pukkandan) +- jsinterp + - [Handle `Date` at epoch 0](https://github.com/yt-dlp/yt-dlp/commit/9acf1ee25f7ad3920ede574a9de95b8c18626af4) by [pukkandan](https://github.com/pukkandan) +- plugins + - [Don't look in `.egg` directories](https://github.com/yt-dlp/yt-dlp/commit/b059188383eee4fa336ef728dda3ff4bb7335625) by [pukkandan](https://github.com/pukkandan) +- update + - [Add option `--update-to`, including to nightly](https://github.com/yt-dlp/yt-dlp/commit/77df20f14cc9ed41dfe3a1fe2d77fd27f5365a94) ([#6220](https://github.com/yt-dlp/yt-dlp/issues/6220)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- utils + - `LenientJSONDecoder`: [Parse unclosed objects](https://github.com/yt-dlp/yt-dlp/commit/cc09083636ce21e58ff74f45eac2dbda507462b0) by [pukkandan](https://github.com/pukkandan) + - `Popen`: [Shim undocumented `text_mode` property](https://github.com/yt-dlp/yt-dlp/commit/da8e2912b165005f76779a115a071cd6132ceedf) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Fix DRM detection in m3u8](https://github.com/yt-dlp/yt-dlp/commit/43a3eaf96393b712d60cbcf5c6cb1e90ed7f42f5) by [pukkandan](https://github.com/pukkandan) +- generic + - [Detect manifest links via extension](https://github.com/yt-dlp/yt-dlp/commit/b38cae49e6f4849c8ee2a774bdc3c1c647ae5f0e) by [bashonly](https://github.com/bashonly) + - [Handle basic-auth when checking redirects](https://github.com/yt-dlp/yt-dlp/commit/8e9fe43cd393e69fa49b3d842aa3180c1d105b8f) by [pukkandan](https://github.com/pukkandan) +- GoogleDrive + - [Fix some audio](https://github.com/yt-dlp/yt-dlp/commit/4d248e29d20d983ededab0b03d4fe69dff9eb4ed) by [pukkandan](https://github.com/pukkandan) +- iprima + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9fddc12ab022a31754e0eaa358fc4e1dfa974587) ([#6291](https://github.com/yt-dlp/yt-dlp/issues/6291)) by [std-move](https://github.com/std-move) +- mediastream + - [Improve WinSports support](https://github.com/yt-dlp/yt-dlp/commit/2d5a8c5db2bd4ff1c2e45e00cd890a10f8ffca9e) ([#6401](https://github.com/yt-dlp/yt-dlp/issues/6401)) by [bashonly](https://github.com/bashonly) +- ntvru + - [Extract HLS and DASH formats](https://github.com/yt-dlp/yt-dlp/commit/77d6d136468d0c23c8e79bc937898747804f585a) ([#6403](https://github.com/yt-dlp/yt-dlp/issues/6403)) by [bashonly](https://github.com/bashonly) +- tencent + - [Add more formats and info](https://github.com/yt-dlp/yt-dlp/commit/18d295c9e0f95adc179eef345b7af64d6372db78) ([#5950](https://github.com/yt-dlp/yt-dlp/issues/5950)) by [Hill-98](https://github.com/Hill-98) +- yle_areena + - [Extract non-Kaltura videos](https://github.com/yt-dlp/yt-dlp/commit/40d77d89027cd0e0ce31d22aec81db3e1d433900) 
([#6402](https://github.com/yt-dlp/yt-dlp/issues/6402)) by [bashonly](https://github.com/bashonly) +- youtube + - [Construct dash formats with `range` query](https://github.com/yt-dlp/yt-dlp/commit/5038f6d713303e0967d002216e7a88652401c22a) by [pukkandan](https://github.com/pukkandan) (With fixes in [f34804b](https://github.com/yt-dlp/yt-dlp/commit/f34804b2f920f62a6e893a14a9e2a2144b14dd23) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz)) + - [Detect and break on looping comments](https://github.com/yt-dlp/yt-dlp/commit/7f51861b1820c37b157a239b1fe30628d907c034) ([#6301](https://github.com/yt-dlp/yt-dlp/issues/6301)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract channel `view_count` when `/about` tab is passed](https://github.com/yt-dlp/yt-dlp/commit/31e183557fcd1b937582f9429f29207c1261f501) by [pukkandan](https://github.com/pukkandan) + +#### Misc. changes +- build + - [Add `cffi` as a dependency for `yt_dlp_linux`](https://github.com/yt-dlp/yt-dlp/commit/776d1c3f0c9b00399896dd2e40e78e9a43218109) by [bashonly](https://github.com/bashonly) + - [Automated builds and nightly releases](https://github.com/yt-dlp/yt-dlp/commit/29cb20bd563c02671b31dd840139e93dd37150a1) ([#6220](https://github.com/yt-dlp/yt-dlp/issues/6220)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) (With fixes in [bfc861a](https://github.com/yt-dlp/yt-dlp/commit/bfc861a91ee65c9b0ac169754f512e052c6827cf) by [pukkandan](https://github.com/pukkandan)) + - [Sign SHA files and release public key](https://github.com/yt-dlp/yt-dlp/commit/12647e03d417feaa9ea6a458bea5ebd747494a53) by [Grub4K](https://github.com/Grub4K) +- cleanup + - [Fix `Changelog`](https://github.com/yt-dlp/yt-dlp/commit/17ca19ab60a6a13eb8a629c51442b5248b0d8394) by [pukkandan](https://github.com/pukkandan) + - jsinterp: [Give functions names to help debugging](https://github.com/yt-dlp/yt-dlp/commit/b2e0343ba0fc5d8702e90f6ba2b71358e2677e0b) by [pukkandan](https://github.com/pukkandan) + - Miscellaneous: [4815bbf](https://github.com/yt-dlp/yt-dlp/commit/4815bbfc41cf641e4a0650289dbff968cb3bde76), [5b28cef](https://github.com/yt-dlp/yt-dlp/commit/5b28cef72db3b531680d89c121631c73ae05354f) by [pukkandan](https://github.com/pukkandan) +- devscripts + - [Script to generate changelog](https://github.com/yt-dlp/yt-dlp/commit/d400e261cf029a3f20d364113b14de973be75404) ([#6220](https://github.com/yt-dlp/yt-dlp/issues/6220)) by [Grub4K](https://github.com/Grub4K) (With fixes in [9344964](https://github.com/yt-dlp/yt-dlp/commit/93449642815a6973a4b09b289982ca7e1f961b5f)) + ### 2023.02.17 * Merge youtube-dl: Upto [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e) @@ -50,8 +1593,8 @@ ### 2023.02.17 * [extractor/txxx] Add extractors by [chio0hai](https://github.com/chio0hai) * [extractor/vocaroo] Add extractor by [SuperSonicHub1](https://github.com/SuperSonicHub1), [qbnu](https://github.com/qbnu) * [extractor/wrestleuniverse] Add extractors by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly) -* [extractor/yappy] Add extractor by [HobbyistDev](https://github.com/HobbyistDev) -* **[extractor/youtube] Fix `uploader_id` extraction** by [bashonly](https://github.com/bashonly) +* [extractor/yappy] Add extractor by [HobbyistDev](https://github.com/HobbyistDev), [dirkf](https://github.com/dirkf) +* [extractor/youtube] **Fix `uploader_id` extraction** by [bashonly](https://github.com/bashonly) * [extractor/youtube] Add hyperpipe instances by 
[Generator](https://github.com/Generator) * [extractor/youtube] Handle `consent.youtube` * [extractor/youtube] Support `/live/` URL @@ -103,7 +1646,7 @@ ### 2023.02.17 ### 2023.01.06 -* Fix config locations by [Grub4k](https://github.com/Grub4k), [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* Fix config locations by [Grub4K](https://github.com/Grub4K), [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) * [downloader/aria2c] Disable native progress * [utils] `mimetype2ext`: `weba` is not standard * [utils] `windows_enable_vt_mode`: Better error handling @@ -130,7 +1673,7 @@ ### 2023.01.02 * Add `--compat-options 2021,2022` * This allows devs to change defaults and make other potentially breaking changes more easily. If you need everything to work exactly as-is, put `--compat 2022` in your config to guard against future compat changes. * [downloader/aria2c] Native progress for aria2c via RPC by [Lesmiscore](https://github.com/Lesmiscore), [pukkandan](https://github.com/pukkandan) -* Merge youtube-dl: Upto [commit/195f22f](https://github.com/ytdl-org/youtube-dl/commit/195f22f6) by [Grub4k](https://github.com/Grub4k), [pukkandan](https://github.com/pukkandan) +* Merge youtube-dl: Upto [commit/195f22f](https://github.com/ytdl-org/youtube-dl/commit/195f22f6) by [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) * Add pre-processor stage `video` * Let `--parse/replace-in-metadata` run at any post-processing stage * Add `--enable-file-urls` by [coletdjnz](https://github.com/coletdjnz) @@ -245,7 +1788,7 @@ ### 2023.01.02 * [extractor/udemy] Fix lectures that have no URL and detect DRM * [extractor/unsupported] Add more URLs * [extractor/urplay] Support for audio-only formats by [barsnick](https://github.com/barsnick) -* [extractor/wistia] Improve extension detection by [Grub4k](https://github.com/Grub4k), [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +* [extractor/wistia] Improve extension detection by [Grub4K](https://github.com/Grub4K), [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) * [extractor/yle_areena] Support restricted videos by [docbender](https://github.com/docbender) * [extractor/youku] Fix extractor by [KurtBestor](https://github.com/KurtBestor) * [extractor/youporn] Fix metadata by [marieell](https://github.com/marieell) @@ -1059,7 +2602,7 @@ ### 2022.04.08 * [utils] `format_decimal_suffix`: Fix for very large numbers by [s0u1h](https://github.com/s0u1h) * [utils] `traverse_obj`: Allow filtering by value * [utils] Add `filter_dict`, `get_first`, `try_call` -* [utils] ExtractorError: Fix for older python versions +* [utils] ExtractorError: Fix for older Python versions * [utils] WebSocketsWrapper: Allow omitting `__enter__` invocation by [Lesmiscore](https://github.com/Lesmiscore) * [docs] Add an `.editorconfig` file by [fstirlitz](https://github.com/fstirlitz) * [docs] Clarify the exact `BSD` license of dependencies by [MrRawes](https://github.com/MrRawes) @@ -2523,7 +4066,7 @@ ### 2021.05.20 * [cleanup] code formatting, youtube tests and readme ### 2021.05.11 -* **Deprecate support for python versions < 3.6** +* **Deprecate support for Python versions < 3.6** * **Subtitle extraction from manifests** by [fstirlitz](https://github.com/fstirlitz).
See [be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details * **Improve output template:** * Allow slicing lists/strings using `field.start:end:step` @@ -2813,7 +4356,7 @@ ### 2021.02.19 * Remove unnecessary `field_preference` and misuse of `preference` from extractors * Build improvements: * Fix hash output by [shirt](https://github.com/shirt-dev) - * Lock python package versions for x86 and use `wheels` by [shirt](https://github.com/shirt-dev) + * Lock Python package versions for x86 and use `wheels` by [shirt](https://github.com/shirt-dev) * Exclude `vcruntime140.dll` from UPX by [jbruchon](https://github.com/jbruchon) * Set version number based on UTC time, not local time * Publish on PyPi only if token is set @@ -2880,7 +4423,7 @@ ### 2021.02.04 * Fix "Default format spec" appearing in quiet mode * [FormatSort] Allow user to prefer av01 over vp9 (The default is still vp9) * [FormatSort] fix bug where `quality` had more priority than `hasvid` -* [pyinst] Automatically detect python architecture and working directory +* [pyinst] Automatically detect Python architecture and working directory * Strip out internal fields such as `_filename` from infojson diff --git a/Collaborators.md b/Collaborators.md index 83dfbe3893..ee748eb7fd 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -8,7 +8,7 @@ # Collaborators ## [pukkandan](https://github.com/pukkandan) [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/pukkandan) -[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) * Owner of the fork @@ -26,9 +26,10 @@ ## [shirt](https://github.com/shirt-dev) ## [coletdjnz](https://github.com/coletdjnz) -[![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) * Improved plugin architecture +* Rewrote the networking infrastructure, implemented support for `requests` * YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements * Added support for new websites YoutubeWebArchive, MainStreaming, PRX, nzherald, Mediaklikk, StarTV etc * Improved/fixed support for Patreon, panopto, gfycat, itv, pbs, SouthParkDE etc @@ -44,26 +45,26 @@ ## [Ashish0804](https://github.com/Ashish0804) [Inactive] * Improved/fixed support for HiDive, HotStar, Hungama, LBRY, LinkedInLearning, Mxplayer, SonyLiv, TV2, Vimeo, VLive etc -## [Lesmiscore](https://github.com/Lesmiscore) (nao20010128nao) - -**Bitcoin**: bc1qfd02r007cutfdjwjmyy9w23rjvtls6ncve7r3s -**Monacoin**: mona1q3tf7dzvshrhfe3md379xtvt2n22duhglv5dskr - -* Download live from start to end for YouTube -* Added support for new websites AbemaTV, mildom, PixivSketch, skeb, radiko, voicy, mirrativ, openrec, whowatch, damtomo, 17.live, mixch etc -* Improved/fixed support for fc2, YahooJapanNews, tver, iwara etc - - ## [bashonly](https://github.com/bashonly) -* `--cookies-from-browser` support for Firefox containers -* Added support for new websites Genius, Kick, 
NBCStations, Triller, VideoKen etc -* Improved/fixed support for Anvato, Brightcove, Instagram, ParamountPlus, Reddit, SlidesLive, TikTok, Twitter, Vimeo etc +* `--update-to`, self-updater rewrite, automated/nightly/master releases +* `--cookies-from-browser` support for Firefox containers, external downloader cookie handling overhaul +* Added support for new websites like Dacast, Kick, NBCStations, Triller, VideoKen, Weverse, WrestleUniverse etc +* Improved/fixed support for Anvato, Brightcove, Reddit, SlidesLive, TikTok, Twitter, Vimeo etc ## [Grub4K](https://github.com/Grub4K) -[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) [![gh-sponsor](https://img.shields.io/badge/_-Github-red.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K) [![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) -* Rework internals like `traverse_obj`, various core refactors and bugs fixes -* Helped fix crunchyroll, Twitter, wrestleuniverse, wistia, slideslive etc +* `--update-to`, self-updater rewrite, automated/nightly/master releases +* Reworked internals like `traverse_obj`, various core refactors and bug fixes +* Implemented proper progress reporting for parallel downloads +* Improved/fixed/added Bundestag, crunchyroll, pr0gramm, Twitter, WrestleUniverse etc + + +## [sepro](https://github.com/seproDev) + +* UX improvements: Warn when ffmpeg is missing, warn when double-clicking exe +* Code cleanup: Remove dead extractors, mark extractors as broken, enable/apply ruff rules +* Improved/fixed/added ArdMediathek, DRTV, Floatplane, MagentaMusik, Naver, Nebula, OnDemandKorea, Vbox7 etc diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index bc2f056c05..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,10 +0,0 @@ -include AUTHORS -include Changelog.md -include LICENSE -include README.md -include completions/*/* -include supportedsites.md -include yt-dlp.1 -include requirements.txt -recursive-include devscripts * -recursive-include test * diff --git a/Makefile b/Makefile index d5d47629b9..6c72ead1ef 100644 --- a/Makefile +++ b/Makefile @@ -2,29 +2,32 @@ all: lazy-extractors yt-dlp doc pypi-files clean: clean-test clean-dist clean-all: clean clean-cache completions: completion-bash completion-fish completion-zsh -doc: README.md CONTRIBUTING.md issuetemplates supportedsites +doc: README.md CONTRIBUTING.md CONTRIBUTORS issuetemplates supportedsites ot: offlinetest tar: yt-dlp.tar.gz -# Keep this list in sync with MANIFEST.in +# Keep this list in sync with pyproject.toml includes/artifacts # intended use: when building a source distribution, -# make pypi-files && python setup.py sdist +# make pypi-files && python3 -m build -sn .
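+# (build flags: -s builds only the source distribution, -n disables build isolation)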
pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites \ - completions yt-dlp.1 requirements.txt setup.cfg devscripts/* test/* + completions yt-dlp.1 pyproject.toml setup.cfg devscripts/* test/* -.PHONY: all clean install test tar pypi-files completions ot offlinetest codetest supportedsites +.PHONY: all clean clean-all clean-test clean-dist clean-cache \ + completions completion-bash completion-fish completion-zsh \ + doc issuetemplates supportedsites ot offlinetest codetest test \ + tar pypi-files lazy-extractors install uninstall clean-test: rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ *.frag.aria2 *.frag.urls *.info.json *.live_chat.json *.meta *.part* *.tmp *.temp *.unknown_video *.ytdl \ - *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.gif *.jpeg *.jpg *.m4a *.m4v *.mhtml *.mkv *.mov *.mp3 \ - *.mp4 *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.swf *.swp *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp + *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.gif *.jpeg *.jpg *.lrc *.m4a *.m4v *.mhtml *.mkv *.mov *.mp3 *.mp4 \ + *.mpg *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.ssa *.swf *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp clean-dist: rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \ - yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS .mailmap + yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS clean-cache: find . \( \ - -type d -name .pytest_cache -o -type d -name __pycache__ -o -name "*.pyc" -o -name "*.class" \ + -type d -name ".*_cache" -o -type d -name __pycache__ -o -name "*.pyc" -o -name "*.class" \ \) -prune -exec rm -rf {} \; completion-bash: completions/bash/yt-dlp @@ -37,12 +40,15 @@ BINDIR ?= $(PREFIX)/bin MANDIR ?= $(PREFIX)/man SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python3 +GNUTAR ?= tar -# set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) - -# set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2 -MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi) +# set markdown input format to "markdown-smart" for pandoc version 2+ and to "markdown" for pandoc prior to version 2 +PANDOC_VERSION_CMD = pandoc -v 2>/dev/null | head -n1 | cut -d' ' -f2 | head -c1 +PANDOC_VERSION != $(PANDOC_VERSION_CMD) +PANDOC_VERSION ?= $(shell $(PANDOC_VERSION_CMD)) +MARKDOWN_CMD = if [ "$(PANDOC_VERSION)" = "1" -o "$(PANDOC_VERSION)" = "0" ]; then echo markdown; else echo markdown-smart; fi +MARKDOWN != $(MARKDOWN_CMD) +MARKDOWN ?= $(shell $(MARKDOWN_CMD)) install: lazy-extractors yt-dlp yt-dlp.1 completions mkdir -p $(DESTDIR)$(BINDIR) @@ -64,33 +70,38 @@ uninstall: rm -f $(DESTDIR)$(SHAREDIR)/fish/vendor_completions.d/yt-dlp.fish codetest: - flake8 . + ruff check . + autopep8 --diff . 
test: - $(PYTHON) -m pytest + $(PYTHON) -m pytest -Werror $(MAKE) codetest offlinetest: codetest - $(PYTHON) -m pytest -k "not download" + $(PYTHON) -m pytest -Werror -m "not download" -# XXX: This is hard to maintain -CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/dependencies -yt-dlp: yt_dlp/*.py yt_dlp/*/*.py +CODE_FOLDERS_CMD = find yt_dlp -type f -name '__init__.py' | sed 's,/__init__.py,,' | grep -v '/__' | sort +CODE_FOLDERS != $(CODE_FOLDERS_CMD) +CODE_FOLDERS ?= $(shell $(CODE_FOLDERS_CMD)) +CODE_FILES_CMD = for f in $(CODE_FOLDERS) ; do echo "$$f" | sed 's,$$,/*.py,' ; done +CODE_FILES != $(CODE_FILES_CMD) +CODE_FILES ?= $(shell $(CODE_FILES_CMD)) +yt-dlp: $(CODE_FILES) mkdir -p zip for d in $(CODE_FOLDERS) ; do \ mkdir -p zip/$$d ;\ cp -pPR $$d/*.py zip/$$d/ ;\ done - touch -t 200001010101 zip/yt_dlp/*.py zip/yt_dlp/*/*.py + (cd zip && touch -t 200001010101 $(CODE_FILES)) mv zip/yt_dlp/__main__.py zip/ - cd zip ; zip -q ../yt-dlp yt_dlp/*.py yt_dlp/*/*.py __main__.py + (cd zip && zip -q ../yt-dlp $(CODE_FILES) __main__.py) rm -rf zip echo '#!$(PYTHON)' > yt-dlp cat yt-dlp.zip >> yt-dlp rm yt-dlp.zip chmod a+x yt-dlp -README.md: yt_dlp/*.py yt_dlp/*/*.py devscripts/make_readme.py +README.md: $(CODE_FILES) devscripts/make_readme.py COLUMNS=80 $(PYTHON) yt_dlp/__main__.py --ignore-config --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md devscripts/make_contributing.py @@ -115,41 +126,48 @@ yt-dlp.1: README.md devscripts/prepare_manpage.py pandoc -s -f $(MARKDOWN) -t man yt-dlp.1.temp.md -o yt-dlp.1 rm -f yt-dlp.1.temp.md -completions/bash/yt-dlp: yt_dlp/*.py yt_dlp/*/*.py devscripts/bash-completion.in +completions/bash/yt-dlp: $(CODE_FILES) devscripts/bash-completion.in mkdir -p completions/bash $(PYTHON) devscripts/bash-completion.py -completions/zsh/_yt-dlp: yt_dlp/*.py yt_dlp/*/*.py devscripts/zsh-completion.in +completions/zsh/_yt-dlp: $(CODE_FILES) devscripts/zsh-completion.in mkdir -p completions/zsh $(PYTHON) devscripts/zsh-completion.py -completions/fish/yt-dlp.fish: yt_dlp/*.py yt_dlp/*/*.py devscripts/fish-completion.in +completions/fish/yt-dlp.fish: $(CODE_FILES) devscripts/fish-completion.in mkdir -p completions/fish $(PYTHON) devscripts/fish-completion.py -_EXTRACTOR_FILES = $(shell find yt_dlp/extractor -name '*.py' -and -not -name 'lazy_extractors.py') +_EXTRACTOR_FILES_CMD = find yt_dlp/extractor -name '*.py' -and -not -name 'lazy_extractors.py' +_EXTRACTOR_FILES != $(_EXTRACTOR_FILES_CMD) +_EXTRACTOR_FILES ?= $(shell $(_EXTRACTOR_FILES_CMD)) yt_dlp/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ yt-dlp.tar.gz: all - @tar -czf yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \ + @$(GNUTAR) -czf yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ --exclude '*.pyc' \ --exclude '*.pyo' \ --exclude '*~' \ --exclude '__pycache__' \ - --exclude '.pytest_cache' \ + --exclude '.*_cache' \ --exclude '.git' \ -- \ README.md supportedsites.md Changelog.md LICENSE \ CONTRIBUTING.md Collaborators.md CONTRIBUTORS AUTHORS \ - Makefile MANIFEST.in yt-dlp.1 README.txt completions \ - setup.py setup.cfg yt-dlp yt_dlp requirements.txt \ - devscripts test + Makefile yt-dlp.1 README.txt completions .gitignore \ + setup.cfg yt-dlp yt_dlp pyproject.toml devscripts test -AUTHORS: .mailmap - git shortlog -s -n | cut -f2 | sort > 
AUTHORS +AUTHORS: Changelog.md + @if [ -d '.git' ] && command -v git > /dev/null ; then \ + echo 'Generating $@ from git commit history' ; \ + git shortlog -s -n HEAD | cut -f2 | sort > $@ ; \ + fi -.mailmap: - git shortlog -s -e -n | awk '!(out[$$NF]++) { $$1="";sub(/^[ \t]+/,""); print}' > .mailmap +CONTRIBUTORS: Changelog.md + @if [ -d '.git' ] && command -v git > /dev/null ; then \ + echo 'Updating $@ from git commit history' ; \ + $(PYTHON) devscripts/make_changelog.py -v -c > /dev/null ; \ + fi diff --git a/README.md b/README.md index 9b91775bc7..ca32e09bfb 100644 --- a/README.md +++ b/README.md @@ -12,22 +12,20 @@ [![License: Unlicense](https://img.shields.io/badge/-Unlicense-blue.svg?style=for-the-badge)](LICENSE "License") [![CI Status](https://img.shields.io/github/actions/workflow/status/yt-dlp/yt-dlp/core.yml?branch=master&label=Tests&style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/actions "CI Status") [![Commits](https://img.shields.io/github/commit-activity/m/yt-dlp/yt-dlp?label=commits&style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/commits "Commit History") -[![Last Commit](https://img.shields.io/github/last-commit/yt-dlp/yt-dlp/master?label=&style=for-the-badge&display_timestamp=committer)](https://github.com/yt-dlp/yt-dlp/commits "Commit History") +[![Last Commit](https://img.shields.io/github/last-commit/yt-dlp/yt-dlp/master?label=&style=for-the-badge&display_timestamp=committer)](https://github.com/yt-dlp/yt-dlp/pulse/monthly "Last activity") -yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on the now inactive [youtube-dlc](https://github.com/blackjack4494/yt-dlc). The main focus of this project is adding new features and patches while also keeping up to date with the original project +yt-dlp is a feature-rich command-line audio/video downloader with support for [thousands of sites](supportedsites.md). The project is a fork of [youtube-dl](https://github.com/ytdl-org/youtube-dl) based on the now inactive [youtube-dlc](https://github.com/blackjack4494/yt-dlc). 
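+The same engine is also usable as a Python library; a minimal sketch of embedding it (assuming only the public `YoutubeDL` API described under [EMBEDDING YT-DLP](#embedding-yt-dlp); the URL is just an example):
+
+```python
+# Download one video through yt-dlp's Python API.
+# The options dict accepts the same knobs as the CLI;
+# 'bv*+ba/b' is the default format selector, spelled out here for clarity.
+from yt_dlp import YoutubeDL
+
+with YoutubeDL({'format': 'bv*+ba/b'}) as ydl:
+    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
+```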
-* [NEW FEATURES](#new-features) - * [Differences in default behavior](#differences-in-default-behavior) * [INSTALLATION](#installation) * [Detailed instructions](https://github.com/yt-dlp/yt-dlp/wiki/Installation) - * [Update](#update) * [Release Files](#release-files) + * [Update](#update) * [Dependencies](#dependencies) * [Compile](#compile) * [USAGE AND OPTIONS](#usage-and-options) @@ -49,7 +47,7 @@ * [Extractor Options](#extractor-options) * [CONFIGURATION](#configuration) * [Configuration file encoding](#configuration-file-encoding) - * [Authentication with .netrc file](#authentication-with-netrc-file) + * [Authentication with netrc](#authentication-with-netrc) * [Notes about environment variables](#notes-about-environment-variables) * [OUTPUT TEMPLATE](#output-template) * [Output template examples](#output-template-examples) @@ -65,7 +63,10 @@ * [Developing Plugins](#developing-plugins) * [EMBEDDING YT-DLP](#embedding-yt-dlp) * [Embedding examples](#embedding-examples) -* [DEPRECATED OPTIONS](#deprecated-options) +* [CHANGES FROM YOUTUBE-DL](#changes-from-youtube-dl) + * [New features](#new-features) + * [Differences in default behavior](#differences-in-default-behavior) + * [Deprecated options](#deprecated-options) * [CONTRIBUTING](CONTRIBUTING.md#contributing-to-yt-dlp) * [Opening an Issue](CONTRIBUTING.md#opening-an-issue) * [Developer Instructions](CONTRIBUTING.md#developer-instructions) @@ -74,96 +75,6 @@ -# NEW FEATURES - -* Merged with **youtube-dl v2021.12.17+ [commit/2dd6c6e](https://github.com/ytdl-org/youtube-dl/commit/2dd6c6e)** ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) and **youtube-dlc v2020.11.11-3+ [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) - -* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API - -* **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will be now preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples)) - -* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. 
- -* **YouTube improvements**: - * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, YouTube Music Albums/Channels ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)), and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) - * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** - * Supports some (but not all) age-gated content without cookies - * Download livestreams from the start using `--live-from-start` (*experimental*) - * `255kbps` audio is extracted (if available) from YouTube Music when premium cookies are given - * Channel URLs download all uploads of the channel, including shorts and live - -* **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` - -* **Download time range**: Videos can be downloaded partially based on either timestamps or chapters using `--download-sections` - -* **Split video by chapters**: Videos can be split into multiple files based on chapters using `--split-chapters` - -* **Multi-threaded fragment downloads**: Download multiple fragments of m3u8/mpd videos in parallel. Use `--concurrent-fragments` (`-N`) option to set the number of threads used - -* **Aria2c with HLS/DASH**: You can use `aria2c` as the external downloader for DASH(mpd) and HLS(m3u8) formats - -* **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md) - -* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN etc. - -* **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details - -* **Multiple paths and output templates**: You can give different [output templates](#output-template) and download paths for different types of files. You can also set a temporary path where intermediary files are downloaded to using `--paths` (`-P`) - -* **Portable Configuration**: Configuration files are automatically loaded from the home and root directories. See [CONFIGURATION](#configuration) for details - -* **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` - -* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-on-reject` etc - -* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc - -* **Plugins**: Extractors and PostProcessors can be loaded from an external file. 
See [plugins](#plugins) for details - -* **Self-updater**: The releases can be updated using `yt-dlp -U` - -See [changelog](Changelog.md) or [commits](https://github.com/yt-dlp/yt-dlp/commits) for the full list of changes - -Features marked with a **\*** have been back-ported to youtube-dl - -### Differences in default behavior - -Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc: - -* The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details -* `avconv` is not supported as an alternative to `ffmpeg` -* yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations -* The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s-%(id)s.%(ext)s`. Instead, you may use `--compat-options filename` -* The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order -* The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be preferred. Use `-f bv+ba/b` or `--compat-options format-spec` to revert this -* Unlike youtube-dlc, yt-dlp does not allow merging multiple audio/video streams into one file by default (since this conflicts with the use of `-f bv*+ba`). If needed, this feature must be enabled using `--audio-multistreams` and `--video-multistreams`. You can also use `--compat-options multistreams` to enable both -* `--no-abort-on-error` is enabled by default. Use `--abort-on-error` or `--compat-options abort-on-error` to abort on errors instead -* When writing metadata files such as thumbnails, description or infojson, the same information (if available) is also written for playlists. Use `--no-write-playlist-metafiles` or `--compat-options no-playlist-metafiles` to not write these files -* `--add-metadata` attaches the `infojson` to `mkv` files in addition to writing the metadata when used with `--write-info-json`. Use `--no-embed-info-json` or `--compat-options no-attach-info-json` to revert this -* Some metadata are embedded into different fields when using `--add-metadata` as compared to youtube-dl. Most notably, `comment` field contains the `webpage_url` and `synopsis` contains the `description`. You can [use `--parse-metadata`](#modifying-metadata) to modify this to your liking or use `--compat-options embed-metadata` to revert this -* `playlist_index` behaves differently when used with options like `--playlist-reverse` and `--playlist-items`. See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details. You can use `--compat-options playlist-index` if you want to keep the earlier behavior -* The output of `-F` is listed in a new format. Use `--compat-options list-formats` to revert this -* Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading -* YouTube channel URLs download all uploads of the channel. 
To download only the videos in a specific tab, pass the tab's URL. If the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections -* Unavailable videos are also listed for YouTube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this -* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. -* If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this -* Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead -* Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this -* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this -* `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` -* yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior -* yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [~~aria2c~~](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is - -For ease of use, a few more compat options are available: - -* `--compat-options all`: Use all compat options (Do NOT use) -* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams` -* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect` -* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` -* `--compat-options 2022`: Same as `--compat-options no-external-downloader-progress`. Use this to enable all future compat options - - # INSTALLATION @@ -176,15 +87,7 @@ # INSTALLATION [![All versions](https://img.shields.io/badge/-All_Versions-lightgrey.svg?style=for-the-badge)](https://github.com/yt-dlp/yt-dlp/releases) -You can install yt-dlp using [the binaries](#release-files), [PIP](https://pypi.org/project/yt-dlp) or one using a third-party package manager. 
See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions - - -## UPDATE -You can use `yt-dlp -U` to update if you are [using the release binaries](#release-files) - -If you [installed with PIP](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program - -For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation#third-party-package-managers) or refer their documentation +You can install yt-dlp using [the binaries](#release-files), [pip](https://pypi.org/project/yt-dlp), or a third-party package manager. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation) for detailed instructions @@ -202,10 +105,9 @@ #### Alternatives File|Description :---|:--- -[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Vista SP2+) standalone x86 (32-bit) binary +[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Win7 SP1+) standalone x86 (32-bit) binary [yt-dlp_min.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_min.exe)|Windows (Win7 SP1+) standalone x64 binary built with `py2exe`
([Not recommended](#standalone-py2exe-builds-windows)) [yt-dlp_linux](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux)|Linux standalone x64 binary -[yt-dlp_linux.zip](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux.zip)|Unpackaged Linux executable (no auto-update) [yt-dlp_linux_armv7l](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux_armv7l)|Linux standalone armv7l (32-bit) binary [yt-dlp_linux_aarch64](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_linux_aarch64)|Linux standalone aarch64 (64-bit) binary [yt-dlp_win.zip](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_win.zip)|Unpackaged Windows executable (no auto-update) @@ -218,14 +120,60 @@ #### Misc :---|:--- [yt-dlp.tar.gz](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)|Source tarball [SHA2-512SUMS](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-512SUMS)|GNU-style SHA512 sums +[SHA2-512SUMS.sig](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-512SUMS.sig)|GPG signature file for SHA512 sums [SHA2-256SUMS](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-256SUMS)|GNU-style SHA256 sums +[SHA2-256SUMS.sig](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-256SUMS.sig)|GPG signature file for SHA256 sums + +The public key that can be used to verify the GPG signatures is [available here](https://github.com/yt-dlp/yt-dlp/blob/master/public.key) +Example usage: +``` +curl -L https://github.com/yt-dlp/yt-dlp/raw/master/public.key | gpg --import +gpg --verify SHA2-256SUMS.sig SHA2-256SUMS +gpg --verify SHA2-512SUMS.sig SHA2-512SUMS +``` +**Note**: The manpages, shell completion (autocomplete) files etc. are available inside the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz) -**Note**: The manpages, shell completion files etc. are available in the [source tarball](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz) + +## UPDATE +You can use `yt-dlp -U` to update if you are using the [release binaries](#release-files) + +If you [installed with pip](https://github.com/yt-dlp/yt-dlp/wiki/Installation#with-pip), simply re-run the same command that was used to install the program + +For other third-party package managers, see [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/Installation#third-party-package-managers) or refer to their documentation + + + +There are currently three release channels for binaries: `stable`, `nightly` and `master`. + +* `stable` is the default channel, and many of its changes have been tested by users of the `nightly` and `master` channels. +* The `nightly` channel has releases scheduled to build every day around midnight UTC, for a snapshot of the project's new patches and changes. This is the **recommended channel for regular users** of yt-dlp. The `nightly` releases are available from [yt-dlp/yt-dlp-nightly-builds](https://github.com/yt-dlp/yt-dlp-nightly-builds/releases) or as development releases of the `yt-dlp` PyPI package (which can be installed with pip's `--pre` flag). +* The `master` channel features releases that are built after each push to the master branch, and these will have the very latest fixes and additions, but may also be more prone to regressions. They are available from [yt-dlp/yt-dlp-master-builds](https://github.com/yt-dlp/yt-dlp-master-builds/releases). + +When using `--update`/`-U`, a release binary will only update to its current channel. 
+`--update-to CHANNEL` can be used to switch to a different channel when a newer version is available. `--update-to [CHANNEL@]TAG` can also be used to upgrade or downgrade to specific tags from a channel. + +You may also use `--update-to <repository>` (`<owner>/<repository>`) to update to a channel on a completely different repository. Be careful with what repository you are updating to though, there is no verification done for binaries from different repositories. + +Example usage: + +* `yt-dlp --update-to master` switch to the `master` channel and update to its latest release +* `yt-dlp --update-to stable@2023.07.06` upgrade/downgrade to the `stable` channel release tagged `2023.07.06` +* `yt-dlp --update-to 2023.10.07` upgrade/downgrade to tag `2023.10.07` if it exists on the current channel +* `yt-dlp --update-to example/yt-dlp@2023.09.24` upgrade/downgrade to the release from the `example/yt-dlp` repository, tag `2023.09.24` + +**Important**: Any user experiencing an issue with the `stable` release should install or update to the `nightly` release before submitting a bug report: +``` +# To update to nightly from stable executable/binary: +yt-dlp --update-to nightly + +# To install nightly with pip: +python3 -m pip install -U --pre "yt-dlp[default]" +``` ## DEPENDENCIES -Python versions 3.7+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly. +Python versions 3.8+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly. -# DEPRECATED OPTIONS +# CHANGES FROM YOUTUBE-DL + +### New features + +* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@a08f2b7**](https://github.com/ytdl-org/youtube-dl/commit/a08f2b7e4567cdc50c0614ee0a4ffdff49b8b6e6) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) + +* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API + +* **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will now be preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples)) + +* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details.
+ +* **YouTube improvements**: + * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) + * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) **\*** + * Supports some (but not all) age-gated content without cookies + * Download livestreams from the start using `--live-from-start` (*experimental*) + * Channel URLs download all uploads of the channel, including shorts and live + +* **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]` + +* **Download time range**: Videos can be downloaded partially based on either timestamps or chapters using `--download-sections` + +* **Split video by chapters**: Videos can be split into multiple files based on chapters using `--split-chapters` + +* **Multi-threaded fragment downloads**: Download multiple fragments of m3u8/mpd videos in parallel. Use the `--concurrent-fragments` (`-N`) option to set the number of threads used + +* **Aria2c with HLS/DASH**: You can use `aria2c` as the external downloader for DASH(mpd) and HLS(m3u8) formats + +* **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md) + +* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN etc. + +* **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details + +* **Multiple paths and output templates**: You can give different [output templates](#output-template) and download paths for different types of files. You can also set a temporary path where intermediary files are downloaded to using `--paths` (`-P`) + +* **Portable Configuration**: Configuration files are automatically loaded from the home and root directories. See [CONFIGURATION](#configuration) for details + +* **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` + +* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filters` etc + +* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc + +* **Plugins**: Extractors and PostProcessors can be loaded from an external file.
See [plugins](#plugins) for details + +* **Self updater**: The releases can be updated using `yt-dlp -U`, and downgraded using `--update-to` if required + +* **Automated builds**: [Nightly/master builds](#update-channels) can be used with `--update-to nightly` and `--update-to master` + +See [changelog](Changelog.md) or [commits](https://github.com/yt-dlp/yt-dlp/commits) for the full list of changes + +Features marked with a **\*** have been back-ported to youtube-dl + +### Differences in default behavior + +Some of yt-dlp's default options are different from those of youtube-dl and youtube-dlc: + +* yt-dlp supports only [Python 3.8+](## "Windows 7"), and *may* remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743) +* The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`) no longer work. See [removed options](#Removed) for details +* `avconv` is not supported as an alternative to `ffmpeg` +* yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations +* The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s-%(id)s.%(ext)s`. Instead, you may use `--compat-options filename` +* The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order +* The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be preferred. Use `-f bv+ba/b` or `--compat-options format-spec` to revert this +* Unlike youtube-dlc, yt-dlp does not allow merging multiple audio/video streams into one file by default (since this conflicts with the use of `-f bv*+ba`). If needed, this feature must be enabled using `--audio-multistreams` and `--video-multistreams`. You can also use `--compat-options multistreams` to enable both +* `--no-abort-on-error` is enabled by default. Use `--abort-on-error` or `--compat-options abort-on-error` to abort on errors instead +* When writing metadata files such as thumbnails, description or infojson, the same information (if available) is also written for playlists. Use `--no-write-playlist-metafiles` or `--compat-options no-playlist-metafiles` to not write these files +* `--add-metadata` attaches the `infojson` to `mkv` files in addition to writing the metadata when used with `--write-info-json`. Use `--no-embed-info-json` or `--compat-options no-attach-info-json` to revert this +* Some metadata are embedded into different fields when using `--add-metadata` as compared to youtube-dl. Most notably, the `comment` field contains the `webpage_url` and `synopsis` contains the `description`. You can [use `--parse-metadata`](#modifying-metadata) to modify this to your liking or use `--compat-options embed-metadata` to revert this +* `playlist_index` behaves differently when used with options like `--playlist-reverse` and `--playlist-items`.
See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details. You can use `--compat-options playlist-index` if you want to keep the earlier behavior +* The output of `-F` is listed in a new format. Use `--compat-options list-formats` to revert this +* Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading +* YouTube channel URLs download all uploads of the channel. To download only the videos in a specific tab, pass the tab's URL. If the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections +* Unavailable videos are also listed for YouTube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this +* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. +* If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this +* Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead +* Some internal metadata such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this +* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this +* `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` +* yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior +* ~~yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [aria2c](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is~~ +* yt-dlp versions between 2021.09.01 and 2023.01.02 applied `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp versions between 2021.11.10 and 2023.06.21 estimated `filesize_approx` values for fragmented/manifest formats.
This was added for convenience in [f2fe69](https://github.com/yt-dlp/yt-dlp/commit/f2fe69c7b0d208bdb1f6292b4ae92bc1e1a7444a), but was reverted in [0dff8e](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) due to the potentially extreme inaccuracy of the estimated values. Use `--compat-options manifest-filesize-approx` to keep extracting the estimated values +* yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to use the legacy http handler (`urllib`) for standard http requests. +* The sub-modules `swfinterp` and `casefold` are removed. +* Passing `--simulate` (or calling `extract_info` with `download=False`) no longer alters the default format selection. See [#9843](https://github.com/yt-dlp/yt-dlp/issues/9843) for details. + +For ease of use, a few more compat options are available: + +* `--compat-options all`: Use all compat options (**Do NOT use this!**) +* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` +* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` +* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` +* `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` +* `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options + +The following compat options restore vulnerable behavior from before security patches: + +* `--compat-options allow-unsafe-ext`: Allow files with any extension (including unsafe ones) to be downloaded ([GHSA-79w7-vh3h-8g4j](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)) + + > :warning: Only use if a valid file download is rejected because its extension is detected as uncommon + > + > **This option can enable remote code execution! Consider [opening an issue](CONTRIBUTING.md#opening-an-issue) instead!** + +### Deprecated options These are all the deprecated options and the current alternative to achieve the same effect @@ -2099,13 +2270,14 @@ #### Redundant options --reject-title REGEX --match-filter "title !~= (?i)REGEX" --min-views COUNT --match-filter "view_count >=? COUNT" --max-views COUNT --match-filter "view_count <=?
COUNT" + --break-on-reject Use --break-match-filter --user-agent UA --add-header "User-Agent:UA" --referer URL --add-header "Referer:URL" --playlist-start NUMBER -I NUMBER: --playlist-end NUMBER -I :NUMBER --playlist-reverse -I ::-1 --no-playlist-reverse Default - + --no-colors --color no_color #### Not recommended While these options still work, their use is not recommended since there are other alternatives to achieve the same @@ -2128,7 +2300,10 @@ #### Not recommended --youtube-skip-hls-manifest --extractor-args "youtube:skip=hls" (Alias: --no-youtube-include-hls-manifest) --youtube-include-dash-manifest Default (Alias: --no-youtube-skip-dash-manifest) --youtube-include-hls-manifest Default (Alias: --no-youtube-skip-hls-manifest) - + --geo-bypass --xff "default" + --no-geo-bypass --xff "never" + --geo-bypass-country CODE --xff CODE + --geo-bypass-ip-block IP_BLOCK --xff IP_BLOCK #### Developer options These options are not intended to be used by the end-user @@ -2139,7 +2314,6 @@ #### Developer options --allow-unplayable-formats List unplayable formats also --no-allow-unplayable-formats Default - #### Old aliases These are aliases that are no longer documented for various reasons @@ -2185,6 +2359,7 @@ #### No longer supported --write-annotations No supported site has annotations now --no-write-annotations Default --compat-options seperate-video-versions No longer needed + --compat-options no-youtube-prefer-utc-upload-date No longer supported #### Removed These options were deprecated since 2014 and have now been entirely removed @@ -2192,6 +2367,7 @@ #### Removed -A, --auto-number -o "%(autonumber)s-%(id)s.%(ext)s" -t, -l, --title, --literal -o "%(title)s-%(id)s.%(ext)s" + # CONTRIBUTING See [CONTRIBUTING.md](CONTRIBUTING.md#contributing-to-yt-dlp) for instructions on [Opening an Issue](CONTRIBUTING.md#opening-an-issue) and [Contributing code to the project](CONTRIBUTING.md#developer-instructions) diff --git a/bundle/__init__.py b/bundle/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bundle/docker/compose.yml b/bundle/docker/compose.yml new file mode 100644 index 0000000000..5f89ca6d09 --- /dev/null +++ b/bundle/docker/compose.yml @@ -0,0 +1,10 @@ +services: + static: + build: static + environment: + channel: ${channel} + origin: ${origin} + version: ${version} + volumes: + - ~/build:/build + - ../..:/yt-dlp diff --git a/bundle/docker/static/Dockerfile b/bundle/docker/static/Dockerfile new file mode 100644 index 0000000000..dae2dff3d8 --- /dev/null +++ b/bundle/docker/static/Dockerfile @@ -0,0 +1,21 @@ +FROM alpine:3.19 as base + +RUN apk --update add --no-cache \ + build-base \ + python3 \ + pipx \ + ; + +RUN pipx install pyinstaller +# Requires above step to prepare the shared venv +RUN ~/.local/share/pipx/shared/bin/python -m pip install -U wheel +RUN apk --update add --no-cache \ + scons \ + patchelf \ + binutils \ + ; +RUN pipx install staticx + +WORKDIR /yt-dlp +COPY entrypoint.sh /entrypoint.sh +ENTRYPOINT /entrypoint.sh diff --git a/bundle/docker/static/entrypoint.sh b/bundle/docker/static/entrypoint.sh new file mode 100755 index 0000000000..2202759742 --- /dev/null +++ b/bundle/docker/static/entrypoint.sh @@ -0,0 +1,13 @@ +#!/bin/ash +set -e + +source ~/.local/share/pipx/venvs/pyinstaller/bin/activate +python -m devscripts.install_deps --include secretstorage --include curl-cffi +python -m devscripts.make_lazy_extractors +python devscripts/update-version.py -c "${channel}" -r "${origin}" "${version}" +python -m bundle.pyinstaller +deactivate + 
+source ~/.local/share/pipx/venvs/staticx/bin/activate +staticx /yt-dlp/dist/yt-dlp_linux /build/yt-dlp_linux +deactivate diff --git a/bundle/py2exe.py b/bundle/py2exe.py new file mode 100755 index 0000000000..5b7f4883bc --- /dev/null +++ b/bundle/py2exe.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Allow execution from anywhere +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import warnings + +from py2exe import freeze + +from devscripts.utils import read_version + +VERSION = read_version() + + +def main(): + warnings.warn( + 'py2exe builds do not support pycryptodomex and need VC++14 to run. ' + 'It is recommended to run "bundle/pyinstaller.py" to build using pyinstaller instead') + + freeze( + console=[{ + 'script': './yt_dlp/__main__.py', + 'dest_base': 'yt-dlp', + 'icon_resources': [(1, 'devscripts/logo.ico')], + }], + version_info={ + 'version': VERSION, + 'description': 'A feature-rich command-line audio/video downloader', + 'comments': 'Official repository: https://github.com/yt-dlp/yt-dlp', + 'product_name': 'yt-dlp', + 'product_version': VERSION, + }, + options={ + 'bundle_files': 0, + 'compressed': 1, + 'optimize': 2, + 'dist_dir': './dist', + 'excludes': [ + # py2exe cannot import Crypto + 'Crypto', + 'Cryptodome', + # requests >=2.32.0 breaks py2exe builds due to certifi dependency + 'requests', + 'urllib3', + ], + 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], + # Modules that are only imported dynamically must be added here + 'includes': ['yt_dlp.compat._legacy', 'yt_dlp.compat._deprecated', + 'yt_dlp.utils._legacy', 'yt_dlp.utils._deprecated'], + }, + zipfile=None, + ) + + +if __name__ == '__main__': + main() diff --git a/pyinst.py b/bundle/pyinstaller.py old mode 100644 new mode 100755 similarity index 94% rename from pyinst.py rename to bundle/pyinstaller.py index c36f6acd4f..4184c4bc9f --- a/pyinst.py +++ b/bundle/pyinstaller.py @@ -4,7 +4,7 @@ import os import sys -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import platform @@ -68,7 +68,7 @@ def exe(onedir): 'dist/', onedir and f'{name}/', name, - OS_NAME == 'win32' and '.exe' + OS_NAME == 'win32' and '.exe', ))) @@ -113,7 +113,7 @@ def windows_set_version(exe, version): ), kids=[ StringFileInfo([StringTable('040904B0', [ - StringStruct('Comments', 'yt-dlp%s Command Line Interface' % suffix), + StringStruct('Comments', f'yt-dlp{suffix} Command Line Interface'), StringStruct('CompanyName', 'https://github.com/yt-dlp'), StringStruct('FileDescription', 'yt-dlp%s' % (MACHINE and f' ({MACHINE})')), StringStruct('FileVersion', version), @@ -123,8 +123,8 @@ def windows_set_version(exe, version): StringStruct('ProductName', f'yt-dlp{suffix}'), StringStruct( 'ProductVersion', f'{version}{suffix} on Python {platform.python_version()}'), - ])]), VarFileInfo([VarStruct('Translation', [0, 1200])]) - ] + ])]), VarFileInfo([VarStruct('Translation', [0, 1200])]), + ], )) diff --git a/devscripts/SizeOfImage.patch b/devscripts/SizeOfImage.patch deleted file mode 100644 index d5845af464..0000000000 Binary files a/devscripts/SizeOfImage.patch and /dev/null differ diff --git a/devscripts/SizeOfImage_w.patch b/devscripts/SizeOfImage_w.patch deleted file mode 100644 index c1a338ff3e..0000000000 Binary files a/devscripts/SizeOfImage_w.patch and /dev/null differ diff --git a/devscripts/__init__.py b/devscripts/__init__.py index 750dbdca78..e69de29bb2 100644 --- a/devscripts/__init__.py +++ b/devscripts/__init__.py @@ -1
+0,0 @@ -# Empty file needed to make devscripts.utils properly importable from outside diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py index 9b4a9d4e2f..3918ebde86 100755 --- a/devscripts/bash-completion.py +++ b/devscripts/bash-completion.py @@ -9,8 +9,8 @@ import yt_dlp -BASH_COMPLETION_FILE = "completions/bash/yt-dlp" -BASH_COMPLETION_TEMPLATE = "devscripts/bash-completion.in" +BASH_COMPLETION_FILE = 'completions/bash/yt-dlp' +BASH_COMPLETION_TEMPLATE = 'devscripts/bash-completion.in' def build_completion(opt_parser): @@ -21,9 +21,9 @@ def build_completion(opt_parser): opts_flag.append(option.get_opt_string()) with open(BASH_COMPLETION_TEMPLATE) as f: template = f.read() - with open(BASH_COMPLETION_FILE, "w") as f: + with open(BASH_COMPLETION_FILE, 'w') as f: # just using the special char - filled_template = template.replace("{{flags}}", " ".join(opts_flag)) + filled_template = template.replace('{{flags}}', ' '.join(opts_flag)) f.write(filled_template) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json new file mode 100644 index 0000000000..5189de2d77 --- /dev/null +++ b/devscripts/changelog_override.json @@ -0,0 +1,189 @@ +[ + { + "action": "add", + "when": "29cb20bd563c02671b31dd840139e93dd37150a1", + "short": "[priority] **A new release type has been added!**\n * [`nightly`](https://github.com/yt-dlp/yt-dlp/releases/tag/nightly) builds will be made after each push, containing the latest fixes (but also possibly bugs).\n * When using `--update`/`-U`, a release binary will only update to its current channel (either `stable` or `nightly`).\n * The `--update-to` option has been added allowing the user more control over program upgrades (or downgrades).\n * `--update-to` can change the release channel (`stable`, `nightly`) and also upgrade or downgrade to specific tags.\n * **Usage**: `--update-to CHANNEL`, `--update-to TAG`, `--update-to CHANNEL@TAG`" + }, + { + "action": "add", + "when": "5038f6d713303e0967d002216e7a88652401c22a", + "short": "[priority] **YouTube throttling fixes!**" + }, + { + "action": "remove", + "when": "2e023649ea4e11151545a34dc1360c114981a236" + }, + { + "action": "add", + "when": "01aba2519a0884ef17d5f85608dbd2a455577147", + "short": "[priority] YouTube: Improved throttling and signature fixes" + }, + { + "action": "change", + "when": "c86e433c35fe5da6cb29f3539eef97497f84ed38", + "short": "[extractor/niconico:series] Fix extraction (#6898)", + "authors": ["sqrtNOT"] + }, + { + "action": "change", + "when": "69a40e4a7f6caa5662527ebd2f3c4e8aa02857a2", + "short": "[extractor/youtube:music_search_url] Extract title (#7102)", + "authors": ["kangalio"] + }, + { + "action": "change", + "when": "8417f26b8a819cd7ffcd4e000ca3e45033e670fb", + "short": "Add option `--color` (#6904)", + "authors": ["Grub4K"] + }, + { + "action": "change", + "when": "b4e0d75848e9447cee2cd3646ce54d4744a7ff56", + "short": "Improve `--download-sections`\n - Support negative time-ranges\n - Add `*from-url` to obey time-ranges in URL", + "authors": ["pukkandan"] + }, + { + "action": "change", + "when": "1e75d97db21152acc764b30a688e516f04b8a142", + "short": "[extractor/youtube] Add `ios` to default clients used\n - IOS is affected neither by 403 nor by nsig so helps mitigate them preemptively\n - IOS also has higher bit-rate 'premium' formats though they are not labeled as such", + "authors": ["pukkandan"] + }, + { + "action": "change", + "when": "f2ff0f6f1914b82d4a51681a72cc0828115dcb4a", + "short": "[extractor/motherless] Add gallery 
support, fix groups (#7211)", + "authors": ["rexlambert22", "Ti4eeT4e"] + }, + { + "action": "change", + "when": "a4486bfc1dc7057efca9dd3fe70d7fa25c56f700", + "short": "[misc] Revert \"Add automatic duplicate issue detection\"", + "authors": ["pukkandan"] + }, + { + "action": "add", + "when": "1ceb657bdd254ad961489e5060f2ccc7d556b729", + "short": "[priority] Security: [[CVE-2023-35934](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-35934)] Fix [Cookie leak](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj)\n - `--add-header Cookie:` is deprecated and auto-scoped to input URL domains\n - Cookies are scoped when passed to external downloaders\n - Add `cookies` field to info.json and deprecate `http_headers.Cookie`" + }, + { + "action": "change", + "when": "b03fa7834579a01cc5fba48c0e73488a16683d48", + "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b", + "authors": ["pukkandan"] + }, + { + "action": "change", + "when": "fcd6a76adc49d5cd8783985c7ce35384b72e545f", + "short": "[test] Add tests for socks proxies (#7908)", + "authors": ["coletdjnz"] + }, + { + "action": "change", + "when": "4bf912282a34b58b6b35d8f7e6be535770c89c76", + "short": "[rh:urllib] Remove dot segments during URL normalization (#7662)", + "authors": ["coletdjnz"] + }, + { + "action": "change", + "when": "59e92b1f1833440bb2190f847eb735cf0f90bc85", + "short": "[rh:urllib] Simplify gzip decoding (#7611)", + "authors": ["Grub4K"] + }, + { + "action": "add", + "when": "c1d71d0d9f41db5e4306c86af232f5f6220a130b", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.8**\nSince Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803)" + }, + { + "action": "add", + "when": "61bdf15fc7400601c3da1aa7a43917310a5bf391", + "short": "[priority] Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg)\n - The shell escape function is now using `\"\"` instead of `\\\"`.\n - `utils.Popen` has been patched to properly quote commands." + }, + { + "action": "change", + "when": "8a8b54523addf46dfd50ef599761a81bc22362e6", + "short": "[rh:requests] Add handler for `requests` HTTP library (#3668)\n\n\tAdds support for HTTPS proxies and persistent connections (keep-alive)", + "authors": ["bashonly", "coletdjnz", "Grub4K"] + }, + { + "action": "add", + "when": "1d03633c5a1621b9f3a756f0a4f9dc61fab3aeaa", + "short": "[priority] **The release channels have been adjusted!**\n\t* [`master`](https://github.com/yt-dlp/yt-dlp-master-builds) builds are made after each push, containing the latest fixes (but also possibly bugs). This was previously the `nightly` channel.\n\t* [`nightly`](https://github.com/yt-dlp/yt-dlp-nightly-builds) builds are now made once a day, if there were any changes." 
+ }, + { + "action": "add", + "when": "f04b5bedad7b281bee9814686bba1762bae092eb", + "short": "[priority] Security: [[CVE-2023-46121](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-46121)] Patch [Generic Extractor MITM Vulnerability via Arbitrary Proxy Injection](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3ch3-jhc6-5r8x)\n\t- Disallow smuggling of arbitrary `http_headers`; extractors now only use specific headers" + }, + { + "action": "change", + "when": "15f22b4880b6b3f71f350c64d70976ae65b9f1ca", + "short": "[webvtt] Allow spaces before newlines for CueBlock (#7681)", + "authors": ["TSRBerry"] + }, + { + "action": "change", + "when": "4ce57d3b873c2887814cbec03d029533e82f7db5", + "short": "[ie] Support multi-period MPD streams (#6654)", + "authors": ["alard", "pukkandan"] + }, + { + "action": "change", + "when": "aa7e9ae4f48276bd5d0173966c77db9484f65a0a", + "short": "[ie/xvideos] Support new URL format (#9502)", + "authors": ["sta1us"] + }, + { + "action": "remove", + "when": "22e4dfacb61f62dfbb3eb41b31c7b69ba1059b80" + }, + { + "action": "change", + "when": "e3a3ed8a981d9395c4859b6ef56cd02bc3148db2", + "short": "[cleanup:ie] No `from` stdlib imports in extractors", + "authors": ["pukkandan"] + }, + { + "action": "add", + "when": "9590cc6b4768e190183d7d071a6c78170889116a", + "short": "[priority] Security: [[CVE-2024-22423](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-22423)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-hjq6-52gw-2g7p)\n - The shell escape function now properly escapes `%`, `\\` and `\\n`.\n - `utils.Popen` has been patched accordingly." + }, + { + "action": "change", + "when": "41ba4a808b597a3afed78c89675a30deb6844450", + "short": "[ie/tiktok] Extract via mobile API only if extractor-arg is passed (#9938)", + "authors": ["bashonly"] + }, + { + "action": "remove", + "when": "6e36d17f404556f0e3a43f441c477a71a91877d9" + }, + { + "action": "change", + "when": "beaf832c7a9d57833f365ce18f6115b88071b296", + "short": "[ie/soundcloud] Add `formats` extractor-arg (#10004)", + "authors": ["bashonly", "Grub4K"] + }, + { + "action": "change", + "when": "5c019f6328ad40d66561eac3c4de0b3cd070d0f6", + "short": "[cleanup] Misc (#9765)", + "authors": ["bashonly", "Grub4K", "seproDev"] + }, + { + "action": "change", + "when": "e6a22834df1776ec4e486526f6df2bf53cb7e06f", + "short": "[ie/orf:on] Add `prefer_segments_playlist` extractor-arg (#10314)", + "authors": ["seproDev"] + }, + { + "action": "add", + "when": "6aaf96a3d6e7d0d426e97e11a2fcf52fda00e733", + "short": "[priority] Security: [[CVE-2024-38519](https://nvd.nist.gov/vuln/detail/CVE-2024-38519)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)\n - Unsafe extensions are now blocked from being downloaded" + }, + { + "action": "add", + "when": "6075a029dba70a89675ae1250e7cdfd91f0eba41", + "short": "[priority] Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785)\n - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors" + } +] diff --git a/devscripts/changelog_override.schema.json b/devscripts/changelog_override.schema.json new file mode 100644 index 0000000000..9bd747b701 --- /dev/null +++ b/devscripts/changelog_override.schema.json @@ -0,0 +1,96 @@ +{ + "$schema": 
"http://json-schema.org/draft/2020-12/schema", + "type": "array", + "uniqueItems": true, + "items": { + "type": "object", + "oneOf": [ + { + "type": "object", + "properties": { + "action": { + "enum": [ + "add" + ] + }, + "when": { + "type": "string", + "pattern": "^([0-9a-f]{40}|\\d{4}\\.\\d{2}\\.\\d{2})$" + }, + "hash": { + "type": "string", + "pattern": "^[0-9a-f]{40}$" + }, + "short": { + "type": "string" + }, + "authors": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "action", + "short" + ] + }, + { + "type": "object", + "properties": { + "action": { + "enum": [ + "remove" + ] + }, + "when": { + "type": "string", + "pattern": "^([0-9a-f]{40}|\\d{4}\\.\\d{2}\\.\\d{2})$" + }, + "hash": { + "type": "string", + "pattern": "^[0-9a-f]{40}$" + } + }, + "required": [ + "action", + "hash" + ] + }, + { + "type": "object", + "properties": { + "action": { + "enum": [ + "change" + ] + }, + "when": { + "type": "string", + "pattern": "^([0-9a-f]{40}|\\d{4}\\.\\d{2}\\.\\d{2})$" + }, + "hash": { + "type": "string", + "pattern": "^[0-9a-f]{40}$" + }, + "short": { + "type": "string" + }, + "authors": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "action", + "hash", + "short", + "authors" + ] + } + ] + } +} diff --git a/devscripts/cli_to_api.py b/devscripts/cli_to_api.py new file mode 100755 index 0000000000..9c2710e09f --- /dev/null +++ b/devscripts/cli_to_api.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import yt_dlp +import yt_dlp.options + +create_parser = yt_dlp.options.create_parser + + +def parse_patched_options(opts): + patched_parser = create_parser() + patched_parser.defaults.update({ + 'ignoreerrors': False, + 'retries': 0, + 'fragment_retries': 0, + 'extract_flat': False, + 'concat_playlist': 'never', + }) + yt_dlp.options.create_parser = lambda: patched_parser + try: + return yt_dlp.parse_options(opts) + finally: + yt_dlp.options.create_parser = create_parser + + +default_opts = parse_patched_options([]).ydl_opts + + +def cli_to_api(opts, cli_defaults=False): + opts = (yt_dlp.parse_options if cli_defaults else parse_patched_options)(opts).ydl_opts + + diff = {k: v for k, v in opts.items() if default_opts[k] != v} + if 'postprocessors' in diff: + diff['postprocessors'] = [pp for pp in diff['postprocessors'] + if pp not in default_opts['postprocessors']] + return diff + + +if __name__ == '__main__': + from pprint import pprint + + print('\nThe arguments passed translate to:\n') + pprint(cli_to_api(sys.argv[1:])) + print('\nCombining these with the CLI defaults gives:\n') + pprint(cli_to_api(sys.argv[1:], True)) diff --git a/devscripts/install_deps.py b/devscripts/install_deps.py new file mode 100755 index 0000000000..d292505458 --- /dev/null +++ b/devscripts/install_deps.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# Allow execution from anywhere +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import argparse +import re +import subprocess + +from pathlib import Path + +from devscripts.tomlparse import parse_toml +from devscripts.utils import read_file + + +def parse_args(): + parser = argparse.ArgumentParser(description='Install dependencies for yt-dlp') + parser.add_argument( + 'input', nargs='?', metavar='TOMLFILE', default=Path(__file__).parent.parent / 'pyproject.toml', + help='input file (default: %(default)s)') + 
parser.add_argument( + '-e', '--exclude', metavar='DEPENDENCY', action='append', + help='exclude a dependency') + parser.add_argument( + '-i', '--include', metavar='GROUP', action='append', + help='include an optional dependency group') + parser.add_argument( + '-o', '--only-optional', action='store_true', + help='only install optional dependencies') + parser.add_argument( + '-p', '--print', action='store_true', + help='only print requirements to stdout') + parser.add_argument( + '-u', '--user', action='store_true', + help='install with pip as --user') + return parser.parse_args() + + +def main(): + args = parse_args() + project_table = parse_toml(read_file(args.input))['project'] + recursive_pattern = re.compile(rf'{project_table["name"]}\[(?P[\w-]+)\]') + optional_groups = project_table['optional-dependencies'] + excludes = args.exclude or [] + + def yield_deps(group): + for dep in group: + if mobj := recursive_pattern.fullmatch(dep): + yield from optional_groups.get(mobj.group('group_name'), []) + else: + yield dep + + targets = [] + if not args.only_optional: # `-o` should exclude 'dependencies' and the 'default' group + targets.extend(project_table['dependencies']) + if 'default' not in excludes: # `--exclude default` should exclude entire 'default' group + targets.extend(yield_deps(optional_groups['default'])) + + for include in filter(None, map(optional_groups.get, args.include or [])): + targets.extend(yield_deps(include)) + + targets = [t for t in targets if re.match(r'[\w-]+', t).group(0).lower() not in excludes] + + if args.print: + for target in targets: + print(target) + return + + pip_args = [sys.executable, '-m', 'pip', 'install', '-U'] + if args.user: + pip_args.append('--user') + pip_args.extend(targets) + + return subprocess.call(pip_args) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index c8815e01bc..6f52165c5c 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -6,6 +6,7 @@ age_restricted, bug_reports_message, classproperty, + variadic, write_string, ) diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py new file mode 100644 index 0000000000..00634fb911 --- /dev/null +++ b/devscripts/make_changelog.py @@ -0,0 +1,510 @@ +from __future__ import annotations + +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import enum +import itertools +import json +import logging +import re +from collections import defaultdict +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path + +from devscripts.utils import read_file, run_process, write_file + +BASE_URL = 'https://github.com' +LOCATION_PATH = Path(__file__).parent +HASH_LENGTH = 7 + +logger = logging.getLogger(__name__) + + +class CommitGroup(enum.Enum): + PRIORITY = 'Important' + CORE = 'Core' + EXTRACTOR = 'Extractor' + DOWNLOADER = 'Downloader' + POSTPROCESSOR = 'Postprocessor' + NETWORKING = 'Networking' + MISC = 'Misc.' 
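As an aside, the Docker entrypoint earlier in this patch drives `devscripts/install_deps.py` in exactly this way; a sketch, assuming a yt-dlp checkout as the working directory:

```python
import subprocess
import sys

# Mirrors `python -m devscripts.install_deps --include secretstorage
# --include curl-cffi` from bundle/docker/static/entrypoint.sh; add
# `--print` (or `-p`) to list the resolved requirements instead of
# installing them.
subprocess.check_call([
    sys.executable, '-m', 'devscripts.install_deps',
    '--include', 'secretstorage', '--include', 'curl-cffi',
])
```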
+ + @classmethod + @lru_cache + def subgroup_lookup(cls): + return { + name: group + for group, names in { + cls.MISC: { + 'build', + 'ci', + 'cleanup', + 'devscripts', + 'docs', + 'test', + }, + cls.NETWORKING: { + 'rh', + }, + }.items() + for name in names + } + + @classmethod + @lru_cache + def group_lookup(cls): + result = { + 'fd': cls.DOWNLOADER, + 'ie': cls.EXTRACTOR, + 'pp': cls.POSTPROCESSOR, + 'upstream': cls.CORE, + } + result.update({item.name.lower(): item for item in iter(cls)}) + return result + + @classmethod + def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: + group, _, subgroup = (group.strip().lower() for group in value.partition('/')) + + result = cls.group_lookup().get(group) + if not result: + if subgroup: + return None, value + subgroup = group + result = cls.subgroup_lookup().get(subgroup) + + return result, subgroup or None + + +@dataclass +class Commit: + hash: str | None + short: str + authors: list[str] + + def __str__(self): + result = f'{self.short!r}' + + if self.hash: + result += f' ({self.hash[:HASH_LENGTH]})' + + if self.authors: + authors = ', '.join(self.authors) + result += f' by {authors}' + + return result + + +@dataclass +class CommitInfo: + details: str | None + sub_details: tuple[str, ...] + message: str + issues: list[str] + commit: Commit + fixes: list[Commit] + + def key(self): + return ((self.details or '').lower(), self.sub_details, self.message) + + +def unique(items): + return sorted({item.strip().lower(): item for item in items if item}.values()) + + +class Changelog: + MISC_RE = re.compile(r'(?:^|\b)(?:lint(?:ing)?|misc|format(?:ting)?|fixes)(?:\b|$)', re.IGNORECASE) + ALWAYS_SHOWN = (CommitGroup.PRIORITY,) + + def __init__(self, groups, repo, collapsible=False): + self._groups = groups + self._repo = repo + self._collapsible = collapsible + + def __str__(self): + return '\n'.join(self._format_groups(self._groups)).replace('\t', ' ') + + def _format_groups(self, groups): + first = True + for item in CommitGroup: + if self._collapsible and item not in self.ALWAYS_SHOWN and first: + first = False + yield '\n
<details><summary><h3>Changelog</h3></summary>\n' + + group = groups[item] + if group: + yield self.format_module(item.value, group) + + if self._collapsible: + yield '\n</details>
' + + def format_module(self, name, group): + result = f'\n#### {name} changes\n' if name else '\n' + return result + '\n'.join(self._format_group(group)) + + def _format_group(self, group): + sorted_group = sorted(group, key=CommitInfo.key) + detail_groups = itertools.groupby(sorted_group, lambda item: (item.details or '').lower()) + for _, items in detail_groups: + items = list(items) + details = items[0].details + + if details == 'cleanup': + items = self._prepare_cleanup_misc_items(items) + + prefix = '-' + if details: + if len(items) == 1: + prefix = f'- **{details}**:' + else: + yield f'- **{details}**' + prefix = '\t-' + + sub_detail_groups = itertools.groupby(items, lambda item: tuple(map(str.lower, item.sub_details))) + for sub_details, entries in sub_detail_groups: + if not sub_details: + for entry in entries: + yield f'{prefix} {self.format_single_change(entry)}' + continue + + entries = list(entries) + sub_prefix = f'{prefix} {", ".join(entries[0].sub_details)}' + if len(entries) == 1: + yield f'{sub_prefix}: {self.format_single_change(entries[0])}' + continue + + yield sub_prefix + for entry in entries: + yield f'\t{prefix} {self.format_single_change(entry)}' + + def _prepare_cleanup_misc_items(self, items): + cleanup_misc_items = defaultdict(list) + sorted_items = [] + for item in items: + if self.MISC_RE.search(item.message): + cleanup_misc_items[tuple(item.commit.authors)].append(item) + else: + sorted_items.append(item) + + for commit_infos in cleanup_misc_items.values(): + sorted_items.append(CommitInfo( + 'cleanup', ('Miscellaneous',), ', '.join( + self._format_message_link(None, info.commit.hash) + for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')), + [], Commit(None, '', commit_infos[0].commit.authors), [])) + + return sorted_items + + def format_single_change(self, info: CommitInfo): + message, sep, rest = info.message.partition('\n') + if '[' not in message: + # If the message doesn't already contain markdown links, try to add a link to the commit + message = self._format_message_link(message, info.commit.hash) + + if info.issues: + message = f'{message} ({self._format_issues(info.issues)})' + + if info.commit.authors: + message = f'{message} by {self._format_authors(info.commit.authors)}' + + if info.fixes: + fix_message = ', '.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes) + + authors = sorted({author for fix in info.fixes for author in fix.authors}, key=str.casefold) + if authors != info.commit.authors: + fix_message = f'{fix_message} by {self._format_authors(authors)}' + + message = f'{message} (With fixes in {fix_message})' + + return message if not sep else f'{message}{sep}{rest}' + + def _format_message_link(self, message, commit_hash): + assert message or commit_hash, 'Improperly defined commit message or override' + message = message if message else commit_hash[:HASH_LENGTH] + return f'[{message}]({self.repo_url}/commit/{commit_hash})' if commit_hash else message + + def _format_issues(self, issues): + return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues) + + @staticmethod + def _format_authors(authors): + return ', '.join(f'[{author}]({BASE_URL}/{author})' for author in authors) + + @property + def repo_url(self): + return f'{BASE_URL}/{self._repo}' + + +class CommitRange: + COMMAND = 'git' + COMMIT_SEPARATOR = '-----' + + AUTHOR_INDICATOR_RE = re.compile(r'Authored by:? ', re.IGNORECASE) + MESSAGE_RE = re.compile(r''' + (?:\[(?P[^\]]+)\]\ )? + (?:(?P`?[\w.-]+`?): )? + (?P.+?) 
+ (?:\ \((?P\#\d+(?:,\ \#\d+)*)\))? + ''', re.VERBOSE | re.DOTALL) + EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE) + REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})') + FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert|Improve)\s+([\da-f]{40})') + UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)') + + def __init__(self, start, end, default_author=None): + self._start, self._end = start, end + self._commits, self._fixes = self._get_commits_and_fixes(default_author) + self._commits_added = [] + + def __iter__(self): + return iter(itertools.chain(self._commits.values(), self._commits_added)) + + def __len__(self): + return len(self._commits) + len(self._commits_added) + + def __contains__(self, commit): + if isinstance(commit, Commit): + if not commit.hash: + return False + commit = commit.hash + + return commit in self._commits + + def _get_commits_and_fixes(self, default_author): + result = run_process( + self.COMMAND, 'log', f'--format=%H%n%s%n%b%n{self.COMMIT_SEPARATOR}', + f'{self._start}..{self._end}' if self._start else self._end).stdout + + commits, reverts = {}, {} + fixes = defaultdict(list) + lines = iter(result.splitlines(False)) + for i, commit_hash in enumerate(lines): + short = next(lines) + skip = short.startswith('Release ') or short == '[version] update' + + authors = [default_author] if default_author else [] + for line in iter(lambda: next(lines), self.COMMIT_SEPARATOR): + match = self.AUTHOR_INDICATOR_RE.match(line) + if match: + authors = sorted(map(str.strip, line[match.end():].split(',')), key=str.casefold) + + commit = Commit(commit_hash, short, authors) + if skip and (self._start or not i): + logger.debug(f'Skipped commit: {commit}') + continue + elif skip: + logger.debug(f'Reached Release commit, breaking: {commit}') + break + + revert_match = self.REVERT_RE.fullmatch(commit.short) + if revert_match: + reverts[revert_match.group(1)] = commit + continue + + fix_match = self.FIXES_RE.search(commit.short) + if fix_match: + commitish = fix_match.group(1) + fixes[commitish].append(commit) + + commits[commit.hash] = commit + + for commitish, revert_commit in reverts.items(): + reverted = commits.pop(commitish, None) + if reverted: + logger.debug(f'{commitish} fully reverted {reverted}') + else: + commits[revert_commit.hash] = revert_commit + + for commitish, fix_commits in fixes.items(): + if commitish in commits: + hashes = ', '.join(commit.hash[:HASH_LENGTH] for commit in fix_commits) + logger.info(f'Found fix(es) for {commitish[:HASH_LENGTH]}: {hashes}') + for fix_commit in fix_commits: + del commits[fix_commit.hash] + else: + logger.debug(f'Commit with fixes not in changes: {commitish[:HASH_LENGTH]}') + + return commits, fixes + + def apply_overrides(self, overrides): + for override in overrides: + when = override.get('when') + if when and when not in self and when != self._start: + logger.debug(f'Ignored {when!r} override') + continue + + override_hash = override.get('hash') or when + if override['action'] == 'add': + commit = Commit(override.get('hash'), override['short'], override.get('authors') or []) + logger.info(f'ADD {commit}') + self._commits_added.append(commit) + + elif override['action'] == 'remove': + if override_hash in self._commits: + logger.info(f'REMOVE {self._commits[override_hash]}') + del self._commits[override_hash] + + elif override['action'] == 'change': + if override_hash not in self._commits: + continue + commit = Commit(override_hash, 
override['short'], override.get('authors') or []) + logger.info(f'CHANGE {self._commits[commit.hash]} -> {commit}') + self._commits[commit.hash] = commit + + self._commits = dict(reversed(self._commits.items())) + + def groups(self): + group_dict = defaultdict(list) + for commit in self: + upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short) + if upstream_re: + commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}' + + match = self.MESSAGE_RE.fullmatch(commit.short) + if not match: + logger.error(f'Error parsing short commit message: {commit.short!r}') + continue + + prefix, sub_details_alt, message, issues = match.groups() + issues = [issue.strip()[1:] for issue in issues.split(',')] if issues else [] + + if prefix: + groups, details, sub_details = zip(*map(self.details_from_prefix, prefix.split(','))) + group = next(iter(filter(None, groups)), None) + details = ', '.join(unique(details)) + sub_details = list(itertools.chain.from_iterable(sub_details)) + else: + group = CommitGroup.CORE + details = None + sub_details = [] + + if sub_details_alt: + sub_details.append(sub_details_alt) + sub_details = tuple(unique(sub_details)) + + if not group: + if self.EXTRACTOR_INDICATOR_RE.search(commit.short): + group = CommitGroup.EXTRACTOR + logger.error(f'Assuming [ie] group for {commit.short!r}') + else: + group = CommitGroup.CORE + + commit_info = CommitInfo( + details, sub_details, message.strip(), + issues, commit, self._fixes[commit.hash]) + + logger.debug(f'Resolved {commit.short!r} to {commit_info!r}') + group_dict[group].append(commit_info) + + return group_dict + + @staticmethod + def details_from_prefix(prefix): + if not prefix: + return CommitGroup.CORE, None, () + + prefix, *sub_details = prefix.split(':') + + group, details = CommitGroup.get(prefix) + if group is CommitGroup.PRIORITY and details: + details = details.partition('/')[2].strip() + + if details and '/' in details: + logger.error(f'Prefix is overnested, using first part: {prefix}') + details = details.partition('/')[0].strip() + + if details == 'common': + details = None + elif group is CommitGroup.NETWORKING and details == 'rh': + details = 'Request Handler' + + return group, details, sub_details + + +def get_new_contributors(contributors_path, commits): + contributors = set() + if contributors_path.exists(): + for line in read_file(contributors_path).splitlines(): + author, _, _ = line.strip().partition(' (') + authors = author.split('/') + contributors.update(map(str.casefold, authors)) + + new_contributors = set() + for commit in commits: + for author in commit.authors: + author_folded = author.casefold() + if author_folded not in contributors: + contributors.add(author_folded) + new_contributors.add(author) + + return sorted(new_contributors, key=str.casefold) + + +def create_changelog(args): + logging.basicConfig( + datefmt='%Y-%m-%d %H-%M-%S', format='{asctime} | {levelname:<8} | {message}', + level=logging.WARNING - 10 * args.verbosity, style='{', stream=sys.stderr) + + commits = CommitRange(None, args.commitish, args.default_author) + + if not args.no_override: + if args.override_path.exists(): + overrides = json.loads(read_file(args.override_path)) + commits.apply_overrides(overrides) + else: + logger.warning(f'File {args.override_path.as_posix()} does not exist') + + logger.info(f'Loaded {len(commits)} commits') + + new_contributors = get_new_contributors(args.contributors_path, commits) + if new_contributors: + if args.contributors: + write_file(args.contributors_path, 
'\n'.join(new_contributors) + '\n', mode='a') + logger.info(f'New contributors: {", ".join(new_contributors)}') + + return Changelog(commits.groups(), args.repo, args.collapsible) + + +def create_parser(): + import argparse + + parser = argparse.ArgumentParser( + description='Create a changelog markdown from a git commit range') + parser.add_argument( + 'commitish', default='HEAD', nargs='?', + help='The commitish to create the range from (default: %(default)s)') + parser.add_argument( + '-v', '--verbosity', action='count', default=0, + help='increase verbosity (can be used twice)') + parser.add_argument( + '-c', '--contributors', action='store_true', + help='update CONTRIBUTORS file (default: %(default)s)') + parser.add_argument( + '--contributors-path', type=Path, default=LOCATION_PATH.parent / 'CONTRIBUTORS', + help='path to the CONTRIBUTORS file') + parser.add_argument( + '--no-override', action='store_true', + help='skip override json in commit generation (default: %(default)s)') + parser.add_argument( + '--override-path', type=Path, default=LOCATION_PATH / 'changelog_override.json', + help='path to the changelog_override.json file') + parser.add_argument( + '--default-author', default='pukkandan', + help='the author to use without a author indicator (default: %(default)s)') + parser.add_argument( + '--repo', default='yt-dlp/yt-dlp', + help='the github repository to use for the operations (default: %(default)s)') + parser.add_argument( + '--collapsible', action='store_true', + help='make changelog collapsible (default: %(default)s)') + + return parser + + +if __name__ == '__main__': + print(create_changelog(create_parser().parse_args())) diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 1ee00f2b89..a5d59f3c03 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -9,12 +9,7 @@ import re -from devscripts.utils import ( - get_filename_args, - read_file, - read_version, - write_file, -) +from devscripts.utils import get_filename_args, read_file, write_file VERBOSE_TMPL = ''' - type: checkboxes @@ -24,6 +19,8 @@ options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true + - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" + required: false - label: Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below required: true - type: textarea @@ -33,19 +30,18 @@ description: | It should start like this: placeholder: | - [debug] Command-line config: ['-vU', 'test:youtube'] - [debug] Portable config "yt-dlp.conf": ['-i'] + [debug] Command-line config: ['-vU', 'https://www.youtube.com/watch?v=BaW_jenozKc'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version %(version)s [9d339c4] (win32_exe) + [debug] yt-dlp version nightly@... 
from yt-dlp/yt-dlp [b634ba742] (win_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 - [debug] Checking exe version: ffmpeg -bsfs - [debug] Checking exe version: ffprobe -bsfs [debug] exe versions: ffmpeg N-106550-g072101bd52-20220410 (fdk,setts), ffprobe N-106624-g391ce570c8-20220415, phantomjs 2.1.1 [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} - [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: %(version)s, Current version: %(version)s - yt-dlp is up to date (%(version)s) + [debug] Request Handlers: urllib, requests + [debug] Loaded 1893 extractors + [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp-nightly-builds/releases/latest + yt-dlp is up to date (nightly@... from yt-dlp/yt-dlp-nightly-builds) + [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell validations: @@ -64,7 +60,7 @@ def main(): - fields = {'version': read_version(), 'no_skip': NO_SKIP} + fields = {'no_skip': NO_SKIP} fields['verbose'] = VERBOSE_TMPL % fields fields['verbose_optional'] = re.sub(r'(\n\s+validations:)?\n\s+required: true', '', fields['verbose']) diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index fad993a199..cbb5859aa1 100755 --- a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -45,32 +45,42 @@ def apply_patch(text, patch): delim = f'\n{" " * switch_col_width}' PATCHES = ( - ( # Standardize update message + ( # Standardize `--update` message r'(?m)^( -U, --update\s+).+(\n \s.+)*$', r'\1Update this program to the latest version', ), - ( # Headings + ( # Headings r'(?m)^ (\w.+\n)( (?=\w))?', - r'## \1' + r'## \1', ), - ( # Do not split URLs + ( # Fixup `--date` formatting + rf'(?m)( --date DATE.+({delim}[^\[]+)*)\[.+({delim}.+)*$', + (rf'\1[now|today|yesterday][-N[day|week|month|year]].{delim}' + f'E.g. "--date today-2weeks" downloads only{delim}' + 'videos uploaded on the same day two weeks ago'), + ), + ( # Do not split URLs rf'({delim[:-1]})? (?P
[...]
- title = self._html_search_regex( - [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', - r'<meta name="dcterms\.title" content="(.*?)"/>', - r'<h4 class="headline">(.*?)</h4>
', - r']*>(.*?)'], - webpage, 'title') - description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'dcterms.abstract', webpage, 'description', default=None) - if description is None: - description = self._html_search_meta( - 'description', webpage, 'meta description', default=None) - if description is None: - description = self._html_search_regex( - r'(.+?)

', - webpage, 'teaser text', default=None) - - # Thumbnail is sometimes not present. - # It is in the mobile version, but that seems to use a different URL - # structure altogether. - thumbnail = self._og_search_thumbnail(webpage, default=None) - - media_streams = re.findall(r'''(?x) - mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* - "([^"]+)"''', webpage) - - if media_streams: - QUALITIES = qualities(['lo', 'hi', 'hq']) - formats = [] - for furl in set(media_streams): - if furl.endswith('.f4m'): - fid = 'f4m' - else: - fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) - fid = fid_m.group(1) if fid_m else None - formats.append({ - 'quality': QUALITIES(fid), - 'format_id': fid, - 'url': furl, - }) - info = { - 'formats': formats, - } - else: # request JSON file - if not document_id: - video_id = self._search_regex( - (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'), - webpage, 'media id', default=None) - info = self._extract_media_info( - 'http://www.ardmediathek.de/play/media/%s' % video_id, - webpage, video_id) - - info.update({ - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - }) - info.update(self._ARD_extract_episode_info(info['title'])) - - return info - - class ARDIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P[^/?#&]+))\.html' _TESTS = [{ # available till 7.12.2023 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html', - 'md5': 'a438f671e87a7eba04000336a119ccc4', + 'md5': '94812e6438488fb923c361a44469614b', 'info_dict': { 'id': 'maischberger-video-424', 'display_id': 'maischberger-video-424', @@ -398,16 +229,38 @@ def _real_extract(self, url): } -class ARDBetaMediathekIE(ARDMediathekBaseIE): +class ARDBetaMediathekIE(InfoExtractor): + IE_NAME = 'ARDMediathek' _VALID_URL = r'''(?x)https:// (?:(?:beta|www)\.)?ardmediathek\.de/ - (?:(?P[^/]+)/)? - (?:player|live|video|(?Psendung|sammlung))/ - (?:(?P(?(playlist)[^?#]+?|[^?#]+))/)? - (?P(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) - (?(playlist)/(?P\d+)?/?(?:[?#]|$))''' + (?:[^/]+/)? + (?:player|live|video)/ + (?:[^?#]+/)? 
+ (?P[a-zA-Z0-9]+) + /?(?:[?#]|$)''' + _GEO_COUNTRIES = ['DE'] + _TOKEN_URL = 'https://sso.ardmediathek.de/sso/token' _TESTS = [{ + 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0', + 'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4', + 'info_dict': { + 'display_id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0', + 'id': '12939099', + 'title': 'Liebe auf vier Pfoten', + 'description': r're:^Claudia Schmitt, Anwältin in Salzburg', + 'duration': 5222, + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b', + 'timestamp': 1701343800, + 'upload_date': '20231130', + 'ext': 'mp4', + 'episode': 'Liebe auf vier Pfoten', + 'series': 'Filme im MDR', + 'age_limit': 0, + 'channel': 'MDR', + '_old_archive_ids': ['ardbetamediathek Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0'], + }, + }, { 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { @@ -424,7 +277,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'skip': 'Error', }, { 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', - 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'md5': '1e73ded21cb79bac065117e80c81dc88', 'info_dict': { 'id': '10049223', 'ext': 'mp4', @@ -432,13 +285,31 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'timestamp': 1636398000, 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', 'upload_date': '20211108', + 'display_id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'duration': 915, + 'episode': 'tagesschau, 20:00 Uhr', + 'series': 'tagesschau', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678', + 'channel': 'ARD-Aktuell', + '_old_archive_ids': ['ardbetamediathek Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll'], }, }, { - 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', - 'playlist_count': 6, + 'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3', + 'md5': 'c428b9effff18ff624d4f903bda26315', 'info_dict': { - 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', - 'title': 'beforeigners/beforeigners/staffel-1', + 'id': '94834686', + 'ext': 'mp4', + 'duration': 2700, + 'episode': '7 Tage ... unter harten Jungs', + 'description': 'md5:0f215470dcd2b02f59f4bd10c963f072', + 'upload_date': '20231005', + 'timestamp': 1696491171, + 'display_id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3', + 'series': '7 Tage ...', + 'channel': 'HR', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a', + 'title': '7 Tage ... 
unter harten Jungs', + '_old_archive_ids': ['ardbetamediathek N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3'], }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', @@ -455,191 +326,254 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }] + + def _extract_episode_info(self, title): + patterns = [ + # Pattern for title like "Homo sapiens (S06/E07) - Originalversion" + # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw + r'.*(?P \(S(?P\d+)/E(?P\d+)\)).*', + # E.g.: title="Fritjof aus Norwegen (2) (AD)" + # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/ + r'.*(?P \((?:Folge |Teil )?(?P\d+)(?:/\d+)?\)).*', + r'.*(?PFolge (?P\d+)(?:\:| -|) )\"(?P.+)\".*', + # E.g.: title="Folge 25/42: Symmetrie" + # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/ + # E.g.: title="Folge 1063 - Vertrauen" + # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/ + r'.*(?PFolge (?P\d+)(?:/\d+)?(?:\:| -|) ).*', + # As a fallback use the full title + r'(?P.*)', + ] + + return traverse_obj(patterns, (..., {functools.partial(re.match, string=title)}, { + 'season_number': ('season_number', {int_or_none}), + 'episode_number': ('episode_number', {int_or_none}), + 'episode': (( + ('episode', {str_or_none}), + ('ep_info', {lambda x: title.replace(x, '')}), + ('title', {str}), + ), {str.strip}), + }), get_all=False) + + def _real_extract(self, url): + display_id = self._match_id(url) + query = {'embedded': 'false', 'mcV6': 'true'} + headers = {} + + if self._get_cookies(self._TOKEN_URL).get('ams'): + token = self._download_json( + self._TOKEN_URL, display_id, 'Fetching token for age verification', + 'Unable to fetch age verification token', fatal=False) + id_token = traverse_obj(token, ('idToken', {str})) + decoded_token = traverse_obj(id_token, ({jwt_decode_hs256}, {dict})) + user_id = traverse_obj(decoded_token, (('user_id', 'sub'), {str}), get_all=False) + if not user_id: + self.report_warning('Unable to extract token, continuing without authentication') + else: + headers['x-authorization'] = f'Bearer {id_token}' + query['userId'] = user_id + if decoded_token.get('age_rating') != 18: + self.report_warning('Account is not verified as 18+; video may be unavailable') + + page_data = self._download_json( + f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}', + display_id, query=query, headers=headers) + + # For user convenience we use the old contentId instead of the longer crid + # Ref: https://github.com/yt-dlp/yt-dlp/issues/8731#issuecomment-1874398283 + old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId', {int})) + if old_id is not None: + video_id = str(old_id) + archive_ids = [make_archive_id(ARDBetaMediathekIE, display_id)] + else: + self.report_warning(f'Could not extract contentId{bug_reports_message()}') + video_id = display_id + archive_ids = None + + player_data = traverse_obj( + page_data, ('widgets', lambda _, v: 
v['type'] in ('player_ondemand', 'player_live'), {dict}), get_all=False) + is_live = player_data.get('type') == 'player_live' + media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict})) + + if player_data.get('blockedByFsk'): + self.raise_login_required('This video is only available for age verified users or after 22:00') + + formats = [] + subtitles = {} + for stream in traverse_obj(media_data, ('streams', ..., {dict})): + kind = stream.get('kind') + # Prioritize main stream over sign language and others + preference = 1 if kind == 'main' else None + for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))): + media_url = media['url'] + + audio_kind = traverse_obj(media, ( + 'audios', 0, 'kind', {str}), default='').replace('standard', '') + lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu' + lang = join_nonempty(lang_code, audio_kind) + language_preference = 10 if lang == 'deu' else -10 + + if determine_ext(media_url) == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, m3u8_id=f'hls-{kind}', preference=preference, fatal=False, live=is_live) + for f in fmts: + f['language'] = lang + f['language_preference'] = language_preference + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_url, + 'format_id': f'http-{kind}', + 'preference': preference, + 'language': lang, + 'language_preference': language_preference, + **traverse_obj(media, { + 'format_note': ('forcedLabel', {str}), + 'width': ('maxHResolutionPx', {int_or_none}), + 'height': ('maxVResolutionPx', {int_or_none}), + 'vcodec': ('videoCodec', {str}), + }), + }) + + for sub in traverse_obj(media_data, ('subtitles', ..., {dict})): + for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))): + subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({ + 'url': sources['url'], + 'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')), + }) + + age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none})) + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + 'age_limit': age_limit, + **traverse_obj(media_data, ('meta', { + 'title': 'title', + 'description': 'synopsis', + 'timestamp': ('broadcastedOnDateTime', {parse_iso8601}), + 'series': 'seriesTitle', + 'thumbnail': ('images', 0, 'url', {url_or_none}), + 'duration': ('durationSeconds', {int_or_none}), + 'channel': 'clipSourceName', + })), + **self._extract_episode_info(page_data.get('title')), + '_old_archive_ids': archive_ids, + } + + +class ARDMediathekCollectionIE(InfoExtractor): + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:[^/?#]+/)? + (?P<playlist>sendung|serie|sammlung)/ + (?:(?P<display_id>[^?#]+?)/)? 
+ (?P<id>[a-zA-Z0-9]+) + (?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)''' + _GEO_COUNTRIES = ['DE'] + + _TESTS = [{ + 'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV', + 'info_dict': { + 'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV', + 'display_id': 'quiz/staffel-1-originalversion', + 'title': 'Staffel 1 Originalversion', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD', + 'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription', + 'title': 'Staffel 4 mit Audiodeskription', + }, + 'playlist_count': 12, + }, { + 'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1', + 'display_id': 'babylon-berlin/staffel-1', + 'title': 'Staffel 1', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA', + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA', + 'display_id': 'tatort', + 'title': 'Tatort', + }, + 'playlist_mincount': 500, + }, { + 'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2', + 'info_dict': { + 'id': '5eOHzt8XB2sqeFXbIoJlg2', + 'display_id': 'die-kirche-bleibt-im-dorf', + 'title': 'Die Kirche bleibt im Dorf', + 'description': 'Die Kirche bleibt im Dorf', + }, + 'playlist_count': 4, }, { # playlist of type 'sendung' 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/', 'only_matching': True, + }, { + # playlist of type 'serie' + 'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1', + 'only_matching': True, }, { # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', - 'only_matching': True, }] - def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): - """ Query the ARD server for playlist information - and returns the data in "raw" format """ - if mode == 'sendung': - graphQL = json.dumps({ - 'query': '''{ - showPage( - client: "%s" - showId: "%s" - pageNumber: %d - ) { - pagination { - pageSize - totalElements - } - teasers { # Array - mediumTitle - links { target { id href title } } - type - } - }}''' % (client, playlist_id, pageNumber), - }).encode() - else: # mode == 'sammlung' - graphQL = json.dumps({ - 'query': '''{ - morePage( - client: "%s" - compilationId: "%s" - pageNumber: %d - ) { - widget { - pagination { - pageSize - totalElements - } - teasers { # Array - mediumTitle - links { target { id href title } } - type - } - } - }}''' % (client, playlist_id, pageNumber), - }).encode() - # Ressources for ARD graphQL debugging: - # https://api-test.ardmediathek.de/public-gateway - show_page = self._download_json( - 
'https://api.ardmediathek.de/public-gateway', - '[Playlist] %s' % display_id, - data=graphQL, - headers={'Content-Type': 'application/json'})['data'] - # align the structure of the returned data: - if mode == 'sendung': - show_page = show_page['showPage'] - else: # mode == 'sammlung' - show_page = show_page['morePage']['widget'] - return show_page - - def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode): - """ Collects all playlist entries and returns them as info dict. - Supports playlists of mode 'sendung' and 'sammlung', and also nested - playlists. """ - entries = [] - pageNumber = 0 - while True: # iterate by pageNumber - show_page = self._ARD_load_playlist_snipped( - playlist_id, display_id, client, mode, pageNumber) - for teaser in show_page['teasers']: # process playlist items - if '/compilation/' in teaser['links']['target']['href']: - # alternativ cond.: teaser['type'] == "compilation" - # => This is an nested compilation, e.g. like: - # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/ - link_mode = 'sammlung' - else: - link_mode = 'video' - - item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % ( - client, link_mode, display_id, - # perform HTLM quoting of episode title similar to ARD: - re.sub('^-|-$', '', # remove '-' from begin/end - re.sub('[^a-zA-Z0-9]+', '-', # replace special chars by - - teaser['links']['target']['title'].lower() - .replace('ä', 'ae').replace('ö', 'oe') - .replace('ü', 'ue').replace('ß', 'ss'))), - teaser['links']['target']['id']) - entries.append(self.url_result( - item_url, - ie=ARDBetaMediathekIE.ie_key())) - - if (show_page['pagination']['pageSize'] * (pageNumber + 1) - >= show_page['pagination']['totalElements']): - # we've processed enough pages to get all playlist entries - break - pageNumber = pageNumber + 1 - - return self.playlist_result(entries, playlist_id, playlist_title=display_id) + _PAGE_SIZE = 100 def _real_extract(self, url): - video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group( - 'id', 'display_id', 'playlist', 'client', 'season') - display_id, client = display_id or video_id, client or 'ard' + playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'season', 'version') - if playlist_type: - # TODO: Extract only specified season - return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) + def call_api(page_num): + api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset' + return self._download_json( + f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id, + f'Downloading playlist page {page_num}', query={ + 'pageNumber': page_num, + 'pageSize': self._PAGE_SIZE, + **({ + 'seasoned': 'true', + 'seasonNumber': season_number, + 'withOriginalversion': 'true' if version == 'OV' else 'false', + 'withAudiodescription': 'true' if version == 'AD' else 'false', + } if season_number else {}), + }) - player_page = self._download_json( - 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ - 'query': '''{ - playerPage(client:"%s", clipId: "%s") { - blockedByFsk - broadcastedOn - maturityContentRating - mediaCollection { - _duration - _geoblocked - _isLive - _mediaArray { - _mediaStreamArray { - _quality - _server - _stream - } - } - _previewImage - _subtitleUrl - _type - } - show { - title - } - synopsis - title - tracking { - atiCustomVars { - contentId - 
} - } - } -}''' % (client, video_id), - }).encode(), headers={ - 'Content-Type': 'application/json' - })['data']['playerPage'] - title = player_page['title'] - content_id = str_or_none(try_get( - player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) - media_collection = player_page.get('mediaCollection') or {} - if not media_collection and content_id: - media_collection = self._download_json( - 'https://www.ardmediathek.de/play/media/' + content_id, - content_id, fatal=False) or {} - info = self._parse_media_info( - media_collection, content_id or video_id, - player_page.get('blockedByFsk')) - age_limit = None - description = player_page.get('synopsis') - maturity_content_rating = player_page.get('maturityContentRating') - if maturity_content_rating: - age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) - if not age_limit and description: - age_limit = int_or_none(self._search_regex( - r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) - info.update({ - 'age_limit': age_limit, - 'display_id': display_id, - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), - 'series': try_get(player_page, lambda x: x['show']['title']), - }) - info.update(self._ARD_extract_episode_info(info['title'])) - return info + def fetch_page(page_num): + for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})): + item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False) + if not item_id or item_id == playlist_id: + continue + item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video' + yield self.url_result( + f'https://www.ardmediathek.de/{item_mode}/{item_id}', + ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE), + **traverse_obj(item, { + 'id': ('id', {str}), + 'title': ('longTitle', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('broadcastedOn', {parse_iso8601}), + })) + + page_data = call_api(0) + full_id = join_nonempty(playlist_id, season_number, version, delim='_') + + return self.playlist_result( + OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id, + title=page_data.get('title'), description=page_data.get('synopsis')) diff --git a/yt_dlp/extractor/arkena.py b/yt_dlp/extractor/arkena.py index de36ec8868..aa6c5ca4d6 100644 --- a/yt_dlp/extractor/arkena.py +++ b/yt_dlp/extractor/arkena.py @@ -64,7 +64,7 @@ def _real_extract(self, url): raise ExtractorError('Invalid URL', expected=True) media = self._download_json( - 'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id), + f'https://video.qbrick.com/api/v1/public/accounts/{account_id}/medias/{video_id}', video_id, query={ # https://video.qbrick.com/docs/api/examples/library-api.html 'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags', @@ -131,8 +131,8 @@ def _real_extract(self, url): formats.extend(self._extract_f4m_formats( href, video_id, f4m_id='hds', fatal=False)) elif mime_type == 'application/dash+xml': - formats.extend(self._extract_f4m_formats( - href, video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + href, video_id, mpd_id='dash', fatal=False)) elif mime_type == 'application/vnd.ms-sstr+xml': formats.extend(self._extract_ism_formats( href, video_id, ism_id='mss', fatal=False)) diff --git 
diff --git a/yt_dlp/extractor/arnes.py b/yt_dlp/extractor/arnes.py index a493714d1f..f196f611ab 100644 --- a/yt_dlp/extractor/arnes.py +++ b/yt_dlp/extractor/arnes.py @@ -1,11 +1,9 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) from ..utils import ( - format_field, float_or_none, + format_field, int_or_none, parse_iso8601, remove_start, @@ -35,7 +33,7 @@ class ArnesIE(InfoExtractor): 'view_count': int, 'tags': ['linearna_algebra'], 'start_time': 10, - } + }, }, { 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', 'only_matching': True, @@ -93,6 +91,6 @@ def _real_extract(self, url): 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), - 'start_time': int_or_none(compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + 'start_time': int_or_none(urllib.parse.parse_qs( + urllib.parse.urlparse(url).query).get('t', [None])[0]), } diff --git a/yt_dlp/extractor/art19.py b/yt_dlp/extractor/art19.py new file mode 100644 index 0000000000..deec7ad012 --- /dev/null +++ b/yt_dlp/extractor/art19.py @@ -0,0 +1,303 @@ +import re + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj + + +class Art19IE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}' + _VALID_URL = [ + rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P<id>{_UUID_REGEX})', + rf'https?://rss\.art19\.com/episodes/(?P<id>{_UUID_REGEX})\.mp3', + ] + _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL[0]})'] + + _TESTS = [{ + 'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3', + 'info_dict': { + 'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb', + 'ext': 'mp3', + 'title': 'Why Did DeSantis Drop Out?', + 'series': 'The Daily Briefing', + 'release_timestamp': 1705941275, + 'description': 'md5:da38961da4a3f7e419471365e3c6b49f', + 'episode': 'Episode 582', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d', + 'upload_date': '20240122', + 'timestamp': 1705940815, + 'episode_number': 582, + 'modified_date': '20240122', + 'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb', + 'modified_timestamp': 1705941275, + 'release_date': '20240122', + 'duration': 527.4, + }, + }, { + 'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd', + 'info_dict': { + 'id': '8319b776-4153-4d22-8630-631f204a03dd', + 'ext': 'mp3', + 'title': 'Martha Stewart: The Homemaker Hustler Part 2', + 'modified_date': '20240116', + 'upload_date': '20240105', + 'modified_timestamp': 1705435802, + 'episode_id': '8319b776-4153-4d22-8630-631f204a03dd', + 'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'description': 'md5:4aa7cfd1358dc57e729835bc208d7893', + 'release_timestamp': 1705305660, + 'release_date': '20240115', + 'timestamp': 1704481536, + 'episode_number': 88, + 'series': 'Scamfluencers', + 'duration': 2588.37501, + 'episode': 'Episode 88', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html', + 'info_dict': { + 'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7', + 'ext': 'mp3', + 'title': "'Verstappen wordt een synoniem voor 
Formule 1'", + 'season': 'Seizoen 6', + 'description': 'md5:39a7159a31c4cda312b2e893bdd5c071', + 'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7', + 'duration': 3061.82111, + 'series_id': '93f4e113-2a60-4609-a564-755058fa40d8', + 'release_date': '20231126', + 'modified_timestamp': 1701156004, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'season_number': 6, + 'episode_number': 52, + 'modified_date': '20231128', + 'upload_date': '20231126', + 'timestamp': 1701025981, + 'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26', + 'series': 'De Boordradio', + 'release_timestamp': 1701026308, + 'episode': 'Episode 52', + }, + }, { + 'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/', + 'info_dict': { + 'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0', + 'ext': 'mp3', + 'title': 'Larry Bucshon announces retirement from congress', + 'upload_date': '20240115', + 'episode_number': 148, + 'episode': 'Episode 148', + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'release_date': '20240115', + 'timestamp': 1705328205, + 'release_timestamp': 1705329275, + 'series': 'All INdiana Politics', + 'modified_date': '20240117', + 'modified_timestamp': 1705458901, + 'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1', + 'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0', + 'description': 'md5:53b5239e4d14973a87125c217c255b2a', + 'duration': 1256.18848, + }, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for episode_id in re.findall( + rf'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]', webpage): + yield f'https://rss.art19.com/episodes/{episode_id}.mp3' + + def _real_extract(self, url): + episode_id = self._match_id(url) + + player_metadata = self._download_json( + f'https://art19.com/episodes/{episode_id}', episode_id, + note='Downloading player metadata', fatal=False, + headers={'Accept': 'application/vnd.art19.v0+json'}) + rss_metadata = self._download_json( + f'https://rss.art19.com/episodes/{episode_id}.json', episode_id, fatal=False, + note='Downloading RSS metadata') + + formats = [{ + 'format_id': 'direct', + 'url': f'https://rss.art19.com/episodes/{episode_id}.mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }] + for fmt_id, fmt_data in traverse_obj(rss_metadata, ('content', 'media', {dict.items}, ...)): + if fmt_id == 'waveform_bin': + continue + fmt_url = traverse_obj(fmt_data, ('url', {url_or_none})) + if not fmt_url: + continue + formats.append({ + 'format_id': fmt_id, + 'url': fmt_url, + 'vcodec': 'none', + 'acodec': fmt_id, + 'quality': -2 if fmt_id == 'ogg' else -1, + }) + + return { + 'id': episode_id, + 'formats': formats, + **traverse_obj(player_metadata, ('episode', { + 'title': ('title', {str}), + 'description': ('description_plain', {str}), + 'episode_id': ('id', {str}), + 'episode_number': ('episode_number', {int_or_none}), + 'season_id': ('season_id', {str}), + 'series_id': ('series_id', {str}), + 'timestamp': ('created_at', {parse_iso8601}), + 'release_timestamp': ('released_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + })), + **traverse_obj(rss_metadata, ('content', { + 'title': ('episode_title', {str}), + 'description': ('episode_description_plain', {str}), + 'episode_id': ('episode_id', {str}), + 'episode_number': ('episode_number', {int_or_none}), + 'season': ('season_title', {str}), + 'season_id': 
('season_id', {str}), + 'season_number': ('season_number', {int_or_none}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'thumbnail': ('cover_image', {url_or_none}), + 'duration': ('duration', {float_or_none}), + })), + } + + +class Art19ShowIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P<id>[\w-]+)(?:/embed)?/?' + _VALID_URL = [ + rf'{_VALID_URL_BASE}(?:$|[#?])', + r'https?://rss\.art19\.com/(?P<id>[\w-]+)/?(?:$|[#?])', + ] + _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL_BASE}[^\'"])'] + + _TESTS = [{ + 'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/', + 'info_dict': { + '_type': 'playlist', + 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0', + 'display_id': 'echt-gebeurd', + 'title': 'Echt Gebeurd', + 'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560', + 'timestamp': 1492642167, + 'upload_date': '20170419', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:7', + }, + 'playlist_mincount': 425, + }, { + 'url': 'https://www.art19.com/shows/echt-gebeurd', + 'info_dict': { + '_type': 'playlist', + 'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0', + 'display_id': 'echt-gebeurd', + 'title': 'Echt Gebeurd', + 'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560', + 'timestamp': 1492642167, + 'upload_date': '20170419', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:7', + }, + 'playlist_mincount': 425, + }, { + 'url': 'https://rss.art19.com/scamfluencers', + 'info_dict': { + '_type': 'playlist', + 'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75', + 'display_id': 'scamfluencers', + 'title': 'Scamfluencers', + 'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7', + 'timestamp': 1647368573, + 'upload_date': '20220315', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': [], + }, + 'playlist_mincount': 90, + }, { + 'url': 'https://art19.com/shows/enthuellt/embed', + 'info_dict': { + '_type': 'playlist', + 'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c', + 'display_id': 'enthuellt', + 'title': 'Enthüllt', + 'description': 'md5:17752246643414a2fd51744fc9a1c08e', + 'timestamp': 1601645860, + 'upload_date': '20201002', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:10', + }, + 'playlist_mincount': 10, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast', + 'info_dict': { + '_type': 'playlist', + 'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21', + 'display_id': 'deconstructing-yourself', + 'title': 'Deconstructing Yourself', + 'description': 'md5:dab5082b28b248a35476abf64768854d', + 'timestamp': 1570581181, + 'upload_date': '20191009', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': 'count:5', + }, + 'playlist_mincount': 80, + }, { + 'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/', + 'info_dict': { + '_type': 'playlist', + 'id': '9dfa2c37-ab87-4c13-8388-4897914313ec', + 'display_id': 'the-ben-joravsky-show', + 'title': 'The Ben Joravsky Show', + 'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a', + 'timestamp': 1550875095, + 'upload_date': '20190222', + 'modified_timestamp': int, + 'modified_date': str, + 'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'], + }, + 'playlist_mincount': 1900, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for series_id in re.findall( + 
r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage): + yield f'https://art19.com/shows/{series_id}' + + def _real_extract(self, url): + series_id = self._match_id(url) + series_metadata = self._download_json( + f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata', + headers={'Accept': 'application/vnd.art19.v0+json'}) + + return { + '_type': 'playlist', + 'entries': [ + self.url_result(f'https://rss.art19.com/episodes/{episode_id}.mp3', Art19IE) + for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', ..., {str})) + ], + **traverse_obj(series_metadata, ('series', { + 'id': ('id', {str}), + 'display_id': ('slug', {str}), + 'title': ('title', {str}), + 'description': ('description_plain', {str}), + 'timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + })), + 'tags': traverse_obj(series_metadata, ('tags', ..., 'name', {str})), + } diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index e3cc5afb05..142d4b066b 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -5,6 +5,7 @@ ExtractorError, GeoRestrictedError, int_or_none, + join_nonempty, parse_iso8601, parse_qs, strip_or_none, @@ -19,46 +20,22 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVIE(ArteTVBaseIE): - _VALID_URL = r'''(?x) + _VALID_URL = rf'''(?x) (?:https?:// (?: - (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| - api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) + (?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos| + api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>{ArteTVBaseIE._ARTE_LANGUAGES}) ) |arte://program) - /(?P<id>\d{6}-\d{3}-[AF]|LIVE) - ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + /(?P<id>\d{{6}}-\d{{3}}-[AF]|LIVE) + ''' _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', 'only_matching': True, - }, { - 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', - 'info_dict': { - 'id': '100103-000-A', - 'title': 'USA: Dyskryminacja na porodówce', - 'description': 'md5:242017b7cce59ffae340a54baefcafb1', - 'alt_title': 'ARTE Reportage', - 'upload_date': '20201103', - 'duration': 554, - 'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530', - 'timestamp': 1604417980, - 'ext': 'mp4', - }, - 'params': {'skip_download': 'm3u8'} }, { 'note': 'No alt_title', 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', - 'info_dict': { - 'id': '110371-000-A', - 'ext': 'mp4', - 'upload_date': '20220718', - 'duration': 154, - 'timestamp': 1658162460, - 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', - 'title': 'La chaleur, supplice des arbres de rue', - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', - }, - 'params': {'skip_download': 'm3u8'} + 'only_matching': True, }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, @@ -67,19 +44,38 @@ class ArteTVIE(ArteTVBaseIE): 'only_matching': True, }, { 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/', + 'only_matching': True, + }, { + 'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/', 'info_dict': { - 'id': '110203-006-A', - 'chapters': 'count:16', - 'description': 'md5:cf592f1df52fe52007e3f8eac813c084', - 'alt_title': 'Zaz', - 'title': 'Baloise Session 2022', - 'timestamp': 1668445200, - 'duration': 4054, - 'thumbnail': 
'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530', - 'upload_date': '20221114', + 'id': '109067-000-A', + 'ext': 'mp4', + 'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739', + 'timestamp': 1713927600, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530', + 'duration': 7599, + 'title': 'La loi de Téhéran', + 'upload_date': '20240424', + 'subtitles': { + 'fr': 'mincount:1', + 'fr-acc': 'mincount:1', + 'fr-forced': 'mincount:1', + }, + }, + }, { + 'note': 'age-restricted', + 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/', + 'info_dict': { + 'id': '006785-000-A', + 'description': 'md5:c2f94fdfefc8a280e4dab68ab96ab0ba', + 'title': 'The Element of Crime', + 'timestamp': 1696111200, + 'duration': 5849, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/q82dTTfyuCXupPsGxXsd7B/940x530', + 'upload_date': '20230930', 'ext': 'mp4', }, - 'expected_warnings': ['geo restricted'] + 'skip': '404 Not Found', }] _GEO_BYPASS = True @@ -130,13 +126,27 @@ class ArteTVIE(ArteTVBaseIE): ), } + @staticmethod + def _fix_accessible_subs_locale(subs): + updated_subs = {} + for lang, sub_formats in subs.items(): + for fmt in sub_formats: + url = fmt.get('url') or '' + suffix = ('acc' if url.endswith('-MAL.m3u8') + else 'forced' if '_VO' not in url + else None) + updated_subs.setdefault(join_nonempty(lang, suffix), []).append(fmt) + return updated_subs + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') lang = mobj.group('lang') or mobj.group('lang_2') - langauge_code = self._LANG_MAP.get(lang) + language_code = self._LANG_MAP.get(lang) - config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={ + 'x-validated-age': '18', + }) geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} if geoblocking.get('restrictedArea'): @@ -160,16 +170,16 @@ def _real_extract(self, url): m = self._VERSION_CODE_RE.match(stream_version_code) if m: lang_pref = int(''.join('01'[x] for x in ( - m.group('vlang') == langauge_code, # we prefer voice in the requested language + m.group('vlang') == language_code, # we prefer voice in the requested language not m.group('audio_desc'), # and not the audio description version bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice - m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language + m.group('sub_lang') == language_code, # if subtitles are present, we prefer them in the requested language not m.group('has_sub'), # but we prefer no subtitles otherwise not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles ))) short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') - if stream['protocol'].startswith('HLS'): + if 'HLS' in stream['protocol']: fmts, subs = self._extract_m3u8_formats_and_subtitles( stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) for fmt in fmts: @@ -181,6 +191,7 @@ def _real_extract(self, url): secondary_formats.extend(fmts) else: formats.extend(fmts) + subs = self._fix_accessible_subs_locale(subs) self._merge_subtitles(subs, target=subtitles) elif stream['protocol'] in ('HTTPS', 'RTMP'): @@ -236,7 +247,7 @@ class ArteTVEmbedIE(InfoExtractor): 'description': 
'md5:be40b667f45189632b78c1425c7c2ce1', 'upload_date': '20201116', }, - 'skip': 'No video available' + 'skip': 'No video available', }, { 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, @@ -251,7 +262,7 @@ def _real_extract(self, url): class ArteTVPlaylistIE(ArteTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES + _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>RC-\d{{6}})' _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'only_matching': True, @@ -287,7 +298,7 @@ def _real_extract(self, url): class ArteTVCategoryIE(ArteTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _VALID_URL = rf'https?://(?:www\.)?arte\.tv/(?P<lang>{ArteTVBaseIE._ARTE_LANGUAGES})/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/politics-and-society/', 'info_dict': { @@ -301,7 +312,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): @classmethod def suitable(cls, url): return ( - not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE)) and super().suitable(url)) def _real_extract(self, url): @@ -310,12 +321,12 @@ def _real_extract(self, url): items = [] for video in re.finditer( - r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + rf'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/{lang}/videos/[\w/-]+)(?P=q)', webpage): video = video.group('url') if video == url: continue - if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE)): items.append(video) title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py deleted file mode 100644 index 23f310edb3..0000000000 --- a/yt_dlp/extractor/asiancrush.py +++ /dev/null @@ -1,196 +0,0 @@ -import functools -import re - -from .common import InfoExtractor -from .kaltura import KalturaIE -from ..utils import ( - extract_attributes, - int_or_none, - OnDemandPagedList, - parse_age_limit, - strip_or_none, - try_get, -) - - -class AsianCrushBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' - _KALTURA_KEYS = [ - 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', - 'widescreen_thumbnail_url', 'screencap_widescreen', - ] - _API_SUFFIX = {'retrocrush.tv': '-ott'} - - def _call_api(self, host, endpoint, video_id, query, resource): - return self._download_json( - 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, - 'Downloading %s JSON metadata' % resource, query=query, - headers=self.geo_verification_headers())['objects'] - - def _download_object_data(self, host, object_id, resource): - return self._call_api( - host, 'search', object_id, {'id': object_id}, resource)[0] - - def _get_object_description(self, obj): - return strip_or_none(obj.get('long_description') or obj.get('short_description')) - - def _parse_video_data(self, video): - title = video['name'] - - entry_id, partner_id = [None] * 2 - for k in self._KALTURA_KEYS: - k_url = video.get(k) - if 
k_url: - mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) - if mobj: - partner_id, entry_id = mobj.groups() - break - - meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] - categories = list(filter(None, [c.get('name') for c in meta_categories])) - - show_info = video.get('show_info') or {} - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, entry_id), - 'ie_key': KalturaIE.ie_key(), - 'id': entry_id, - 'title': title, - 'description': self._get_object_description(video), - 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), - 'categories': categories, - 'series': show_info.get('show_name'), - 'season_number': int_or_none(show_info.get('season_num')), - 'season_id': show_info.get('season_id'), - 'episode_number': int_or_none(show_info.get('episode_num')), - } - - -class AsianCrushIE(AsianCrushBaseIE): - _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', - 'md5': 'c3b740e48d0ba002a42c0b72857beae6', - 'info_dict': { - 'id': '1_y4tmjm5r', - 'ext': 'mp4', - 'title': 'Women Who Flirt', - 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', - 'timestamp': 1496936429, - 'upload_date': '20170608', - 'uploader_id': 'craig@crifkin.com', - 'age_limit': 13, - 'categories': 'count:5', - 'duration': 5812, - }, - }, { - 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', - 'only_matching': True, - }, { - 'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/', - 'only_matching': True, - }, { - 'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/', - 'only_matching': True, - }, { - 'url': 'https://www.midnightpulp.com/video/010400v/drifters/', - 'only_matching': True, - }, { - 'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/', - 'only_matching': True, - }, { - 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', - 'only_matching': True, - }, { - 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', - 'only_matching': True, - }] - - def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() - - if host == 'cocoro.tv': - webpage = self._download_webpage(url, video_id) - embed_vars = self._parse_json(self._search_regex( - r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', - default='{}'), video_id, fatal=False) or {} - video_id = embed_vars.get('entry_id') or video_id - - video = self._download_object_data(host, video_id, 'video') - return self._parse_video_data(video) - - -class AsianCrushPlaylistIE(AsianCrushBaseIE): - _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE - _TESTS = [{ - 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', - 'info_dict': { - 'id': '6447', - 'title': 'Fruity Samurai', - 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', - }, - 'playlist_count': 13, - }, { - 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', - 'only_matching': True, - }, { - 'url': 'https://www.midnightpulp.com/series/016375s/mononoke/', - 'only_matching': True, - }, { - 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', - 'only_matching': True, - }, { - 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', - 'only_matching': True, - }] - _PAGE_SIZE = 1000000000 - - def _fetch_page(self, domain, parent_id, 
page): - videos = self._call_api( - domain, 'getreferencedobjects', parent_id, { - 'max': self._PAGE_SIZE, - 'object_type': 'video', - 'parent_id': parent_id, - 'start': page * self._PAGE_SIZE, - }, 'page %d' % (page + 1)) - for video in videos: - yield self._parse_video_data(video) - - def _real_extract(self, url): - host, playlist_id = self._match_valid_url(url).groups() - - if host == 'cocoro.tv': - webpage = self._download_webpage(url, playlist_id) - - entries = [] - - for mobj in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, - webpage): - attrs = extract_attributes(mobj.group(0)) - if attrs.get('class') == 'clearfix': - entries.append(self.url_result( - mobj.group('url'), ie=AsianCrushIE.ie_key())) - - title = self._html_search_regex( - r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or self._html_extract_title(webpage) - if title: - title = re.sub(r'\s*\|\s*.+?$', '', title) - - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, 'description', fatal=False) - else: - show = self._download_object_data(host, playlist_id, 'show') - title = show.get('name') - description = self._get_object_description(show) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, host, playlist_id), - self._PAGE_SIZE) - - return self.playlist_result(entries, playlist_id, title, description) diff --git a/yt_dlp/extractor/asobichannel.py b/yt_dlp/extractor/asobichannel.py new file mode 100644 index 0000000000..e3479ede99 --- /dev/null +++ b/yt_dlp/extractor/asobichannel.py @@ -0,0 +1,168 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + merge_dicts, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class AsobiChannelBaseIE(InfoExtractor): + _MICROCMS_HEADER = {'X-MICROCMS-API-KEY': 'qRaKehul9AHU8KtL0dnq1OCLKnFec6yrbcz3'} + + def _extract_info(self, metadata): + return traverse_obj(metadata, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('body', {clean_html}), + 'thumbnail': ('contents', 'video_thumb', 'url', {url_or_none}), + 'timestamp': ('publishedAt', {parse_iso8601}), + 'modified_timestamp': ('updatedAt', {parse_iso8601}), + 'channel': ('channel', 'name', {str}), + 'channel_id': ('channel', 'id', {str}), + }) + + +class AsobiChannelIE(AsobiChannelBaseIE): + IE_NAME = 'asobichannel' + IE_DESC = 'ASOBI CHANNEL' + + _VALID_URL = r'https?://asobichannel\.asobistore\.jp/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://asobichannel.asobistore.jp/watch/1ypp48qd32p', + 'md5': '39df74e872afe032c4eb27b89144fc92', + 'info_dict': { + 'id': '1ypp48qd32p', + 'ext': 'mp4', + 'title': 'アイドルマスター ミリオンライブ! 
765プロch 原っぱ通信 #1', + 'description': 'md5:b930bd2199c9b2fd75951ce4aaa7efd2', + 'thumbnail': 'https://images.microcms-assets.io/assets/d2420de4b9194e11beb164f99edb1f95/a8e6f84119f54eb9ab4ce16729239905/%E3%82%B5%E3%83%A0%E3%83%8D%20(1).png', + 'timestamp': 1697098247, + 'upload_date': '20231012', + 'modified_timestamp': 1698381162, + 'modified_date': '20231027', + 'channel': 'アイドルマスター', + 'channel_id': 'idolmaster', + }, + }, { + 'url': 'https://asobichannel.asobistore.jp/watch/redigiwnjzqj', + 'md5': '229fa8fb5c591c75ce8c37a497f113f6', + 'info_dict': { + 'id': 'redigiwnjzqj', + 'ext': 'mp4', + 'title': '【おまけ放送】アイドルマスター ミリオンライブ! 765プロch 原っぱ通信 #1', + 'description': 'md5:7d9cd35fb54425a6967822bd564ea2d9', + 'thumbnail': 'https://images.microcms-assets.io/assets/d2420de4b9194e11beb164f99edb1f95/20e5c1d6184242eebc2512a5dec59bf0/P1_%E5%8E%9F%E3%81%A3%E3%81%B1%E3%82%B5%E3%83%A0%E3%83%8D.png', + 'modified_timestamp': 1697797125, + 'modified_date': '20231020', + 'timestamp': 1697261769, + 'upload_date': '20231014', + 'channel': 'アイドルマスター', + 'channel_id': 'idolmaster', + }, + }] + + _survapi_header = None + + def _real_initialize(self): + token = self._download_json( + 'https://asobichannel-api.asobistore.jp/api/v1/vspf/token', None, + note='Retrieving API token') + self._survapi_header = {'Authorization': f'Bearer {token}'} + + def _process_vod(self, video_id, metadata): + content_id = metadata['contents']['video_id'] + + vod_data = self._download_json( + f'https://survapi.channel.or.jp/proxy/v1/contents/{content_id}/get_by_cuid', video_id, + headers=self._survapi_header, note='Downloading vod data') + + return { + 'formats': self._extract_m3u8_formats(vod_data['ex_content']['streaming_url'], video_id), + } + + def _process_live(self, video_id, metadata): + content_id = metadata['contents']['video_id'] + event_data = self._download_json( + f'https://survapi.channel.or.jp/ex/events/{content_id}?embed=channel', video_id, + headers=self._survapi_header, note='Downloading event data') + + player_type = traverse_obj(event_data, ('data', 'Player_type', {str})) + if player_type == 'poster': + self.raise_no_formats('Live event has not yet started', expected=True) + live_status = 'is_upcoming' + formats = [] + elif player_type == 'player': + live_status = 'is_live' + formats = self._extract_m3u8_formats( + event_data['data']['Channel']['Custom_live_url'], video_id, live=True) + else: + raise ExtractorError(f'Unsupported player type {player_type!r}') + + return { + 'release_timestamp': traverse_obj(metadata, ('period', 'start', {parse_iso8601})), + 'live_status': live_status, + 'formats': formats, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._download_json( + f'https://channel.microcms.io/api/v1/media/{video_id}', video_id, + headers=self._MICROCMS_HEADER) + + info = self._extract_info(metadata) + + video_type = traverse_obj(metadata, ('contents', 'video_type', 0, {str})) + if video_type == 'VOD': + return merge_dicts(info, self._process_vod(video_id, metadata)) + if video_type == 'LIVE': + return merge_dicts(info, self._process_live(video_id, metadata)) + + raise ExtractorError(f'Unexpected video type {video_type!r}') + + +class AsobiChannelTagURLIE(AsobiChannelBaseIE): + IE_NAME = 'asobichannel:tag' + IE_DESC = 'ASOBI CHANNEL' + + _VALID_URL = r'https?://asobichannel\.asobistore\.jp/tag/(?P<id>[a-z0-9-_]+)' + _TESTS = [{ + 'url': 'https://asobichannel.asobistore.jp/tag/bjhh-nbcja', + 'info_dict': { + 'id': 'bjhh-nbcja', + 'title': 'アイドルマスター ミリオンライブ! 765プロch 原っぱ通信', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://asobichannel.asobistore.jp/tag/hvm5qw3c6od', + 'info_dict': { + 'id': 'hvm5qw3c6od', + 'title': 'アイマスMOIW2023ラジオ', + }, + 'playlist_mincount': 13, + }] + + def _real_extract(self, url): + tag_id = self._match_id(url) + webpage = self._download_webpage(url, tag_id) + title = traverse_obj(self._search_nextjs_data( + webpage, tag_id, fatal=False), ('props', 'pageProps', 'data', 'name', {str})) + + media = self._download_json( + f'https://channel.microcms.io/api/v1/media?limit=999&filters=(tag[contains]{tag_id})', + tag_id, headers=self._MICROCMS_HEADER) + + def entries(): + for metadata in traverse_obj(media, ('contents', lambda _, v: v['id'])): + yield { + '_type': 'url', + 'url': f'https://asobichannel.asobistore.jp/watch/{metadata["id"]}', + 'ie_key': AsobiChannelIE.ie_key(), + **self._extract_info(metadata), + } + + return self.playlist_result(entries(), tag_id, title)
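AsobiChannelIE above extracts the shared microCMS metadata once, then merges in whatever the VOD or live branch returns (formats, live_status, release_timestamp) via merge_dicts. A rough, self-contained sketch of that merge pattern under simplified assumptions; the record fields below are illustrative, not the real microCMS schema:

    # Sketch: shared metadata mapped once, then combined with the
    # type-specific fields returned by the VOD or live branch.
    def extract_common(record: dict) -> dict:
        return {
            'id': record.get('id'),
            'title': record.get('title'),
            'channel': (record.get('channel') or {}).get('name'),
        }

    def process_vod(record: dict) -> dict:
        return {'formats': [{'url': record.get('stream_url'), 'format_id': 'hls'}]}

    def process_live(record: dict) -> dict:
        return {'live_status': 'is_live',
                'formats': [{'url': record.get('live_url'), 'format_id': 'hls'}]}

    def extract(record: dict) -> dict:
        info = extract_common(record)
        kind = record.get('video_type')
        if kind == 'VOD':
            return {**info, **process_vod(record)}
        if kind == 'LIVE':
            return {**info, **process_live(record)}
        raise ValueError(f'Unexpected video type {kind!r}')

    # Hypothetical usage with a made-up record:
    print(extract({'id': 'x1', 'title': 'demo', 'video_type': 'VOD',
                   'stream_url': 'https://example.com/a.m3u8'}))

Keeping the common mapping separate means both branches stay small and the unexpected-type case fails loudly instead of returning half-filled metadata.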
diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py new file mode 100644 index 0000000000..8fa8f3edb6 --- /dev/null +++ b/yt_dlp/extractor/asobistage.py @@ -0,0 +1,154 @@ +import functools + +from .common import InfoExtractor +from ..utils import str_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class AsobiStageIE(InfoExtractor): + IE_DESC = 'ASOBISTAGE (アソビステージ)' + _VALID_URL = r'https?://asobistage\.asobistore\.jp/event/(?P<id>(?P<event>\w+)/(?P<type>archive|player)/(?P<slug>\w+))(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://asobistage.asobistore.jp/event/315passionhour_2022summer/archive/frame', + 'info_dict': { + 'id': '315passionhour_2022summer/archive/frame', + 'title': '315プロダクションプレゼンツ 315パッションアワー!!!', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'edff52f2', + 'ext': 'mp4', + 'title': '315passion_FRAME_only', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }], + }, { + 'url': 'https://asobistage.asobistore.jp/event/idolmaster_idolworld2023_goods/archive/live', + 'info_dict': { + 'id': 'idolmaster_idolworld2023_goods/archive/live', + 'title': 'md5:378510b6e830129d505885908bd6c576', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '3aef7110', + 'ext': 'mp4', + 'title': 'asobistore_station_1020_serverREC', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }], + }, { + 'url': 'https://asobistage.asobistore.jp/event/sidem_fclive_bpct/archive/premium_hc', + 'playlist_count': 4, + 'info_dict': { + 'id': 'sidem_fclive_bpct/archive/premium_hc', + 'title': '315 Production presents F@NTASTIC COMBINATION LIVE ~BRAINPOWER!!~/~CONNECTIME!!!!~', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }, { + 'url': 'https://asobistage.asobistore.jp/event/ijigenfes_utagassen/player/day1', + 'only_matching': True, + }] + + _API_HOST = 'https://asobistage-api.asobistore.jp' + _HEADERS = {} + _is_logged_in = False + + @functools.cached_property + def _owned_tickets(self): + owned_tickets = set() + if not self._is_logged_in: + return owned_tickets + + for path, name in [ + ('api/v1/purchase_history/list', 'ticket purchase history'), + ('api/v1/serialcode/list', 'redemption history'), + ]: + response = self._download_json( + f'{self._API_HOST}/{path}', None, f'Downloading {name}', + f'Unable to download {name}', expected_status=400) + if traverse_obj(response, ('payload', 'error_message'), 'error') == 'notlogin': + self._is_logged_in = False + break + 
owned_tickets.update( + traverse_obj(response, ('payload', 'value', ..., 'digital_product_id', {str_or_none}))) + + return owned_tickets + + def _get_available_channel_id(self, channel): + channel_id = traverse_obj(channel, ('chennel_vspf_id', {str})) + if not channel_id: + return None + # if rights_type_id == 6, then 'No conditions (no login required - non-members are OK)' + if traverse_obj(channel, ('viewrights', lambda _, v: v['rights_type_id'] == 6)): + return channel_id + available_tickets = traverse_obj(channel, ( + 'viewrights', ..., ('tickets', 'serialcodes'), ..., 'digital_product_id', {str_or_none})) + if not self._owned_tickets.intersection(available_tickets): + self.report_warning( + f'You are not a ticketholder for "{channel.get("channel_name") or channel_id}"') + return None + return channel_id + + def _real_initialize(self): + if self._get_cookies(self._API_HOST): + self._is_logged_in = True + token = self._download_json( + f'{self._API_HOST}/api/v1/vspf/token', None, 'Getting token', 'Unable to get token') + self._HEADERS['Authorization'] = f'Bearer {token}' + + def _real_extract(self, url): + video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug') + video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_] + webpage = self._download_webpage(url, video_id) + event_data = traverse_obj( + self._search_nextjs_data(webpage, video_id, default={}), + ('props', 'pageProps', 'eventCMSData', { + 'title': ('event_name', {str}), + 'thumbnail': ('event_thumbnail_image', {url_or_none}), + })) + + available_channels = traverse_obj(self._download_json( + f'https://asobistage.asobistore.jp/cdn/v101/events/{event}/{video_type}.json', + video_id, 'Getting channel list', 'Unable to get channel list'), ( + video_type, lambda _, v: v['broadcast_slug'] == slug, + 'channels', lambda _, v: v['chennel_vspf_id'] != '00000')) + + entries = [] + for channel_id in traverse_obj(available_channels, (..., {self._get_available_channel_id})): + if video_type == 'archives': + channel_json = self._download_json( + f'https://survapi.channel.or.jp/proxy/v1/contents/{channel_id}/get_by_cuid', channel_id, + 'Getting archive channel info', 'Unable to get archive channel info', fatal=False, + headers=self._HEADERS) + channel_data = traverse_obj(channel_json, ('ex_content', { + 'm3u8_url': 'streaming_url', + 'title': 'title', + 'thumbnail': ('thumbnail', 'url'), + })) + else: # video_type == 'broadcasts' + channel_json = self._download_json( + f'https://survapi.channel.or.jp/ex/events/{channel_id}', channel_id, + 'Getting live channel info', 'Unable to get live channel info', fatal=False, + headers=self._HEADERS, query={'embed': 'channel'}) + channel_data = traverse_obj(channel_json, ('data', { + 'm3u8_url': ('Channel', 'Custom_live_url'), + 'title': 'Name', + 'thumbnail': 'Poster_url', + })) + + entries.append({ + 'id': channel_id, + 'title': channel_data.get('title'), + 'formats': self._extract_m3u8_formats(channel_data.get('m3u8_url'), channel_id, fatal=False), + 'is_live': video_type == 'broadcasts', + 'thumbnail': url_or_none(channel_data.get('thumbnail')), + }) + + if not self._is_logged_in and not entries: + self.raise_login_required() + + return self.playlist_result(entries, video_id, **event_data) diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index a20e7f9889..0fe95bec5c 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import 
compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -20,7 +20,7 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'skip': 'This video is only available for registered users' + 'skip': 'This video is only available for registered users', }, { 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', @@ -33,14 +33,6 @@ class AtresPlayerIE(InfoExtractor): ] _API_BASE = 'https://api.atresplayer.com/' - def _handle_error(self, e, code): - if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: - error = self._parse_json(e.cause.read(), None) - if error.get('error') == 'required_registered': - self.raise_login_required() - raise ExtractorError(error['error_description'], expected=True) - raise - def _perform_login(self, username, password): self._request_webpage( self._API_BASE + 'login', None, 'Downloading login page') @@ -49,13 +41,15 @@ def _perform_login(self, username, password): target_url = self._download_json( 'https://account.atresmedia.com/api/login', None, 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded' + 'Content-Type': 'application/x-www-form-urlencoded', }, data=urlencode_postdata({ 'username': username, 'password': password, }))['targetUrl'] except ExtractorError as e: - self._handle_error(e, 400) + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError('Invalid username and/or password', expected=True) + raise self._request_webpage(target_url, None, 'Following Target URL') @@ -66,7 +60,12 @@ def _real_extract(self, url): episode = self._download_json( self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) except ExtractorError as e: - self._handle_error(e, 403) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + error = self._parse_json(e.cause.response.read(), None) + if error.get('error') == 'required_registered': + self.raise_login_required() + raise ExtractorError(error['error_description'], expected=True) + raise title = episode['titulo'] diff --git a/yt_dlp/extractor/atscaleconf.py b/yt_dlp/extractor/atscaleconf.py index 3f7b1e9f8d..b219eeec5c 100644 --- a/yt_dlp/extractor/atscaleconf.py +++ b/yt_dlp/extractor/atscaleconf.py @@ -12,7 +12,7 @@ class AtScaleConfEventIE(InfoExtractor): 'info_dict': { 'id': 'data-scale-spring-2022', 'title': 'Data @Scale Spring 2022', - 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55', }, }, { 'url': 'https://atscaleconference.com/events/video-scale-2021/', @@ -20,15 +20,15 @@ class AtScaleConfEventIE(InfoExtractor): 'info_dict': { 'id': 'video-scale-2021', 'title': 'Video @Scale 2021', - 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55', }, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) return self.playlist_from_matches( re.findall(r'data-url\s*=\s*"(https?://(?:www\.)?atscaleconference\.com/videos/[^"]+)"', webpage), - ie='Generic', playlist_id=id, + ie='Generic', playlist_id=playlist_id, title=self._og_search_title(webpage), description=self._og_search_description(webpage)) diff --git a/yt_dlp/extractor/atttechchannel.py b/yt_dlp/extractor/atttechchannel.py 
b/yt_dlp/extractor/atttechchannel.py deleted file mode 100644 index 6ff4ec0ad3..0000000000 --- a/yt_dlp/extractor/atttechchannel.py +++ /dev/null @@ -1,53 +0,0 @@ -from .common import InfoExtractor -from ..utils import unified_strdate - - -class ATTTechChannelIE(InfoExtractor): - _VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P<id>.+)' - _TEST = { - 'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use', - 'info_dict': { - 'id': '11316', - 'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use', - 'ext': 'flv', - 'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use', - 'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140127', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_url = self._search_regex( - r"url\s*:\s*'(rtmp://[^']+)'", - webpage, 'video URL') - - video_id = self._search_regex( - r'mediaid\s*=\s*(\d+)', - webpage, 'video id', fatal=False) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._search_regex( - r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})', - webpage, 'upload date', fatal=False), False) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'ext': 'flv', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } diff --git a/yt_dlp/extractor/atvat.py b/yt_dlp/extractor/atvat.py index d6ed9e4958..37bb616952 100644 --- a/yt_dlp/extractor/atvat.py +++ b/yt_dlp/extractor/atvat.py @@ -1,11 +1,11 @@ -import datetime +import datetime as dt from .common import InfoExtractor from ..utils import ( + ExtractorError, float_or_none, jwt_encode_hs256, try_get, - ExtractorError, ) @@ -19,7 +19,7 @@ class ATVAtIE(InfoExtractor): 'id': 'v-ce9cgn1e70n5-1', 'ext': 'mp4', 'title': 'Bauer sucht Frau - Staffel 18 Folge 3 - Die Hofwochen', - } + }, }, { 'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/episode-01/bauer-sucht-frau-staffel-18-vorstellungsfolge-1', 'only_matching': True, @@ -66,14 +66,14 @@ def _real_extract(self, url): video_id=video_id) video_title = json_data['views']['default']['page']['title'] - contentResource = json_data['views']['default']['page']['contentResource'] - content_id = contentResource[0]['id'] - content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']} - for id, content in enumerate(contentResource)] + content_resource = json_data['views']['default']['page']['contentResource'] + content_id = content_resource[0]['id'] + content_ids = [{'id': id_, 'subclip_start': content['start'], 'subclip_end': content['end']} + for id_, content in enumerate(content_resource)] - time_of_request = datetime.datetime.now() - not_before = time_of_request - datetime.timedelta(minutes=5) - expire = time_of_request + datetime.timedelta(minutes=5) + time_of_request = dt.datetime.now() + not_before = time_of_request - dt.timedelta(minutes=5) + expire = time_of_request + dt.timedelta(minutes=5) payload = { 'content_ids': { content_id: content_ids, @@ -87,17 +87,17 @@ def _real_extract(self, url): videos = self._download_json( 'https://vas-v4.p7s1video.net/4.0/getsources', content_id, 'Downloading videos JSON', query={ - 'token': jwt_token.decode('utf-8') + 'token': jwt_token.decode('utf-8'), }) - video_id, videos_data = list(videos['data'].items())[0] + video_id, videos_data = next(iter(videos['data'].items())) error_msg = try_get(videos_data, lambda x: x['error']['title']) if error_msg == 'Geo check failed': self.raise_geo_restricted(error_msg) elif error_msg: raise ExtractorError(error_msg) entries = [ - self._extract_video_info(url, contentResource[video['id']], video) + self._extract_video_info(url, content_resource[video['id']], video) for video in videos_data] return {
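The atvat.py change above keeps signing a short-lived HS256 JWT (valid from five minutes before to five minutes after the request) for the getsources call; only the datetime import style changes. A standalone sketch of how such a token can be built; this mirrors the shape of yt-dlp's jwt_encode_hs256 utility but is an independent toy implementation, and the secret and extra claims are placeholders:

    # Sketch: base64url-encode header and payload, sign with HMAC-SHA256.
    import base64
    import datetime as dt
    import hashlib
    import hmac
    import json

    def b64url(data: bytes) -> bytes:
        # JWT uses unpadded base64url
        return base64.urlsafe_b64encode(data).rstrip(b'=')

    def jwt_encode_hs256(payload: dict, key: str) -> bytes:
        header = b64url(json.dumps({'alg': 'HS256', 'typ': 'JWT'}).encode())
        body = b64url(json.dumps(payload).encode())
        signing_input = header + b'.' + body
        signature = b64url(hmac.new(key.encode(), signing_input, hashlib.sha256).digest())
        return signing_input + b'.' + signature

    # Token valid in a +/- 5 minute window around "now", as in the code above
    now = dt.datetime.now(dt.timezone.utc)
    token = jwt_encode_hs256({
        'nbf': int((now - dt.timedelta(minutes=5)).timestamp()),
        'exp': int((now + dt.timedelta(minutes=5)).timestamp()),
    }, 'placeholder-secret')

The narrow nbf/exp window limits how long a leaked token stays usable while still tolerating modest clock skew between client and server.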
diff --git a/yt_dlp/extractor/audimedia.py b/yt_dlp/extractor/audimedia.py index 35114e5455..c5a9c7e294 100644 --- a/yt_dlp/extractor/audimedia.py +++ b/yt_dlp/extractor/audimedia.py @@ -19,7 +19,7 @@ class AudiMediaIE(InfoExtractor): 'timestamp': 1448354940, 'duration': 74022, 'view_count': int, - } + }, }, { 'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991', 'only_matching': True, @@ -73,7 +73,7 @@ def _real_extract(self, url): bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None) if bitrate: f.update({ - 'format_id': 'http-%s' % bitrate, + 'format_id': f'http-{bitrate}', }) formats.append(f) diff --git a/yt_dlp/extractor/audioboom.py b/yt_dlp/extractor/audioboom.py index a23fcd2999..751b74add7 100644 --- a/yt_dlp/extractor/audioboom.py +++ b/yt_dlp/extractor/audioboom.py @@ -15,7 +15,7 @@ class AudioBoomIE(InfoExtractor): 'duration': 4000.99, 'uploader': 'Sue Perkins: An hour or so with...', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', - } + }, }, { # Direct mp3-file link 'url': 'https://audioboom.com/posts/8128496.mp3', 'md5': 'e329edf304d450def95c7f86a9165ee1', @@ -27,7 +27,7 @@ class AudioBoomIE(InfoExtractor): 'duration': 1689.7, 'uploader': 'Lost Dot Podcast: The Trans Pyrenees and Transcontinental Race', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channels/5003904', - } + }, }, { 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', 'only_matching': True, diff --git a/yt_dlp/extractor/audiodraft.py b/yt_dlp/extractor/audiodraft.py index 71e5afd8c8..484ad4e1ab 100644 --- a/yt_dlp/extractor/audiodraft.py +++ b/yt_dlp/extractor/audiodraft.py @@ -9,7 +9,7 @@ def _audiodraft_extract_from_id(self, player_entry_id): headers={ 'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 'X-Requested-With': 'XMLHttpRequest', - }, data=f'id={player_entry_id}'.encode('utf-8')) + }, data=f'id={player_entry_id}'.encode()) return { 'id': str(data_json['entry_id']), @@ -65,9 +65,10 @@ class AudiodraftCustomIE(AudiodraftBaseIE): }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - player_entry_id = self._search_regex(r'playAudio\(\'(player_entry_\d+)\'\);', webpage, id, 'play entry id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_entry_id = self._search_regex( + r'playAudio\(\'(player_entry_\d+)\'\);', webpage, 'play entry id') return self._audiodraft_extract_from_id(player_entry_id) @@ -89,5 +90,5 @@ class AudiodraftGenericIE(AudiodraftBaseIE): }] def _real_extract(self, url): - id = self._match_id(url) - return self._audiodraft_extract_from_id(f'player_entry_{id}') + video_id = self._match_id(url) + return self._audiodraft_extract_from_id(f'player_entry_{video_id}') diff --git a/yt_dlp/extractor/audiomack.py 
b/yt_dlp/extractor/audiomack.py index 5c4160fe46..1d4460c9f8 100644 --- a/yt_dlp/extractor/audiomack.py +++ b/yt_dlp/extractor/audiomack.py @@ -3,7 +3,6 @@ from .common import InfoExtractor from .soundcloud import SoundcloudIE -from ..compat import compat_str from ..utils import ( ExtractorError, url_basename, @@ -22,8 +21,8 @@ class AudiomackIE(InfoExtractor): 'id': '310086', 'ext': 'mp3', 'uploader': 'Roosh Williams', - 'title': 'Extraordinary' - } + 'title': 'Extraordinary', + }, }, # audiomack wrapper around soundcloud song # Needs new test URL. @@ -56,7 +55,7 @@ def _real_extract(self, url): # API is inconsistent with errors if 'url' not in api_response or not api_response['url'] or 'error' in api_response: - raise ExtractorError('Invalid url %s' % url) + raise ExtractorError(f'Invalid url {url}') # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor @@ -64,7 +63,7 @@ def _real_extract(self, url): return self.url_result(api_response['url'], SoundcloudIE.ie_key()) return { - 'id': compat_str(api_response.get('id', album_url_tag)), + 'id': str(api_response.get('id', album_url_tag)), 'uploader': api_response.get('artist'), 'title': api_response.get('title'), 'url': api_response['url'], @@ -82,8 +81,8 @@ class AudiomackAlbumIE(InfoExtractor): 'info_dict': { 'id': '812251', - 'title': 'Tha Tour: Part 2 (Official Mixtape)' - } + 'title': 'Tha Tour: Part 2 (Official Mixtape)', + }, }, # Album playlist ripped from fakeshoredrive with no metadata { @@ -98,16 +97,16 @@ class AudiomackAlbumIE(InfoExtractor): 'id': '837576', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', - } + }, }, { 'info_dict': { 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. 
G Herbo',
-                }
+                },
             }],
-        }
+        },
     ]
 
     def _real_extract(self, url):
@@ -123,12 +122,12 @@ def _real_extract(self, url):
             api_response = self._download_json(
                 'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d'
                 % (album_url_tag, track_no, time.time()), album_url_tag,
-                note='Querying song information (%d)' % (track_no + 1))
+                note=f'Querying song information ({track_no + 1})')
 
             # Total failure, only occurs when url is totally wrong
             # Won't happen in middle of valid playlist (next case)
             if 'url' not in api_response or 'error' in api_response:
-                raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url))
+                raise ExtractorError(f'Invalid url for track {track_no} of album url {url}')
             # URL is good but song id doesn't exist - usually means end of playlist
             elif not api_response['url']:
                 break
@@ -136,10 +135,10 @@ def _real_extract(self, url):
             # Pull out the album metadata and add to result (if it exists)
             for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]:
                 if apikey in api_response and resultkey not in result:
-                    result[resultkey] = compat_str(api_response[apikey])
+                    result[resultkey] = str(api_response[apikey])
             song_id = url_basename(api_response['url']).rpartition('.')[0]
             result['entries'].append({
-                'id': compat_str(api_response.get('id', song_id)),
+                'id': str(api_response.get('id', song_id)),
                 'uploader': api_response.get('artist'),
                 'title': api_response.get('title', song_id),
                 'url': api_response['url'],
diff --git a/yt_dlp/extractor/audius.py b/yt_dlp/extractor/audius.py
index 6448b449b9..c611c6e081 100644
--- a/yt_dlp/extractor/audius.py
+++ b/yt_dlp/extractor/audius.py
@@ -1,7 +1,7 @@
 import random
+import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import compat_str, compat_urllib_parse_unquote
 from ..utils import ExtractorError, str_or_none, try_get
 
 
@@ -15,13 +15,13 @@ def _get_response_data(self, response):
         if response_data is not None:
             return response_data
         if len(response) == 1 and 'message' in response:
-            raise ExtractorError('API error: %s' % response['message'],
+            raise ExtractorError('API error: {}'.format(response['message']),
                                  expected=True)
         raise ExtractorError('Unexpected API response')
 
     def _select_api_base(self):
         """Selecting one of the currently available API hosts"""
-        response = super(AudiusBaseIE, self)._download_json(
+        response = super()._download_json(
             'https://api.audius.co/', None,
             note='Requesting available API hosts',
             errnote='Unable to request available API hosts')
@@ -41,8 +41,8 @@ def _prepare_url(url, title):
         anything from this link, since the Audius API won't be able to
         resolve this url
         """
-        url = compat_urllib_parse_unquote(url)
-        title = compat_urllib_parse_unquote(title)
+        url = urllib.parse.unquote(url)
+        title = urllib.parse.unquote(title)
         if '/' in title or '%2F' in title:
             fixed_title = title.replace('/', '%5C').replace('%2F', '%5C')
             return url.replace(title, fixed_title)
@@ -54,19 +54,19 @@ def _api_request(self, path, item_id=None, note='Downloading JSON metadata',
         if self._API_BASE is None:
             self._select_api_base()
         try:
-            response = super(AudiusBaseIE, self)._download_json(
-                '%s%s%s' % (self._API_BASE, self._API_V, path), item_id, note=note,
+            response = super()._download_json(
+                f'{self._API_BASE}{self._API_V}{path}', item_id, note=note,
                 errnote=errnote, expected_status=expected_status)
         except ExtractorError as exc:
             # some of Audius API hosts may not work as expected and return HTML
-            if 'Failed to parse JSON' in compat_str(exc):
+            if 'Failed to parse JSON' in str(exc):
                 raise ExtractorError('An error occurred while receiving data. Try again',
                                      expected=True)
             raise exc
         return self._get_response_data(response)
 
     def _resolve_url(self, url, item_id):
-        return self._api_request('/resolve?url=%s' % url, item_id,
+        return self._api_request(f'/resolve?url={url}', item_id,
                                  expected_status=404)
@@ -91,7 +91,7 @@ class AudiusIE(AudiusBaseIE):
             'view_count': int,
             'like_count': int,
             'repost_count': int,
-        }
+        },
     },
     {
         # Regular track
@@ -109,14 +109,14 @@ class AudiusIE(AudiusBaseIE):
             'view_count': int,
             'like_count': int,
             'repost_count': int,
-        }
+        },
     },
     ]
 
     _ARTWORK_MAP = {
-        "150x150": 150,
-        "480x480": 480,
-        "1000x1000": 1000
+        '150x150': 150,
+        '480x480': 480,
+        '1000x1000': 1000,
     }
 
     def _real_extract(self, url):
@@ -130,7 +130,7 @@ def _real_extract(self, url):
         else:  # API link
             title = None
             # uploader = None
-            track_data = self._api_request('/tracks/%s' % track_id, track_id)
+            track_data = self._api_request(f'/tracks/{track_id}', track_id)
 
         if not isinstance(track_data, dict):
             raise ExtractorError('Unexpected API response')
@@ -144,7 +144,7 @@ def _real_extract(self, url):
         if isinstance(artworks_data, dict):
             for quality_key, thumbnail_url in artworks_data.items():
                 thumbnail = {
-                    "url": thumbnail_url
+                    'url': thumbnail_url,
                 }
                 quality_code = self._ARTWORK_MAP.get(quality_key)
                 if quality_code is not None:
@@ -154,12 +154,12 @@ def _real_extract(self, url):
         return {
             'id': track_id,
             'title': track_data.get('title', title),
-            'url': '%s/v1/tracks/%s/stream' % (self._API_BASE, track_id),
+            'url': f'{self._API_BASE}/v1/tracks/{track_id}/stream',
             'ext': 'mp3',
             'description': track_data.get('description'),
             'duration': track_data.get('duration'),
             'track': track_data.get('title'),
-            'artist': try_get(track_data, lambda x: x['user']['name'], compat_str),
+            'artist': try_get(track_data, lambda x: x['user']['name'], str),
             'genre': track_data.get('genre'),
             'thumbnails': thumbnails,
             'view_count': track_data.get('play_count'),
@@ -175,11 +175,11 @@ class AudiusTrackIE(AudiusIE):  # XXX: Do not subclass from concrete IE
     _TESTS = [
         {
             'url': 'audius:9RWlo',
-            'only_matching': True
+            'only_matching': True,
         },
         {
             'url': 'audius:http://discoveryprovider.audius.prod-us-west-2.staked.cloud/v1/tracks/9RWlo',
-            'only_matching': True
+            'only_matching': True,
         },
     ]
 
@@ -207,7 +207,7 @@ def _build_playlist(self, tracks):
             if not track_id:
                 raise ExtractorError('Unable to get track ID from playlist')
             entries.append(self.url_result(
-                'audius:%s' % track_id,
+                f'audius:{track_id}',
                 ie=AudiusTrackIE.ie_key(), video_id=track_id))
         return entries
 
@@ -231,7 +231,7 @@ def _real_extract(self, url):
             raise ExtractorError('Unable to get playlist ID')
 
         playlist_tracks = self._api_request(
-            '/playlists/%s/tracks' % playlist_id,
+            f'/playlists/{playlist_id}/tracks',
             title, note='Downloading playlist tracks metadata',
             errnote='Unable to download playlist tracks metadata')
         if not isinstance(playlist_tracks, list):
@@ -267,5 +267,5 @@ def _real_extract(self, url):
         profile_audius_id = _profile_data[0]['id']
         profile_bio = _profile_data[0].get('bio')
 
-        api_call = self._api_request('/full/users/handle/%s/tracks' % profile_id, profile_id)
+        api_call = self._api_request(f'/full/users/handle/{profile_id}/tracks', profile_id)
         return self.playlist_result(self._build_playlist(api_call), profile_audius_id, profile_id, profile_bio)
diff --git a/yt_dlp/extractor/awaan.py b/yt_dlp/extractor/awaan.py
index 6fc938de9c..4066a5a83f 100644
--- a/yt_dlp/extractor/awaan.py
+++ b/yt_dlp/extractor/awaan.py
@@ -1,10 +1,7 @@
 import base64
+import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_urlencode,
-    compat_str,
-)
 from ..utils import (
     format_field,
     int_or_none,
@@ -22,14 +19,14 @@ def _real_extract(self, url):
         show_id, video_id, season_id = self._match_valid_url(url).groups()
         if video_id and int(video_id) > 0:
             return self.url_result(
-                'http://awaan.ae/media/%s' % video_id, 'AWAANVideo')
+                f'http://awaan.ae/media/{video_id}', 'AWAANVideo')
         elif season_id and int(season_id) > 0:
             return self.url_result(smuggle_url(
-                'http://awaan.ae/program/season/%s' % season_id,
+                f'http://awaan.ae/program/season/{season_id}',
                 {'show_id': show_id}), 'AWAANSeason')
         else:
             return self.url_result(
-                'http://awaan.ae/program/%s' % show_id, 'AWAANSeason')
+                f'http://awaan.ae/program/{show_id}', 'AWAANSeason')
 
 
 class AWAANBaseIE(InfoExtractor):
@@ -75,11 +72,11 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         video_data = self._download_json(
-            'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
+            f'http://admin.mangomolo.com/analytics/index.php/plus/video?id={video_id}',
             video_id, headers={'Origin': 'http://awaan.ae'})
         info = self._parse_video_data(video_data, video_id, False)
 
-        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse_urlencode({
+        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + urllib.parse.urlencode({
             'id': video_data['id'],
             'user_id': video_data['user_id'],
             'signature': video_data['signature'],
@@ -117,11 +114,11 @@ def _real_extract(self, url):
         channel_id = self._match_id(url)
         channel_data = self._download_json(
-            'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id,
+            f'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id={channel_id}',
             channel_id, headers={'Origin': 'http://awaan.ae'})
         info = self._parse_video_data(channel_data, channel_id, True)
 
-        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse_urlencode({
+        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + urllib.parse.urlencode({
             'id': base64.b64encode(channel_data['user_id'].encode()).decode(),
             'channelid': base64.b64encode(channel_data['id'].encode()).decode(),
             'signature': channel_data['signature'],
@@ -159,7 +156,7 @@ def _real_extract(self, url):
         show_id = smuggled_data.get('show_id')
         if show_id is None:
             season = self._download_json(
-                'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id,
+                f'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id={season_id}',
                 season_id, headers={'Origin': 'http://awaan.ae'})
             show_id = season['id']
         data['show_id'] = show_id
@@ -167,7 +164,7 @@ def _real_extract(self, url):
             'http://admin.mangomolo.com/analytics/index.php/plus/show',
             show_id, data=urlencode_postdata(data), headers={
                 'Origin': 'http://awaan.ae',
-                'Content-Type': 'application/x-www-form-urlencoded'
+                'Content-Type': 'application/x-www-form-urlencoded',
             })
         if not season_id:
             season_id = show['default_season']
@@ -177,8 +174,8 @@ def _real_extract(self, url):
 
         entries = []
         for video in show['videos']:
-            video_id = compat_str(video['id'])
+            video_id = str(video['id'])
             entries.append(self.url_result(
-                'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id))
+                f'http://awaan.ae/media/{video_id}', 'AWAANVideo', video_id))
 
         return self.playlist_result(entries, season_id, title)
diff --git a/yt_dlp/extractor/aws.py b/yt_dlp/extractor/aws.py
index eb831a1530..177c410275 100644
--- a/yt_dlp/extractor/aws.py
+++ b/yt_dlp/extractor/aws.py
@@ -1,9 +1,9 @@
-import datetime
+import datetime as dt
 import hashlib
 import hmac
+import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
 
 
 class AWSIE(InfoExtractor):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
@@ -12,26 +12,26 @@ class AWSIE(InfoExtractor):  # XXX: Conventionally, base classes should end with
     def _aws_execute_api(self, aws_dict, video_id, query=None):
         query = query or {}
-        amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
+        amz_date = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
         date = amz_date[:8]
         headers = {
             'Accept': 'application/json',
             'Host': self._AWS_PROXY_HOST,
             'X-Amz-Date': amz_date,
-            'X-Api-Key': self._AWS_API_KEY
+            'X-Api-Key': self._AWS_API_KEY,
         }
         session_token = aws_dict.get('session_token')
         if session_token:
             headers['X-Amz-Security-Token'] = session_token
 
         def aws_hash(s):
-            return hashlib.sha256(s.encode('utf-8')).hexdigest()
+            return hashlib.sha256(s.encode()).hexdigest()
 
         # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
-        canonical_querystring = compat_urllib_parse_urlencode(query)
+        canonical_querystring = urllib.parse.urlencode(query)
         canonical_headers = ''
         for header_name, header_value in sorted(headers.items()):
-            canonical_headers += '%s:%s\n' % (header_name.lower(), header_value)
+            canonical_headers += f'{header_name.lower()}:{header_value}\n'
         signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())])
         canonical_request = '\n'.join([
             'GET',
@@ -39,7 +39,7 @@ def aws_hash(s):
             canonical_querystring,
             canonical_headers,
             signed_headers,
-            aws_hash('')
+            aws_hash(''),
         ])
 
         # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html
@@ -49,7 +49,7 @@ def aws_hash(s):
 
         # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html
         def aws_hmac(key, msg):
-            return hmac.new(key, msg.encode('utf-8'), hashlib.sha256)
+            return hmac.new(key, msg.encode(), hashlib.sha256)
 
         def aws_hmac_digest(key, msg):
             return aws_hmac(key, msg).digest()
@@ -57,7 +57,7 @@ def aws_hmac_digest(key, msg):
         def aws_hmac_hexdigest(key, msg):
             return aws_hmac(key, msg).hexdigest()
 
-        k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8')
+        k_signing = ('AWS4' + aws_dict['secret_key']).encode()
         for value in credential_scope_list:
             k_signing = aws_hmac_digest(k_signing, value)
 
@@ -65,11 +65,11 @@ def aws_hmac_hexdigest(key, msg):
 
         # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html
         headers['Authorization'] = ', '.join([
-            '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope),
-            'SignedHeaders=%s' % signed_headers,
-            'Signature=%s' % signature,
+            '{} Credential={}/{}'.format(self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope),
+            f'SignedHeaders={signed_headers}',
+            f'Signature={signature}',
         ])
 
         return self._download_json(
-            'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''),
+            'https://{}{}{}'.format(self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''),
             video_id, headers=headers)
diff --git a/yt_dlp/extractor/axs.py b/yt_dlp/extractor/axs.py
new file mode 100644
index 0000000000..7e91667712
--- /dev/null
+++ b/yt_dlp/extractor/axs.py
@@ -0,0 +1,89 @@
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    js_to_json,
+    parse_iso8601,
+    traverse_obj,
+    url_or_none,
+)
+
+
+class AxsIE(InfoExtractor):
+    IE_NAME = 'axs.tv'
+    _VALID_URL = r'https?://(?:www\.)?axs\.tv/(?:channel/(?:[^/?#]+/)+)?video/(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        'url': 'https://www.axs.tv/video/5f4dc776b70e4f1c194f22ef/',
+        'md5': '8d97736ae8e50c64df528e5e676778cf',
+        'info_dict': {
+            'id': '5f4dc776b70e4f1c194f22ef',
+            'title': 'Small Town',
+            'ext': 'mp4',
+            'description': 'md5:e314d28bfaa227a4d7ec965fae19997f',
+            'upload_date': '20230602',
+            'timestamp': 1685729564,
+            'duration': 1284.216,
+            'series': 'Rock & Roll Road Trip with Sammy Hagar',
+            'season': 'Season 2',
+            'season_number': 2,
+            'episode': '3',
+            'thumbnail': 'https://images.dotstudiopro.com/5f4e9d330a0c3b295a7e8394',
+        },
+    }, {
+        'url': 'https://www.axs.tv/channel/rock-star-interview/video/daryl-hall',
+        'md5': '300ae795cd8f9984652c0949734ffbdc',
+        'info_dict': {
+            'id': '5f488148b70e4f392572977c',
+            'display_id': 'daryl-hall',
+            'title': 'Daryl Hall',
+            'ext': 'mp4',
+            'description': 'md5:e54ecaa0f4b5683fc9259e9e4b196628',
+            'upload_date': '20230214',
+            'timestamp': 1676403615,
+            'duration': 2570.668,
+            'series': 'The Big Interview with Dan Rather',
+            'season': 'Season 3',
+            'season_number': 3,
+            'episode': '5',
+            'thumbnail': 'https://images.dotstudiopro.com/5f4d1901f340b50d937cec32',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        webpage_json_data = self._search_json(
+            r'mountObj\s*=', webpage, 'video ID data', display_id,
+            transform_source=js_to_json)
+        video_id = webpage_json_data['video_id']
+        company_id = webpage_json_data['company_id']
+
+        meta = self._download_json(
+            f'https://api.myspotlight.tv/dotplayer/video/{company_id}/{video_id}',
+            video_id, query={'device_type': 'desktop_web'})['video']
+
+        formats = self._extract_m3u8_formats(
+            meta['video_m3u8'], video_id, 'mp4', m3u8_id='hls')
+
+        subtitles = {}
+        for cc in traverse_obj(meta, ('closeCaption', lambda _, v: url_or_none(v['srtPath']))):
subtitles.setdefault(cc.get('srtShortLang') or 'en', []).append( + {'ext': cc.get('srtExt'), 'url': cc['srtPath']}) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + **traverse_obj(meta, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'series': ('seriestitle', {str}), + 'season_number': ('season', {int}), + 'episode': ('episode', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('updated_at', {parse_iso8601}), + 'thumbnail': ('thumb', {url_or_none}), + }), + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py index d1686eed64..0e3a03f03f 100644 --- a/yt_dlp/extractor/azmedien.py +++ b/yt_dlp/extractor/azmedien.py @@ -38,14 +38,14 @@ class AZMedienIE(InfoExtractor): 'timestamp': 1538328802, 'view_count': int, 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031', - 'duration': 1930 + 'duration': 1930, }, 'params': { 'skip_download': True, }, }, { 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', - 'only_matching': True + 'only_matching': True, }] _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' @@ -62,5 +62,5 @@ def _real_extract(self, url): })['data']['context']['mainAsset']['video']['kaltura']['kalturaId'] return self.url_result( - 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + f'kaltura:{self._PARTNER_ID}:{entry_id}', ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/yt_dlp/extractor/baidu.py b/yt_dlp/extractor/baidu.py index 8786d67e06..a1ad4240ff 100644 --- a/yt_dlp/extractor/baidu.py +++ b/yt_dlp/extractor/baidu.py @@ -24,8 +24,9 @@ class BaiduVideoIE(InfoExtractor): }] def _call_api(self, path, category, playlist_id, note): - return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % ( - path, category, playlist_id), playlist_id, note) + return self._download_json( + f'http://app.video.baidu.com/{path}/?worktype=adnative{category}&id={playlist_id}', + playlist_id, note) def _real_extract(self, url): category, playlist_id = self._match_valid_url(url).groups() @@ -44,7 +45,7 @@ def _real_extract(self, url): 'xqsingle', category, playlist_id, 'Download episodes JSON metadata') entries = [self.url_result( - episode['url'], video_title=episode['title'] + episode['url'], video_title=episode['title'], ) for episode in episodes_detail['videos']] return self.playlist_result( diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index c873425656..148a1705ef 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -1,15 +1,16 @@ import math +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_parse_qs, -) from ..utils import ( - format_field, InAdvancePagedList, + determine_ext, + format_field, + int_or_none, + join_nonempty, traverse_obj, unified_timestamp, + url_or_none, ) @@ -20,8 +21,8 @@ class BanByeBaseIE(InfoExtractor): @staticmethod def _extract_playlist_id(url, param='playlist'): - return compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get(param, [None])[0] + return urllib.parse.parse_qs( + urllib.parse.urlparse(url).query).get(param, [None])[0] def _extract_playlist(self, playlist_id): data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id) @@ -31,8 +32,9 @@ def _extract_playlist(self, 
playlist_id): class BanByeIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P<id>[\w-]+)' _TESTS = [{ + # ['src']['mp4']['levels'] direct mp4 urls only 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', 'info_dict': { @@ -59,7 +61,70 @@ class BanByeIE(BanByeBaseIE): 'title': 'Krzysztof Karoń', 'id': 'p_Ld82N6gBw_OJ', }, - 'playlist_count': 9, + 'playlist_mincount': 9, + }, { + # ['src']['mp4']['levels'] direct mp4 urls only + 'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD', + 'info_dict': { + 'id': 'v_kb6_o1Kyq-CD', + 'ext': 'mp4', + 'title': 'Co tak naprawdę dzieje się we Francji?! Czy Warszawa a potem cała Polska będzie drugim Paryżem?!🤔🇵🇱', + 'description': 'md5:82be4c0e13eae8ea1ca8b9f2e07226a8', + 'uploader': 'Marcin Rola - MOIM ZDANIEM!🇵🇱', + 'channel_id': 'ch_QgWnHvDG2fo5', + 'channel_url': 'https://banbye.com/channel/ch_QgWnHvDG2fo5', + 'duration': 597, + 'timestamp': 1688642656, + 'upload_date': '20230706', + 'thumbnail': 'https://cdn.banbye.com/video/v_kb6_o1Kyq-CD/96.webp', + 'tags': ['Paryż', 'Francja', 'Polska', 'Imigranci', 'Morawiecki', 'Tusk'], + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + # ['src']['hls']['levels'] variant m3u8 urls only; master m3u8 is 404 + 'url': 'https://banbye.com/watch/v_a_gPFuC9LoW5', + 'info_dict': { + 'id': 'v_a_gPFuC9LoW5', + 'ext': 'mp4', + 'title': 'md5:183524056bebdfa245fd6d214f63c0fe', + 'description': 'md5:943ac87287ca98d28d8b8797719827c6', + 'uploader': 'wRealu24', + 'channel_id': 'ch_wrealu24', + 'channel_url': 'https://banbye.com/channel/ch_wrealu24', + 'upload_date': '20231113', + 'timestamp': 1699874062, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.banbye.com/video/v_a_gPFuC9LoW5/96.webp', + 'tags': ['jaszczur', 'sejm', 'lewica', 'polska', 'ukrainizacja', 'pierwszeposiedzeniesejmu'], + }, + 'expected_warnings': ['Failed to download m3u8'], + }, { + # ['src']['hls']['masterPlaylist'] m3u8 only + 'url': 'https://banbye.com/watch/v_B0rsKWsr-aaa', + 'info_dict': { + 'id': 'v_B0rsKWsr-aaa', + 'ext': 'mp4', + 'title': 'md5:00b254164b82101b3f9e5326037447ed', + 'description': 'md5:3fd8b48aa81954ba024bc60f5de6e167', + 'uploader': 'PSTV Piotr Szlachtowicz ', + 'channel_id': 'ch_KV9EVObkB9wB', + 'channel_url': 'https://banbye.com/channel/ch_KV9EVObkB9wB', + 'upload_date': '20240629', + 'timestamp': 1719646816, + 'duration': 2377, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.banbye.com/video/v_B0rsKWsr-aaa/96.webp', + 'tags': ['Biden', 'Trump', 'Wybory', 'USA'], + }, }] def _real_extract(self, url): @@ -74,11 +139,24 @@ def _real_extract(self, url): 'id': f'{quality}p', 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp', } for quality in [48, 96, 144, 240, 512, 1080]] - formats = [{ - 'format_id': f'http-{quality}p', - 'quality': quality, - 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', - } for quality in data['quality']] + + formats = [] + url_data = self._download_json(f'{self._API_BASE}/videos/{video_id}/url', video_id, data=b'') + if master_url := traverse_obj(url_data, ('src', 'hls', 'masterPlaylist', {url_or_none})): + formats = self._extract_m3u8_formats(master_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + for format_id, format_url in traverse_obj(url_data, ( + 'src', 
('mp4', 'hls'), 'levels', {dict.items}, lambda _, v: url_or_none(v[1]))): + ext = determine_ext(format_url) + is_hls = ext == 'm3u8' + formats.append({ + 'url': format_url, + 'ext': 'mp4' if is_hls else ext, + 'format_id': join_nonempty(is_hls and 'hls', format_id), + 'protocol': 'm3u8_native' if is_hls else 'https', + 'height': int_or_none(format_id), + }) + self._remove_duplicate_formats(formats) return { 'id': video_id, @@ -100,7 +178,7 @@ def _real_extract(self, url): class BanByeChannelIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?channel/(?P<id>\w+)' _TESTS = [{ 'url': 'https://banbye.com/channel/ch_wrealu24', 'info_dict': { @@ -132,7 +210,7 @@ def page_func(page_num): 'sort': 'new', 'limit': self._PAGE_SIZE, 'offset': page_num * self._PAGE_SIZE, - }, note=f'Downloading page {page_num+1}') + }, note=f'Downloading page {page_num + 1}') return [ self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE) for video in data['items'] diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index e89b3a69b3..61cbab5a7a 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -3,7 +3,6 @@ import time from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, @@ -42,7 +41,7 @@ class BandcampIE(InfoExtractor): 'uploader_id': 'youtube-dl', 'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg', }, - '_skip': 'There is a limit of 200 free downloads / month for the test song' + 'skip': 'There is a limit of 200 free downloads / month for the test song', }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', @@ -119,7 +118,7 @@ class BandcampIE(InfoExtractor): def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): return self._parse_json(self._html_search_regex( - r'data-%s=(["\'])({.+?})\1' % attr, webpage, + rf'data-{attr}=(["\'])({{.+?}})\1', webpage, attr + ' data', group=2), video_id, fatal=fatal) def _real_extract(self, url): @@ -167,7 +166,7 @@ def _real_extract(self, url): download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = compat_str(tralbum['id']) + track_id = str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') @@ -192,7 +191,7 @@ def _real_extract(self, url): if isinstance(download_formats_list, list): for f in blob['download_formats']: name, ext = f.get('name'), f.get('file_extension') - if all(isinstance(x, compat_str) for x in (name, ext)): + if all(isinstance(x, str) for x in (name, ext)): download_formats[name] = ext.strip('.') for format_id, f in downloads.items(): @@ -207,7 +206,7 @@ def _real_extract(self, url): }) format_id = f.get('encoding_name') or format_id stat = self._download_json( - stat_url, track_id, 'Downloading %s JSON' % format_id, + stat_url, track_id, f'Downloading {format_id} JSON', transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1], fatal=False) if not stat: @@ -225,7 +224,7 @@ def _real_extract(self, url): 'acodec': format_id.split('-')[0], }) - title = '%s - %s' % (artist, track) if artist else track + title = f'{artist} - {track}' if artist else track if not duration: duration = float_or_none(self._html_search_meta( @@ -267,7 +266,7 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE 'timestamp': 1311756226, 'upload_date': '20110727', 'uploader': 'Blazo', - } + }, }, { 
'md5': '1a2c32e2691474643e912cc6cd4bffaa', @@ -278,7 +277,7 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE 'timestamp': 1311757238, 'upload_date': '20110727', 'uploader': 'Blazo', - } + }, }, ], 'info_dict': { @@ -287,9 +286,9 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE 'uploader_id': 'blazo', }, 'params': { - 'playlistend': 2 + 'playlistend': 2, }, - 'skip': 'Bandcamp imposes download limits.' + 'skip': 'Bandcamp imposes download limits.', }, { 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave', 'info_dict': { @@ -324,7 +323,7 @@ class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE def suitable(cls, url): return (False if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) - else super(BandcampAlbumIE, cls).suitable(url)) + else super().suitable(url)) def _real_extract(self, url): uploader_id, album_id = self._match_valid_url(url).groups() @@ -376,7 +375,7 @@ class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', - 'only_matching': True + 'only_matching': True, }] def _real_extract(self, url): @@ -407,7 +406,7 @@ def _real_extract(self, url): title = show.get('audio_title') or 'Bandcamp Weekly' subtitle = show.get('subtitle') if subtitle: - title += ' - %s' % subtitle + title += f' - {subtitle}' return { 'id': show_id, @@ -419,7 +418,7 @@ def _real_extract(self, url): 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), 'episode_id': show_id, - 'formats': formats + 'formats': formats, } @@ -440,7 +439,7 @@ class BandcampUserIE(InfoExtractor): 'url': 'http://dotscale.bandcamp.com', 'info_dict': { 'id': 'dotscale', - 'title': 'Discography of dotscale' + 'title': 'Discography of dotscale', }, 'playlist_count': 1, }, { diff --git a/yt_dlp/extractor/bannedvideo.py b/yt_dlp/extractor/bannedvideo.py index 51e7220578..46f2978f7f 100644 --- a/yt_dlp/extractor/bannedvideo.py +++ b/yt_dlp/extractor/bannedvideo.py @@ -2,11 +2,11 @@ from .common import InfoExtractor from ..utils import ( - try_get, - int_or_none, - url_or_none, float_or_none, + int_or_none, + try_get, unified_timestamp, + url_or_none, ) @@ -23,7 +23,7 @@ class BannedVideoIE(InfoExtractor): 'description': 'md5:560d96f02abbebe6c6b78b47465f6b28', 'upload_date': '20200324', 'timestamp': 1585087895, - } + }, }] _GRAPHQL_GETMETADATA_QUERY = ''' @@ -84,15 +84,15 @@ class BannedVideoIE(InfoExtractor): 'GetCommentReplies': _GRAPHQL_GETCOMMENTSREPLIES_QUERY, } - def _call_api(self, video_id, id, operation, note): + def _call_api(self, video_id, id_var, operation, note): return self._download_json( 'https://api.infowarsmedia.com/graphql', video_id, note=note, headers={ - 'Content-Type': 'application/json; charset=utf-8' + 'Content-Type': 'application/json; charset=utf-8', }, data=json.dumps({ - 'variables': {'id': id}, + 'variables': {'id': id_var}, 'operationName': operation, - 'query': self._GRAPHQL_QUERIES[operation] + 'query': self._GRAPHQL_QUERIES[operation], }).encode('utf8')).get('data') def _get_comments(self, video_id, comments, comment_data): @@ -151,5 +151,5 @@ def _real_extract(self, url): 'tags': [tag.get('name') for tag in video_info.get('tags')], 'availability': self._availability(is_unlisted=video_info.get('unlisted')), 'comments': comments, - '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments')) + '__post_extractor': self.extract_comments(video_id, comments, 
video_json.get('getVideoComments')), } diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 9d28e70a3a..3af923f958 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -2,11 +2,11 @@ import itertools import json import re -import urllib.error +import urllib.parse import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str, compat_urlparse +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -15,11 +15,13 @@ float_or_none, get_element_by_class, int_or_none, + join_nonempty, js_to_json, parse_duration, parse_iso8601, parse_qs, strip_or_none, + traverse_obj, try_get, unescapeHTML, unified_timestamp, @@ -33,7 +35,7 @@ class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' - _VALID_URL = r'''(?x) + _VALID_URL = rf'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ (?: @@ -41,11 +43,10 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| - sounds/play/| events/[^/]+/play/[^/]+/ ) - (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) - ''' % _ID_REGEX + (?P<id>{_ID_REGEX})(?!/(?:episodes|broadcasts|clips)) + ''' _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)'] _LOGIN_URL = 'https://account.bbc.com/signin' @@ -74,7 +75,7 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, }, { 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', @@ -147,7 +148,7 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, }, { 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', 'note': 'Video', @@ -161,7 +162,7 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, }, { 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', 'info_dict': { @@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - }, { - 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', - 'note': 'Audio', - 'info_dict': { - 'id': 'm0007jz9', - 'ext': 'mp4', - 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', - 'description': "Live BBC Proms. 
West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", - 'duration': 9840, - }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -277,23 +264,23 @@ def _perform_login(self, username, password): post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Referer': self._LOGIN_URL}) - if self._LOGIN_URL in urlh.geturl(): + if self._LOGIN_URL in urlh.url: error = clean_html(get_element_by_class('form-message', response)) if error: raise ExtractorError( - 'Unable to login: %s' % error, expected=True) + f'Unable to login: {error}', expected=True) raise ExtractorError('Unable to log in') class MediaSelectionError(Exception): - def __init__(self, id): - self.id = id + def __init__(self, error_id): + self.id = error_id def _extract_asx_playlist(self, connection, programme_id): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] def _extract_items(self, playlist): - return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) + return playlist.findall(f'./{{{self._EMP_PLAYLIST_NS}}}item') def _extract_medias(self, media_selection): error = media_selection.get('result') @@ -325,21 +312,30 @@ def _get_subtitles(self, media, programme_id): def _raise_extractor_error(self, media_selection_error): raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + f'{self.IE_NAME} returned error: {media_selection_error.id}', expected=True) def _download_media_selector(self, programme_id): last_exception = None + formats, subtitles = [], {} for media_set in self._MEDIA_SETS: try: - return self._download_media_selector_url( + fmts, subs = self._download_media_selector_url( self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) + formats.extend(fmts) + if subs: + self._merge_subtitles(subs, target=subtitles) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e continue self._raise_extractor_error(e) - self._raise_extractor_error(last_exception) + if last_exception: + if formats or subtitles: + self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}') + else: + self._raise_extractor_error(last_exception) + return formats, subtitles def _download_media_selector_url(self, url, programme_id=None): media_selection = self._download_json( @@ -376,7 +372,7 @@ def _process_media_selector(self, media_selection, programme_id): for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): formats.append({ 'url': ref, - 'format_id': 'ref%s_%s' % (i, format_id), + 'format_id': f'ref{i}_{format_id}', }) elif transfer_format == 'dash': formats.extend(self._extract_mpd_formats( @@ -388,8 +384,8 @@ def _process_media_selector(self, media_selection, programme_id): href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) except ExtractorError as e: - if not (isinstance(e.exc_info[1], urllib.error.HTTPError) - and e.exc_info[1].code in (403, 404)): + if not (isinstance(e.exc_info[1], HTTPError) + and e.exc_info[1].status in (403, 404)): raise fmts = [] formats.extend(fmts) @@ -398,7 +394,7 @@ def _process_media_selector(self, media_selection, programme_id): href, programme_id, f4m_id=format_id, fatal=False)) else: if not supplier and bitrate: - format_id += '-%d' % bitrate + format_id += f'-{bitrate}' fmt 
= { 'format_id': format_id, 'filesize': file_size, @@ -427,9 +423,9 @@ def _process_media_selector(self, media_selection, programme_id): identifier = connection.get('identifier') server = connection.get('server') fmt.update({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'url': f'{protocol}://{server}/{application}?{auth_string}', 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), + 'app': f'{application}?{auth_string}', 'page_url': 'http://www.bbc.co.uk', 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', 'rtmp_live': False, @@ -445,7 +441,7 @@ def _process_media_selector(self, media_selection, programme_id): def _download_playlist(self, playlist_id): try: playlist = self._download_json( - 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + f'http://www.bbc.co.uk/programmes/{playlist_id}/playlist.json', playlist_id, 'Downloading playlist JSON') formats = [] subtitles = {} @@ -472,7 +468,7 @@ def _download_playlist(self, playlist_id): return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: - if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404): raise # fallback to legacy playlist @@ -484,32 +480,32 @@ def _process_legacy_playlist_url(self, url, display_id): def _process_legacy_playlist(self, playlist_id): return self._process_legacy_playlist_url( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + f'http://www.bbc.co.uk/iplayer/playlist/{playlist_id}', playlist_id) def _download_legacy_playlist_url(self, url, playlist_id=None): return self._download_xml( url, playlist_id, 'Downloading legacy playlist XML') def _extract_from_legacy_playlist(self, playlist, playlist_id): - no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS) + no_items = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}noItems') if no_items is not None: reason = no_items.get('reason') if reason == 'preAvailability': - msg = 'Episode %s is not yet available' % playlist_id + msg = f'Episode {playlist_id} is not yet available' elif reason == 'postAvailability': - msg = 'Episode %s is no longer available' % playlist_id + msg = f'Episode {playlist_id} is no longer available' elif reason == 'noMedia': - msg = 'Episode %s is not currently available' % playlist_id + msg = f'Episode {playlist_id} is not currently available' else: - msg = 'Episode %s is not available: %s' % (playlist_id, reason) + msg = f'Episode {playlist_id} is not available: {reason}' raise ExtractorError(msg, expected=True) for item in self._extract_items(playlist): kind = item.get('kind') if kind not in ('programme', 'radioProgramme'): continue - title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text - description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) + title = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}title').text + description_el = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}summary') description = description_el.text if description_el is not None else None def get_programme_id(item): @@ -519,7 +515,7 @@ def get_from_attributes(item): if value and re.match(r'^[pb][\da-z]{7}$', value): return value get_from_attributes(item) - mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS) + mediator = item.find(f'./{{{self._EMP_PLAYLIST_NS}}}mediator') if mediator is not None: return get_from_attributes(mediator) @@ -559,7 +555,7 @@ def 
_real_extract(self, url): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) + rf'"vpid"\s*:\s*"({self._ID_REGEX})"', webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) @@ -606,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', + 'title': 'Russia stages massive WW2 parade despite Western boycott', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, @@ -627,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', 'title': 'BUGGER', + 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$', }, 'playlist_count': 18, }, { @@ -635,16 +632,16 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p02mprgb', 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'md5:2868290467291b37feda7863f7a83f54', + 'title': 'Germanwings crash site aerial video', + 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', + 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg', }, 'params': { - # rtmp download 'skip_download': True, - } + }, }, { # article with single video embedded with data-playable containing XML playlist # with direct video links as progressiveDownloadUrl (for now these are extracted) @@ -660,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': 'now SIMORGH_DATA with no video', }, { # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { - 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'id': '39275083', + 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', + 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'timestamp': 1434713142, 'upload_date': '20150619', + 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg', }, 'params': { 'skip_download': True, - } + }, }, { # single video from video playlist embedded with vxp-playlist-data JSON 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', @@ -687,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { - # single video story with digitalData + # single video story with __PWA_PRELOADED_STATE__ 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'info_dict': { 'id': 'p02q6gc4', - 'ext': 'flv', - 'title': 'Sri Lanka’s spicy secret', - 'description': 'As a new train line to Jaffna opens up 
the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
-            'timestamp': 1437674293,
-            'upload_date': '20150723',
+            'ext': 'mp4',
+            'title': 'Tasting the spice of life in Jaffna',
+            'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
+            'timestamp': 1646058397,
+            'upload_date': '20220228',
+            'duration': 255,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
         # single video story without digitalData
         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -714,12 +713,10 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'timestamp': 1415867444,
             'upload_date': '20141113',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
+        'skip': 'redirects to TopGear home page',
     }, {
         # single video embedded with Morph
+        # TODO: replacement test page
        'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
        'info_dict': {
            'id': 'p041vhd0',
@@ -730,27 +727,22 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'uploader': 'BBC Sport',
             'uploader_id': 'bbc_sport',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'skip': 'Georestricted to UK',
+        'skip': 'Video no longer in page',
     }, {
-        # single video with playlist.sxml URL in playlist param
+        # single video in __INITIAL_DATA__
         'url': 'http://www.bbc.com/sport/0/football/33653409',
         'info_dict': {
             'id': 'p02xycnp',
             'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+            'title': 'Ronaldo to Man Utd, Arsenal to spend?',
+            'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
+            'timestamp': 1437750175,
+            'upload_date': '20150724',
+            'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
             'duration': 140,
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
-        # article with multiple videos embedded with playlist.sxml in playlist param
+        # article with multiple videos embedded with Morph.setPayload
         'url': 'http://www.bbc.com/sport/0/football/34475836',
         'info_dict': {
             'id': '34475836',
@@ -758,6 +750,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
         },
         'playlist_count': 3,
+    }, {
+        # Testing noplaylist
+        'url': 'http://www.bbc.com/sport/0/football/34475836',
+        'info_dict': {
+            'id': 'p034ppnv',
+            'ext': 'mp4',
+            'title': 'All you need to know about Jurgen Klopp',
+            'timestamp': 1444335081,
+            'upload_date': '20151008',
+            'duration': 122.0,
+            'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
+        },
+        'params': {
+            'noplaylist': True,
+        },
     }, {
         # school report article with single video
         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@@ -766,6 +773,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'title': 'School which breaks down barriers in Jerusalem',
         },
         'playlist_count': 1,
+        'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
     }, {
         # single video with playlist URL from weather section
         'url': 'http://www.bbc.com/weather/features/33601775',
@@ -782,18 +790,33 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'thumbnail': r're:https?://.+/.+\.jpg',
             'timestamp': 1437785037,
             'upload_date': '20150725',
+            'duration': 105,
         },
     }, {
         # video with window.__INITIAL_DATA__ and value as JSON string
         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
         'info_dict': {
-            'id': 'p0b71qth',
+            'id': 'p0b779gc',
             'ext': 'mp4',
             'title': 'Why France is making this woman a national hero',
-            'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+            'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
             'thumbnail': r're:https?://.+/.+\.jpg',
-            'timestamp': 1638230731,
-            'upload_date': '20211130',
+            'timestamp': 1638215626,
+            'upload_date': '20211129',
+            'duration': 125,
+        },
+    }, {
+        # video with script id __NEXT_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/uk-68546268',
+        'info_dict': {
+            'id': 'p0hj0lq7',
+            'ext': 'mp4',
+            'title': 'Nasser Hospital doctor describes his treatment by IDF',
+            'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1710188248,
+            'upload_date': '20240311',
+            'duration': 104,
         },
     }, {
         # single video article embedded with data-media-vpid
@@ -821,6 +844,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'uploader': 'Radio 3',
             'uploader_id': 'bbc_radio_three',
         },
+        'skip': '404 Not Found',
     }, {
         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
         'info_dict': {
@@ -828,6 +852,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'ext': 'mp4',
             'title': 'md5:2fabf12a726603193a2879a055f72514',
             'description': 'Learn English words and phrases from this story',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
         },
         'add_ie': [BBCCoUkIE.ie_key()],
     }, {
@@ -836,13 +861,29 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': 'p07c6sb9',
             'ext': 'mp4',
-            'title': 'How positive thinking is harming your happiness',
-            'alt_title': 'The downsides of positive thinking',
-            'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+            'title': 'The downsides of positive thinking',
+            'description': 'The downsides of positive thinking',
             'duration': 235,
-            'thumbnail': r're:https?://.+/p07c9dsr.jpg',
-            'upload_date': '20190604',
-            'categories': ['Psychology'],
+            'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
+            'upload_date': '20220223',
+            'timestamp': 1645632746,
+        },
+    }, {
+        # BBC Sounds
+        'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
+        'info_dict': {
+            'id': 'p0hrw4nr',
+            'ext': 'mp4',
+            'title': 'Are our coastlines being washed away?',
+            'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
+            'timestamp': 1713556800,
+            'upload_date': '20240419',
+            'duration': 1588,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
+            'uploader': 'World Service',
+            'uploader_id': 'bbc_world_service',
+            'series': 'CrowdScience',
+            'chapters': [],
         },
     }, {
         # onion routes
         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
@@ -856,7 +897,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
     def suitable(cls, url):
         EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
- else super(BBCIE, cls).suitable(url)) + else super().suitable(url)) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -968,7 +1009,7 @@ def _real_extract(self, url): if playlist: entry = None for key in ('streaming', 'progressiveDownload'): - playlist_url = playlist.get('%sUrl' % key) + playlist_url = playlist.get(f'{key}Url') if not playlist_url: continue try: @@ -983,7 +1024,7 @@ def _real_extract(self, url): # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + if isinstance(e.cause, HTTPError) and e.cause.status == 500: continue raise if entry: @@ -994,18 +1035,17 @@ def _real_extract(self, url): # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 group_id = self._search_regex( - r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + rf'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\']({self._ID_REGEX})', webpage, 'group id', default=None) if group_id: return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % group_id, - ie=BBCCoUkIE.ie_key()) + f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE) # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, - r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, - r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], + [rf'data-(?:video-player|media)-vpid="({self._ID_REGEX})"', + rf'<param[^>]+name="externalIdentifier"[^>]+value="({self._ID_REGEX})"', + rf'videoId\s*:\s*["\']({self._ID_REGEX})["\']'], webpage, 'vpid', default=None) if programme_id: @@ -1059,76 +1099,133 @@ def _real_extract(self, url): } # Morph based embed (e.g. 
http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one - morph_payload = self._parse_json( - self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', - webpage, 'morph payload', default='{}'), - playlist_id, fatal=False) + # Several setPayload calls may be present but the video(s) + # should be in one that mentions leadMedia or videoData + morph_payload = self._search_json( + r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id, + contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}', + default={}) if morph_payload: - components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] - for component in components: - if not isinstance(component, dict): - continue - lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) - if not lead_media: - continue - identifiers = lead_media.get('identifiers') - if not identifiers or not isinstance(identifiers, dict): - continue - programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + for lead_media in traverse_obj(morph_payload, ( + 'body', 'components', ..., 'props', 'leadMedia', {dict})): + programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any)) if not programme_id: continue - title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) - description = lead_media.get('summary') - uploader = lead_media.get('masterBrand') - uploader_id = lead_media.get('mid') - duration = None - duration_d = lead_media.get('duration') - if isinstance(duration_d, dict): - duration = parse_duration(dict_get( - duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) return { 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'title': lead_media.get('title') or self._og_search_title(webpage), + **traverse_obj(lead_media, { + 'description': ('summary', {str}), + 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}), + 'uploader': ('masterBrand', {str}), + 'uploader_id': ('mid', {str}), + }), 'formats': formats, 'subtitles': subtitles, } + body = self._parse_json(traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'body')), playlist_id, fatal=False) + for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')): + if video_data.get('vpid'): + video_id = video_data['vpid'] + formats, subtitles = self._download_media_selector(video_id) + entry = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + else: + video_id = video_data['pid'] + entry = self.url_result( + f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, + video_id, url_transparent=True) + entry.update({ + 'timestamp': traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}), + ), + **traverse_obj(video_data, { + 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), + 'title': (('title', 'caption'), {str}, any), + 'duration': ('duration', {parse_duration}), + }), + }) + if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): + return entry + entries.append(entry) + if entries: + playlist_title = traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'headline', {str})) or playlist_title + 
return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) - preload_state = self._parse_json(self._search_regex( - r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) - if preload_state: - current_programme = preload_state.get('programmes', {}).get('current') or {} - programme_id = current_programme.get('id') - if current_programme and programme_id and current_programme.get('type') == 'playable_item': - title = current_programme.get('titles', {}).get('tertiary') or playlist_title - formats, subtitles = self._download_media_selector(programme_id) - synopses = current_programme.get('synopses') or {} - network = current_programme.get('network') or {} - duration = int_or_none( - current_programme.get('duration', {}).get('value')) - thumbnail = None - image_url = current_programme.get('image_url') - if image_url: - thumbnail = image_url.replace('{recipe}', 'raw') + # various PRELOADED_STATE JSON + preload_state = self._search_json( + r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage, + 'preload state', playlist_id, transform_source=js_to_json, default={}) + # PRELOADED_STATE with current programmme + current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict})) + programme_id = traverse_obj(current_programme, ('id', {str})) + if programme_id and current_programme.get('type') == 'playable_item': + title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + return { + 'id': programme_id, + 'title': title, + 'formats': formats, + **traverse_obj(current_programme, { + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}), + 'duration': ('duration', 'value', {int_or_none}), + 'uploader': ('network', 'short_title', {str}), + 'uploader_id': ('network', 'id', {str}), + 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any), + 'series': ('titles', 'primary', {str}), + }), + 'subtitles': subtitles, + 'chapters': traverse_obj(preload_state, ( + 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + }), + ), + } + + # PWA_PRELOADED_STATE with article video asset + asset_id = traverse_obj(preload_state, ( + 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id, + 'assetVideo', 0, {str}, any)) + if asset_id: + video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str})) + if video_id: + article = traverse_obj(preload_state, ( + 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any)) + + def image_url(image_id): + return traverse_obj(preload_state, ( + 'entities', 'images', image_id, 'url', + {lambda u: url_or_none(u.replace('$recipe', 'raw'))})) + + formats, subtitles = self._download_media_selector(video_id) return { - 'id': programme_id, - 'title': title, - 'description': dict_get(synopses, ('long', 'medium', 'short')), - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': network.get('short_title'), - 'uploader_id': network.get('id'), + 'id': video_id, + **traverse_obj(preload_state, ('entities', 'videos', asset_id, { + 'title': ('title', 
{str}), + 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any), + 'thumbnail': (0, {image_url}), + 'duration': ('duration', {int_or_none}), + })), 'formats': formats, 'subtitles': subtitles, + 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})), } + else: + return self.url_result( + f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE, + asset_id, playlist_title, display_id=playlist_id, + description=playlist_description) bbc3_config = self._parse_json( self._search_regex( @@ -1174,17 +1271,52 @@ def _real_extract(self, url): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + def parse_model(model): + """Extract single video from model structure""" + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}), + }), + } + + def is_type(*types): + return lambda _, v: v['type'] in types + initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, 'quoted preload state', default=None) if initial_data is None: initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, - 'preload state', default={}) + 'preload state', default='{}') else: initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: + for video_data in traverse_obj(initial_data, ( + 'stores', 'article', 'articleBodyContent', is_type('video'))): + model = traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + entry = parse_model(model) + if entry: + entries.append(entry) + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def parse_media(media): if not media: return @@ -1199,7 +1331,7 @@ def parse_media(media): if blocks: summary = [] for block in blocks: - text = try_get(block, lambda x: x['model']['text'], compat_str) + text = try_get(block, lambda x: x['model']['text'], str) if text: summary.append(text) if summary: @@ -1217,37 +1349,100 @@ def parse_media(media): 'subtitles': subtitles, 'timestamp': item_time, 'description': strip_or_none(item_desc), + 'duration': int_or_none(item.get('duration')), }) - for resp in (initial_data.get('data') or {}).values(): - name = resp.get('name') + + for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])): + name = resp['name'] if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, - (lambda x: x['data']['blocks'], - lambda x: x['data']['content']['model']['blocks'],), - list) or []): - if block.get('type') not in ['media', 'video']: - continue - parse_media(block.get('model')) + for block in traverse_obj(resp, ( + 'data', (None, ('content', 'model')), 'blocks', + is_type('media', 'video'), 'model', {dict})): + 
parse_media(block) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + # extract from SIMORGH_DATA hydration JSON + simorgh_data = self._search_json( + r'window\s*\.\s*SIMORGH_DATA\s*=', webpage, + 'simorgh data', playlist_id, default={}) + if simorgh_data: + done = False + for video_data in traverse_obj(simorgh_data, ( + 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))): + model = traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + if video_data['type'] == 'video': + entry = parse_model(model) + else: # legacyMedia: no duration, subtitles + block_id, entry = traverse_obj(model, ('blockId', {str})), None + media_data = traverse_obj(simorgh_data, ( + 'pageData', 'promo', 'media', + {lambda x: x if x['id'] == block_id else None})) + formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), { + 'url': ('url', {url_or_none}), + 'ext': ('format', {str}), + 'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + })) + if formats: + entry = { + 'id': block_id, + 'display_id': playlist_id, + 'formats': formats, + 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})), + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}), + }), + } + done = True + if entry: + entries.append(entry) + if done: + break + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): - return list(filter(None, map( - lambda s: self._parse_json(s, playlist_id, fatal=False), - re.findall(pattern, webpage)))) + return list(filter(None, ( + self._parse_json(s, playlist_id, fatal=False) + for s in re.findall(pattern, webpage)))) + + # US accessed article with single embedded video (e.g. + # https://www.bbc.com/news/uk-68546268) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), + ('props', 'pageProps', 'page')) + model = traverse_obj(next_data, ( + ..., 'contents', is_type('video'), + 'model', 'blocks', is_type('media'), + 'model', 'blocks', is_type('mediaMetadata'), + 'model', {dict}, any)) + if model and (entry := parse_model(model)): + if not entry.get('timestamp'): + entry['timestamp'] = traverse_obj(next_data, ( + ..., 'contents', is_type('timestamp'), 'model', + 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + entries.append(entry) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX + EMBED_URL = rf'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+{self._ID_REGEX}(?:\b[^"]+)?' 
entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') if embed_url and re.match(EMBED_URL, embed_url): entries.append(embed_url) entries.extend(re.findall( - r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + rf'setPlaylist\("({EMBED_URL})"\)', webpage)) if entries: return self.playlist_result( [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], @@ -1297,11 +1492,11 @@ def extract_all(pattern): video_id = media_meta.get('externalId') if not video_id: - video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + video_id = playlist_id if len(medias) == 1 else f'{playlist_id}-{num}' title = media_meta.get('caption') if not title: - title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + title = playlist_title if len(medias) == 1 else f'{playlist_title} - Video {num}' duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) @@ -1362,8 +1557,8 @@ def _real_extract(self, url): class BBCCoUkPlaylistBaseIE(InfoExtractor): def _entries(self, webpage, url, playlist_id): - single_page = 'page' in compat_urlparse.parse_qs( - compat_urlparse.urlparse(url).query) + single_page = 'page' in urllib.parse.parse_qs( + urllib.parse.urlparse(url).query) for page_num in itertools.count(2): for video_id in re.findall( self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): @@ -1377,8 +1572,8 @@ def _entries(self, webpage, url, playlist_id): if not next_page: break webpage = self._download_webpage( - compat_urlparse.urljoin(url, next_page), playlist_id, - 'Downloading page %d' % page_num, page_num) + urllib.parse.urljoin(url, next_page), playlist_id, + f'Downloading page {page_num}', page_num) def _real_extract(self, url): playlist_id = self._match_id(url) @@ -1393,7 +1588,7 @@ def _real_extract(self, url): class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): - _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX + _VALID_URL_TMPL = rf'https?://(?:www\.)?bbc\.co\.uk/iplayer/%s/(?P<id>{BBCCoUkIE._ID_REGEX})' @staticmethod def _get_default(episode, key, default_key='default'): @@ -1517,11 +1712,11 @@ def _call_api(self, pid, per_page, page=1, series_id=None): variables['sliceId'] = series_id return self._download_json( 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ - 'Content-Type': 'application/json' + 'Content-Type': 'application/json', }, data=json.dumps({ 'id': '5692d93d5aac8d796a0305e895e61551', 'variables': variables, - }).encode('utf-8'))['data']['programme'] + }).encode())['data']['programme'] @staticmethod def _get_playlist_data(data): @@ -1581,7 +1776,7 @@ def _get_episode(element): def _call_api(self, pid, per_page, page=1, series_id=None): return self._download_json( - 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + f'http://ibl.api.bbc.co.uk/ibl/v1/groups/{pid}/episodes', pid, query={ 'page': page, 'per_page': per_page, @@ -1597,7 +1792,7 @@ def _get_playlist_title(self, data): class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): IE_NAME = 'bbc.co.uk:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _VALID_URL = rf'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>{BBCCoUkIE._ID_REGEX})/(?:episodes|broadcasts|clips)' _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' _TESTS = [{ diff --git a/yt_dlp/extractor/beatbump.py 
b/yt_dlp/extractor/beatbump.py index 0f40ebe7ac..777a1b3268 100644 --- a/yt_dlp/extractor/beatbump.py +++ b/yt_dlp/extractor/beatbump.py @@ -3,14 +3,13 @@ class BeatBumpVideoIE(InfoExtractor): - _VALID_URL = r'https://beatbump\.ml/listen\?id=(?P<id>[\w-]+)' + _VALID_URL = r'https?://beatbump\.(?:ml|io)/listen\?id=(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://beatbump.ml/listen?id=MgNrAu2pzNs', 'md5': '5ff3fff41d3935b9810a9731e485fe66', 'info_dict': { 'id': 'MgNrAu2pzNs', 'ext': 'mp4', - 'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', 'artist': 'Stephen', 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp', 'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA', @@ -22,10 +21,9 @@ class BeatBumpVideoIE(InfoExtractor): 'alt_title': 'Voyeur Girl', 'view_count': int, 'track': 'Voyeur Girl', - 'uploader': 'Stephen - Topic', + 'uploader': 'Stephen', 'title': 'Voyeur Girl', 'channel_follower_count': int, - 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', 'age_limit': 0, 'availability': 'public', 'live_status': 'not_live', @@ -36,7 +34,12 @@ class BeatBumpVideoIE(InfoExtractor): 'tags': 'count:11', 'creator': 'Stephen', 'channel_id': 'UC-pWHpBjdGG69N9mM2auIAA', - } + 'channel_is_verified': True, + 'heatmap': 'count:100', + }, + }, { + 'url': 'https://beatbump.io/listen?id=LDGZAprNGWo', + 'only_matching': True, }] def _real_extract(self, url): @@ -45,7 +48,7 @@ def _real_extract(self, url): class BeatBumpPlaylistIE(InfoExtractor): - _VALID_URL = r'https://beatbump\.ml/(?:release\?id=|artist/|playlist/)(?P<id>[\w-]+)' + _VALID_URL = r'https?://beatbump\.(?:ml|io)/(?:release\?id=|artist/|playlist/)(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://beatbump.ml/release?id=MPREb_gTAcphH99wE', 'playlist_count': 50, @@ -56,25 +59,28 @@ class BeatBumpPlaylistIE(InfoExtractor): 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', 'description': '', 'tags': [], - 'modified_date': '20221223', - } + 'modified_date': '20231110', + }, + 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'url': 'https://beatbump.ml/artist/UC_aEa8K-EOJ3D6gOs7HcyNg', 'playlist_mincount': 1, 'params': {'flatplaylist': True}, 'info_dict': { 'id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'uploader_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_id': '@NoCopyrightSounds', 'channel_follower_count': int, - 'title': 'NoCopyrightSounds - Videos', + 'title': 'NoCopyrightSounds', 'uploader': 'NoCopyrightSounds', 'description': 'md5:cd4fd53d81d363d05eee6c1b478b491a', 'channel': 'NoCopyrightSounds', - 'tags': 'count:12', + 'tags': 'count:65', 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'channel_is_verified': True, }, + 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'url': 'https://beatbump.ml/playlist/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'playlist_mincount': 1, @@ -84,16 +90,20 @@ class BeatBumpPlaylistIE(InfoExtractor): 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', 'view_count': int, - 'channel_url': 'https://www.youtube.com/@NoCopyrightSounds', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', + 'uploader_id': '@NoCopyrightSounds', 'title': 
'NCS : All Releases 💿', 'uploader': 'NoCopyrightSounds', 'availability': 'public', 'channel': 'NoCopyrightSounds', 'tags': [], - 'modified_date': '20221225', + 'modified_date': '20231112', 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - } + }, + 'expected_warnings': ['YouTube Music is not directly supported'], + }, { + 'url': 'https://beatbump.io/playlist/VLPLFCHGavqRG-q_2ZhmgU2XB2--ZY6irT1c', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/beatport.py b/yt_dlp/extractor/beatport.py index 0aecbd089d..acc8d12595 100644 --- a/yt_dlp/extractor/beatport.py +++ b/yt_dlp/extractor/beatport.py @@ -1,7 +1,6 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import int_or_none @@ -33,7 +32,7 @@ class BeatportIE(InfoExtractor): 'display_id': 'birds-original-mix', 'ext': 'mp4', 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", - } + }, }] def _real_extract(self, url): @@ -51,7 +50,7 @@ def _real_extract(self, url): track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) - title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] + title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name'] if track['mix']: title += ' (' + track['mix'] + ')' @@ -89,7 +88,7 @@ def _real_extract(self, url): images.append(image) return { - 'id': compat_str(track.get('id')) or track_id, + 'id': str(track.get('id')) or track_id, 'display_id': track.get('slug') or display_id, 'title': title, 'formats': formats, diff --git a/yt_dlp/extractor/beeg.py b/yt_dlp/extractor/beeg.py index 52ee68eca7..960cdfabdd 100644 --- a/yt_dlp/extractor/beeg.py +++ b/yt_dlp/extractor/beeg.py @@ -1,7 +1,7 @@ from .common import InfoExtractor - from ..utils import ( int_or_none, + str_or_none, traverse_obj, try_get, unified_timestamp, @@ -22,8 +22,8 @@ class BeegIE(InfoExtractor): 'age_limit': 18, 'upload_date': '20220131', 'timestamp': 1643656455, - 'display_id': 2540839, - } + 'display_id': '2540839', + }, }, { 'url': 'https://beeg.com/-0599050563103750?t=4-861', 'md5': 'bd8b5ea75134f7f07fad63008db2060e', @@ -36,9 +36,9 @@ class BeegIE(InfoExtractor): 'age_limit': 18, 'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9', 'timestamp': 1643623200, - 'display_id': 2569965, + 'display_id': '2569965', 'upload_date': '20220131', - } + }, }, { # api/v6 v2 'url': 'https://beeg.com/1941093077?t=911-1391', @@ -55,8 +55,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) video = self._download_json( - 'https://store.externulls.com/facts/file/%s' % video_id, - video_id, 'Downloading JSON for %s' % video_id) + f'https://store.externulls.com/facts/file/{video_id}', + video_id, f'Downloading JSON for {video_id}') fc_facts = video.get('fc_facts') first_fact = {} @@ -78,7 +78,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'display_id': first_fact.get('id'), + 'display_id': str_or_none(first_fact.get('id')), 'title': traverse_obj(video, ('file', 'stuff', 'sf_name')), 'description': traverse_obj(video, ('file', 'stuff', 'sf_story')), 'timestamp': unified_timestamp(first_fact.get('fc_created')), diff --git a/yt_dlp/extractor/behindkink.py b/yt_dlp/extractor/behindkink.py index ca4498150e..45f45d03ba 100644 --- a/yt_dlp/extractor/behindkink.py +++ b/yt_dlp/extractor/behindkink.py @@ -3,6 +3,7 @@ class BehindKinkIE(InfoExtractor): + _WORKING = False _VALID_URL = 
r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', @@ -15,7 +16,7 @@ class BehindKinkIE(InfoExtractor): 'upload_date': '20141205', 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg', 'age_limit': 18, - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/bellmedia.py b/yt_dlp/extractor/bellmedia.py index 5ae4b917ac..ac45dd4779 100644 --- a/yt_dlp/extractor/bellmedia.py +++ b/yt_dlp/extractor/bellmedia.py @@ -32,7 +32,7 @@ class BellMediaIE(InfoExtractor): 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', 'upload_date': '20180525', 'timestamp': 1527288600, - 'season_id': 73997, + 'season_id': '73997', 'season': '2018', 'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg', 'tags': [], @@ -86,6 +86,6 @@ def _real_extract(self, url): return { '_type': 'url_transparent', 'id': video_id, - 'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id), + 'url': f'9c9media:{self._DOMAINS.get(domain, domain)}_web:{video_id}', 'ie_key': 'NineCNineMedia', } diff --git a/yt_dlp/extractor/berufetv.py b/yt_dlp/extractor/berufetv.py index 8160cbd9a7..5bba33a44c 100644 --- a/yt_dlp/extractor/berufetv.py +++ b/yt_dlp/extractor/berufetv.py @@ -16,7 +16,7 @@ class BerufeTVIE(InfoExtractor): 'tags': ['Studienfilm'], 'duration': 602.440, 'thumbnail': r're:^https://asset-out-cdn\.video-cdn\.net/private/videos/DvKC3DUpMKvUZ_6fEnfg3u/thumbnails/793063\?quality=thumbnail&__token__=[^\s]+$', - } + }, }] def _real_extract(self, url): @@ -54,7 +54,7 @@ def _real_extract(self, url): subtitles.setdefault(track['language'], []).append({ 'url': track['source'], 'name': track.get('label'), - 'ext': 'vtt' + 'ext': 'vtt', }) return { diff --git a/yt_dlp/extractor/bet.py b/yt_dlp/extractor/bet.py index 6b867d135f..3a8e743092 100644 --- a/yt_dlp/extractor/bet.py +++ b/yt_dlp/extractor/bet.py @@ -1,10 +1,9 @@ from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate -# TODO Remove - Reason: Outdated Site - class BetIE(MTVServicesInfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [ { @@ -20,7 +19,7 @@ class BetIE(MTVServicesInfoExtractor): 'thumbnail': r're:(?i)^https?://.*\.jpg$', 'subtitles': { 'en': 'mincount:2', - } + }, }, 'params': { # rtmp download @@ -40,16 +39,16 @@ class BetIE(MTVServicesInfoExtractor): 'thumbnail': r're:(?i)^https?://.*\.jpg$', 'subtitles': { 'en': 'mincount:2', - } + }, }, 'params': { # rtmp download 'skip_download': True, }, - } + }, ] - _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/bet-mrss-player' def _get_feed_query(self, uri): return { diff --git a/yt_dlp/extractor/bfi.py b/yt_dlp/extractor/bfi.py index 76f0516a4d..a6ebfedffd 100644 --- a/yt_dlp/extractor/bfi.py +++ b/yt_dlp/extractor/bfi.py @@ -5,6 +5,7 @@ class BFIPlayerIE(InfoExtractor): + _WORKING = False IE_NAME = 'bfi:player' _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online' _TEST = { diff --git a/yt_dlp/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py index a7be0e67de..87f011783b 100644 --- a/yt_dlp/extractor/bfmtv.py +++ b/yt_dlp/extractor/bfmtv.py @@ -7,7 +7,7 @@ class BFMTVBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/' 
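# --- illustrative sketch, not part of the patch -----------------------------
# The recurring mechanical change in the hunks above swaps printf-style
# formatting for f-strings; behaviour is identical. The sample values below
# are invented, the format string is the one from BellMediaIE.
domain, video_id = 'ctv', '1403070'
assert ('9c9media:%s_web:%s' % (domain, video_id)
        == f'9c9media:{domain}_web:{video_id}'
        == '9c9media:ctv_web:1403070')
# -----------------------------------------------------------------------------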
_VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html' - _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)' + _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>)' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _brightcove_url_result(self, video_id, video_block): @@ -55,8 +55,11 @@ class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE 'ext': 'mp4', 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'uploader_id': '876450610001', - 'upload_date': '20171018', - 'timestamp': 1508329950, + 'upload_date': '20220926', + 'timestamp': 1664207191, + 'live_status': 'is_live', + 'thumbnail': r're:https://.+/image\.jpg', + 'tags': [], }, 'params': { 'skip_download': True, @@ -90,14 +93,13 @@ class BFMTVArticleIE(BFMTVBaseIE): 'id': '6318445464112', 'ext': 'mp4', 'title': 'Le plein de bioéthanol fait de plus en plus mal à la pompe', - 'description': None, 'uploader_id': '876630703001', 'upload_date': '20230110', 'timestamp': 1673341692, 'duration': 109.269, 'tags': ['rmc', 'show', 'apolline de malherbe', 'info', 'talk', 'matinale', 'radio'], - 'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876630703001/5bef74b8-9d5e-4480-a21f-60c2e2480c46/96c88b74-f9db-45e1-8040-e199c5da216c/1920x1080/match/image.jpg' - } + 'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876630703001/5bef74b8-9d5e-4480-a21f-60c2e2480c46/96c88b74-f9db-45e1-8040-e199c5da216c/1920x1080/match/image.jpg', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index fd20aadad4..666b51c56a 100644 --- a/yt_dlp/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py @@ -1,27 +1,197 @@ +import functools + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + js_to_json, + orderedSet, + parse_iso8601, + traverse_obj, + url_or_none, +) -class BibelTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', - 'md5': '252f908192d611de038b8504b08bf97f', - 'info_dict': { - 'id': 'ref:329703', - 'ext': 'mp4', - 'title': 'Sprachkurs in Malaiisch', - 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', - 'timestamp': 1608316701, - 'uploader_id': '5840105145001', - 'upload_date': '20201218', +class BibelTVBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['AT', 'CH', 'DE'] + _GEO_BYPASS = False + + API_URL = 'https://www.bibeltv.de/mediathek/api' + AUTH_TOKEN = 'j88bRXY8DsEqJ9xmTdWhrByVi5Hm' + + def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False): + formats = [] + subtitles = {} + for media_url in traverse_obj(data, (..., 'src', {url_or_none})): + media_ext = determine_ext(media_url) + if media_ext == 'm3u8': + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + media_url, crn_id, live=is_live) + formats.extend(m3u8_formats) + subtitles.update(m3u8_subs) + elif media_ext == 'mpd': + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id) + formats.extend(mpd_formats) + subtitles.update(mpd_subs) + elif media_ext == 'mp4': + formats.append({'url': media_url}) + else: + self.report_warning(f'Unknown format {media_ext!r}') + + return formats, subtitles + + @staticmethod + def _extract_base_info(data): + return { + 'id': data['crn'], + 
**traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), + 'timestamp': ('schedulingStart', {parse_iso8601}), + 'season_number': 'seasonNumber', + 'episode_number': 'episodeNumber', + 'view_count': 'viewCount', + 'like_count': 'likeCount', + }), + 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., { + 'url': ('url', {url_or_none}), + }))), } - }, { - 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', - 'only_matching': True, + + def _extract_url_info(self, data): + return { + '_type': 'url', + 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'), + **self._extract_base_info(data), + } + + def _extract_video_info(self, data): + crn_id = data['crn'] + + if data.get('drm'): + self.report_drm(crn_id) + + json_data = self._download_json( + format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id, + headers={'Authorization': self.AUTH_TOKEN}, fatal=False, + errnote='No formats available') or {} + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id) + + return { + '_type': 'video', + **self._extract_base_info(data), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BibelTVVideoIE(BibelTVBaseIE): + IE_DESC = 'BibelTV single video' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P<id>\d+)[\w-]+' + IE_NAME = 'bibeltv:video' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/videos/344436-alte-wege', + 'md5': 'ec1c07efe54353780512e8a4103b612e', + 'info_dict': { + 'id': '344436', + 'ext': 'mp4', + 'title': 'Alte Wege', + 'description': 'md5:2f4eb7294c9797a47b8fd13cccca22e9', + 'timestamp': 1677877071, + 'duration': 150.0, + 'upload_date': '20230303', + 'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg', + 'episode': 'Episode 1', + 'episode_number': 1, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'format': '6', + }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): crn_id = self._match_id(url) - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') + video_data = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id), + ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict})) + if not video_data: + raise ExtractorError('Missing video data.') + + return self._extract_video_info(video_data) + + +class BibelTVSeriesIE(BibelTVBaseIE): + IE_DESC = 'BibelTV series playlist' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P<id>\d+)[\w-]+' + IE_NAME = 'bibeltv:series' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag', + 'playlist_mincount': 400, + 'info_dict': { + 'id': '333485', + 'title': 'Ein Wunder für jeden Tag', + 'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + webpage = self._download_webpage(url, crn_id) + nextjs_data = self._search_nextjs_data(webpage, crn_id) + series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict})) + if not series_data: + raise ExtractorError('Missing series data.') + + return self.playlist_result( + traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})), + crn_id, series_data.get('title'), clean_html(series_data.get('description'))) + + 
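# --- hypothetical usage sketch, not part of the patch ------------------------
# How the new bibeltv:series extractor above behaves from the embedding API;
# the URL comes from its test case. With flat extraction, each entry stays the
# url-result built by _extract_url_info() and is only resolved through
# bibeltv:video when actually downloaded.
import yt_dlp

with yt_dlp.YoutubeDL({'extract_flat': 'in_playlist', 'quiet': True}) as ydl:
    info = ydl.extract_info(
        'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag',
        download=False)
print(info['id'], info['title'], len(info['entries']))
# expected per the test: id 333485, title 'Ein Wunder für jeden Tag',
# 400+ entries (playlist_mincount)
# -----------------------------------------------------------------------------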
+class BibelTVLiveIE(BibelTVBaseIE): + IE_DESC = 'BibelTV live program' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P<id>[\w-]+)' + IE_NAME = 'bibeltv:live' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/livestreams/bibeltv/', + 'info_dict': { + 'id': 'bibeltv', + 'ext': 'mp4', + 'title': 're:Bibel TV', + 'live_status': 'is_live', + 'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bibeltv.de/livestreams/impuls/', + 'only_matching': True, + }] + + def _real_extract(self, url): + stream_id = self._match_id(url) + webpage = self._download_webpage(url, stream_id) + stream_data = self._search_json( + r'\\"video\\":', webpage, 'bibeltvData', stream_id, + transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"'))) + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True) + + return { + 'id': stream_id, + 'title': stream_data.get('title'), + 'thumbnail': stream_data.get('poster'), + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/bigflix.py b/yt_dlp/extractor/bigflix.py index 02d1ba0e3f..9c55bb9682 100644 --- a/yt_dlp/extractor/bigflix.py +++ b/yt_dlp/extractor/bigflix.py @@ -1,10 +1,8 @@ +import base64 import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, -) class BigflixIE(InfoExtractor): @@ -21,7 +19,7 @@ class BigflixIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { # multiple formats 'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967', @@ -38,7 +36,7 @@ def _real_extract(self, url): webpage, 'title') def decode_url(quoted_b64_url): - return compat_b64decode(compat_urllib_parse_unquote( + return base64.b64decode(urllib.parse.unquote( quoted_b64_url)).decode('utf-8') formats = [] @@ -47,7 +45,7 @@ def decode_url(quoted_b64_url): video_url = decode_url(encoded_url) f = { 'url': video_url, - 'format_id': '%sp' % height, + 'format_id': f'{height}p', 'height': int(height), } if video_url.startswith('rtmp'): @@ -69,5 +67,5 @@ def decode_url(quoted_b64_url): 'id': video_id, 'title': title, 'description': description, - 'formats': formats + 'formats': formats, } diff --git a/yt_dlp/extractor/bigo.py b/yt_dlp/extractor/bigo.py index 1cb6e58be6..b1c230f357 100644 --- a/yt_dlp/extractor/bigo.py +++ b/yt_dlp/extractor/bigo.py @@ -29,13 +29,14 @@ def _real_extract(self, url): info_raw = self._download_json( 'https://ta.bigo.tv/official_website/studio/getInternalStudioInfo', - user_id, data=urlencode_postdata({'siteId': user_id})) + user_id, data=urlencode_postdata({'siteId': user_id}), + headers={'Accept': 'application/json'}) if not isinstance(info_raw, dict): raise ExtractorError('Received invalid JSON data') if info_raw.get('code'): raise ExtractorError( - 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + 'Bigo says: {} (code {})'.format(info_raw.get('msg'), info_raw.get('code')), expected=True) info = info_raw.get('data') or {} if not info.get('alive'): diff --git a/yt_dlp/extractor/bild.py b/yt_dlp/extractor/bild.py index f3dea33c46..2ba63700c6 100644 --- a/yt_dlp/extractor/bild.py +++ b/yt_dlp/extractor/bild.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + traverse_obj, unescapeHTML, ) @@ -8,7 +9,8 @@ class BildIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html' IE_DESC = 'Bild.de' - _TEST = { + _TESTS = [{ + 'note': 'static MP4 only', 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', 'md5': 'dd495cbd99f2413502a1713a1156ac8a', 'info_dict': { @@ -18,8 +20,20 @@ class BildIE(InfoExtractor): 'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 196, - } - } + }, + }, { + 'note': 'static MP4 and HLS', + 'url': 'https://www.bild.de/video/clip/news-ausland/deftiger-abgang-vom-10m-turm-bademeister-sorgt-fuer-skandal-85158620.bild.html', + 'md5': 'fb0ed4f09c495d4ba7ce2eee0bb90de1', + 'info_dict': { + 'id': '85158620', + 'ext': 'mp4', + 'title': 'Der Sprungturm-Skandal', + 'description': 'md5:709b543c24dc31bbbffee73bccda34ad', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 69, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -27,11 +41,23 @@ def _real_extract(self, url): video_data = self._download_json( url.split('.bild.html')[0] + ',view=json.bild.html', video_id) + formats = [] + for src in traverse_obj(video_data, ('clipList', 0, 'srces', lambda _, v: v['src'])): + src_type = src.get('type') + if src_type == 'application/x-mpegURL': + formats.extend( + self._extract_m3u8_formats( + src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif src_type == 'video/mp4': + formats.append({'url': src['src'], 'format_id': 'http-mp4'}) + else: + self.report_warning(f'Skipping unsupported format type: "{src_type}"') + return { 'id': video_id, 'title': unescapeHTML(video_data['title']).strip(), 'description': unescapeHTML(video_data.get('description')), - 'url': video_data['clipList'][0]['srces'][0]['src'], + 'formats': formats, 'thumbnail': video_data.get('poster'), 'duration': int_or_none(video_data.get('durationSec')), } diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index f4180633ab..3163df8ab7 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1,55 +1,88 @@ import base64 import functools +import hashlib import itertools +import json import math -import urllib.error +import re +import time import urllib.parse +import uuid from .common import InfoExtractor, SearchInfoExtractor from ..dependencies import Cryptodome +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, InAdvancePagedList, OnDemandPagedList, + bool_or_none, + clean_html, + determine_ext, filter_dict, float_or_none, format_field, + get_element_by_class, int_or_none, + join_nonempty, make_archive_id, merge_dicts, mimetype2ext, parse_count, parse_qs, + parse_resolution, qualities, smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, + unified_timestamp, unsmuggle_url, url_or_none, urlencode_postdata, + variadic, ) class BilibiliBaseIE(InfoExtractor): + _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?') + _WBI_KEY_CACHE_TIMEOUT = 30 # exact expire timeout is unclear, use 30s for one session + _wbi_key_cache = {} + + @property + def is_logged_in(self): + return bool(self._get_cookies('https://api.bilibili.com').get('SESSDATA')) + + def _check_missing_formats(self, play_info, formats): + parsed_qualities = set(traverse_obj(formats, (..., 'quality'))) + missing_formats = join_nonempty(*[ + traverse_obj(fmt, 'new_description', 'display_desc', 'quality') + for fmt in traverse_obj(play_info, ( + 'support_formats', lambda _, v: v['quality'] not in 
parsed_qualities))], delim=', ') + if missing_formats: + self.to_screen( + f'Format(s) {missing_formats} are missing; you have to login or ' + f'become a premium member to download them. {self._login_hint()}') + def extract_formats(self, play_info): format_names = { r['quality']: traverse_obj(r, 'new_description', 'display_desc') for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality'])) } - audios = traverse_obj(play_info, ('dash', 'audio', ...)) + audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict})) flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio')) if flac_audio: audios.append(flac_audio) formats = [{ 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'), 'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')), - 'acodec': audio.get('codecs'), + 'acodec': traverse_obj(audio, ('codecs', {str.lower})), 'vcodec': 'none', 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), - 'filesize': int_or_none(audio.get('size')) + 'filesize': int_or_none(audio.get('size')), + 'format_id': str_or_none(audio.get('id')), } for audio in audios] formats.extend({ @@ -60,19 +93,86 @@ def extract_formats(self, play_info): 'height': int_or_none(video.get('height')), 'vcodec': video.get('codecs'), 'acodec': 'none' if audios else None, + 'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))), 'tbr': float_or_none(video.get('bandwidth'), scale=1000), 'filesize': int_or_none(video.get('size')), 'quality': int_or_none(video.get('id')), + 'format_id': traverse_obj( + video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1), + ('id', {str_or_none}), get_all=False), 'format': format_names.get(video.get('id')), } for video in traverse_obj(play_info, ('dash', 'video', ...))) - missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality'))) - if missing_formats: - self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; ' - f'you have to login or become premium member to download them. 
{self._login_hint()}') + if formats: + self._check_missing_formats(play_info, formats) + fragments = traverse_obj(play_info, ('durl', lambda _, v: url_or_none(v['url']), { + 'url': ('url', {url_or_none}), + 'duration': ('length', {functools.partial(float_or_none, scale=1000)}), + 'filesize': ('size', {int_or_none}), + })) + if fragments: + formats.append({ + 'url': fragments[0]['url'], + 'filesize': sum(traverse_obj(fragments, (..., 'filesize'))), + **({ + 'fragments': fragments, + 'protocol': 'http_dash_segments', + } if len(fragments) > 1 else {}), + **traverse_obj(play_info, { + 'quality': ('quality', {int_or_none}), + 'format_id': ('quality', {str_or_none}), + 'format_note': ('quality', {lambda x: format_names.get(x)}), + 'duration': ('timelength', {functools.partial(float_or_none, scale=1000)}), + }), + **parse_resolution(format_names.get(play_info.get('quality'))), + }) return formats + def _get_wbi_key(self, video_id): + if time.time() < self._wbi_key_cache.get('ts', 0) + self._WBI_KEY_CACHE_TIMEOUT: + return self._wbi_key_cache['key'] + + session_data = self._download_json( + 'https://api.bilibili.com/x/web-interface/nav', video_id, note='Downloading wbi sign') + + lookup = ''.join(traverse_obj(session_data, ( + 'data', 'wbi_img', ('img_url', 'sub_url'), + {lambda x: x.rpartition('/')[2].partition('.')[0]}))) + + # from getMixinKey() in the vendor js + mixin_key_enc_tab = [ + 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, + 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, + 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, + 36, 20, 34, 44, 52, + ] + + self._wbi_key_cache.update({ + 'key': ''.join(lookup[i] for i in mixin_key_enc_tab)[:32], + 'ts': time.time(), + }) + return self._wbi_key_cache['key'] + + def _sign_wbi(self, params, video_id): + params['wts'] = round(time.time()) + params = { + k: ''.join(filter(lambda char: char not in "!'()*", str(v))) + for k, v in sorted(params.items()) + } + query = urllib.parse.urlencode(params) + params['w_rid'] = hashlib.md5(f'{query}{self._get_wbi_key(video_id)}'.encode()).hexdigest() + return params + + def _download_playinfo(self, bvid, cid, headers=None, qn=None): + params = {'bvid': bvid, 'cid': cid, 'fnval': 4048} + if qn: + params['qn'] = qn + return self._download_json( + 'https://api.bilibili.com/x/player/wbi/playurl', bvid, + query=self._sign_wbi(params, bvid), headers=headers, + note=f'Downloading video formats for cid {cid} {qn or ""}')['data'] + def json2srt(self, json_data): srt_data = '' for idx, line in enumerate(json_data.get('body') or []): @@ -81,18 +181,26 @@ def json2srt(self, json_data): f'{line["content"]}\n\n') return srt_data - def _get_subtitles(self, video_id, initial_state, cid): + def _get_subtitles(self, video_id, cid, aid=None): subtitles = { 'danmaku': [{ 'ext': 'xml', 'url': f'https://comment.bilibili.com/{cid}.xml', - }] + }], } - for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []: + video_info = self._download_json( + 'https://api.bilibili.com/x/player/v2', video_id, + query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, + note=f'Extracting subtitle info {cid}') + if traverse_obj(video_info, ('data', 'need_login_subtitle')): + self.report_warning( + f'Subtitles are only available when logged in. 
{self._login_hint()}', only_once=True) + for s in traverse_obj(video_info, ( + 'data', 'subtitle', 'subtitles', lambda _, v: v['subtitle_url'] and v['lan'])): subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', - 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) + 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)), }) return subtitles @@ -130,9 +238,67 @@ def _get_all_children(self, reply): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children + def _get_episodes_from_season(self, ss_id, url): + season_info = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', ss_id, + note='Downloading season info', query={'season_id': ss_id}, + headers={'Referer': url, **self.geo_verification_headers()}) + + for entry in traverse_obj(season_info, ( + 'result', 'main_section', 'episodes', + lambda _, v: url_or_none(v['share_url']) and v['id'])): + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id'))) + + def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges=None): + cid_edges = cid_edges or {} + division_data = self._download_json( + 'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id, + query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id}, + note=f'Extracting divisions from edge {edge_id}') + edges.setdefault(edge_id, {}).update( + traverse_obj(division_data, ('data', 'story_list', lambda _, v: v['edge_id'] == edge_id, { + 'title': ('title', {str}), + 'cid': ('cid', {int_or_none}), + }), get_all=False)) + + edges[edge_id].update(traverse_obj(division_data, ('data', { + 'title': ('title', {str}), + 'choices': ('edges', 'questions', ..., 'choices', ..., { + 'edge_id': ('id', {int_or_none}), + 'cid': ('cid', {int_or_none}), + 'text': ('option', {str}), + }), + }))) + # use dict to combine edges that use the same video section (same cid) + cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id] + for choice in traverse_obj(edges, (edge_id, 'choices', ...)): + if choice['edge_id'] not in edges: + edges[choice['edge_id']] = {'cid': choice['cid']} + self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges) + return cid_edges + + def _get_interactive_entries(self, video_id, cid, metainfo, headers=None): + graph_version = traverse_obj( + self._download_json( + 'https://api.bilibili.com/x/player/wbi/v2', video_id, + 'Extracting graph version', query={'bvid': video_id, 'cid': cid}, headers=headers), + ('data', 'interaction', 'graph_version', {int_or_none})) + cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1) + for cid, edges in cid_edges.items(): + play_info = self._download_playinfo(video_id, cid, headers=headers) + yield { + **metainfo, + 'id': f'{video_id}_{cid}', + 'title': f'{metainfo.get("title")} - {next(iter(edges.values())).get("title")}', + 'formats': self.extract_formats(play_info), + 'description': f'{json.dumps(edges, ensure_ascii=False)}\n{metainfo.get("description", "")}', + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'subtitles': self.extract_subtitles(video_id, cid), + } + class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/[^/?#]+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -151,17 +317,17 
@@ class BiliBiliIE(BilibiliBaseIE): 'timestamp': 1488353834, 'like_count': int, 'view_count': int, + '_old_archive_ids': ['bilibili 8903802_part1'], }, }, { - # old av URL version + 'note': 'old av URL version', 'url': 'http://www.bilibili.com/video/av1074402/', 'info_dict': { - 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', + 'id': 'BV11x411K7CN', 'ext': 'mp4', + 'title': '【金坷垃】金泡沫', 'uploader': '菊子桑', 'uploader_id': '156160', - 'id': 'BV11x411K7CN', - 'title': '【金坷垃】金泡沫', 'duration': 308.36, 'upload_date': '20140420', 'timestamp': 1397983878, @@ -170,6 +336,8 @@ class BiliBiliIE(BilibiliBaseIE): 'comment_count': int, 'view_count': int, 'tags': list, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', + '_old_archive_ids': ['bilibili 1074402_part1'], }, 'params': {'skip_download': True}, }, { @@ -177,7 +345,7 @@ class BiliBiliIE(BilibiliBaseIE): 'url': 'https://www.bilibili.com/video/BV1bK411W797', 'info_dict': { 'id': 'BV1bK411W797', - 'title': '物语中的人物是如何吐槽自己的OP的' + 'title': '物语中的人物是如何吐槽自己的OP的', }, 'playlist_count': 18, 'playlist': [{ @@ -185,7 +353,7 @@ class BiliBiliIE(BilibiliBaseIE): 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', - 'tags': 'count:11', + 'tags': 'count:10', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', @@ -196,8 +364,9 @@ class BiliBiliIE(BilibiliBaseIE): 'view_count': int, 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'duration': 90.314, - } - }] + '_old_archive_ids': ['bilibili 498159642_part1'], + }, + }], }, { 'note': 'Specific page of Anthology', 'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1', @@ -205,7 +374,7 @@ class BiliBiliIE(BilibiliBaseIE): 'id': 'BV1bK411W797_p1', 'ext': 'mp4', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', - 'tags': 'count:11', + 'tags': 'count:10', 'timestamp': 1589601697, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'uploader': '打牌还是打桩', @@ -216,28 +385,8 @@ class BiliBiliIE(BilibiliBaseIE): 'view_count': int, 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'duration': 90.314, - } - }, { - 'note': 'video has subtitles', - 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh', - 'info_dict': { - 'id': 'BV12N4y1M7rh', - 'ext': 'mp4', - 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1', - 'tags': list, - 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', - 'duration': 313.557, - 'upload_date': '20220709', - 'uploader': '小夫Tech', - 'timestamp': 1657347907, - 'uploader_id': '1326814124', - 'comment_count': int, - 'view_count': int, - 'like_count': int, - 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', - 'subtitles': 'count:2' + '_old_archive_ids': ['bilibili 498159642_part1'], }, - 'params': {'listsubtitles': True}, }, { 'url': 'https://www.bilibili.com/video/av8903802/', 'info_dict': { @@ -255,6 +404,7 @@ class BiliBiliIE(BilibiliBaseIE): 'comment_count': int, 'view_count': int, 'like_count': int, + '_old_archive_ids': ['bilibili 8903802_part1'], }, 'params': { 'skip_download': True, @@ -278,25 +428,249 @@ class BiliBiliIE(BilibiliBaseIE): 'view_count': int, 'like_count': int, 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 463665680_part1'], }, 'params': {'skip_download': True}, + }, { + 'note': 'video redirects to festival page', + 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', + 'info_dict': { + 'id': 'BV1wP4y1P72h', + 'ext': 'mp4', + 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', + 'timestamp': 1643947497, + 'upload_date': '20220204', + 
'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', + 'uploader': '叨叨冯聊音乐', + 'duration': 246.719, + 'uploader_id': '528182630', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 893839363_part1'], + }, + }, { + 'note': 'newer festival video', + 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', + 'info_dict': { + 'id': 'BV1ay4y1d77f', + 'ext': 'mp4', + 'title': '【崩坏3新春剧场】为特别的你送上祝福!', + 'timestamp': 1674273600, + 'upload_date': '20230121', + 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', + 'uploader': '果蝇轰', + 'duration': 1111.722, + 'uploader_id': '8469526', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 778246196_part1'], + }, + }, { + 'note': 'legacy flv/mp4 video', + 'url': 'https://www.bilibili.com/video/BV1ms411Q7vw/?p=4', + 'info_dict': { + 'id': 'BV1ms411Q7vw_p4', + 'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛', + 'timestamp': 1458222815, + 'upload_date': '20160317', + 'description': '云南方言快乐生产线出品', + 'duration': float, + 'uploader': '一笑颠天', + 'uploader_id': '3916081', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'tags': list, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 4120229_part4'], + }, + 'params': {'extractor_args': {'bilibili': {'prefer_multi_flv': ['32']}}}, + 'playlist_count': 19, + 'playlist': [{ + 'info_dict': { + 'id': 'BV1ms411Q7vw_p4_0', + 'ext': 'flv', + 'title': '[搞笑]【动画】云南方言快乐生产线出品 p04 新烧包谷之漫游桃花岛', + 'duration': 399.102, + }, + }], + }, { + 'note': 'legacy mp4-only video', + 'url': 'https://www.bilibili.com/video/BV1nx411u79K', + 'info_dict': { + 'id': 'BV1nx411u79K', + 'ext': 'mp4', + 'title': '【练习室】201603声乐练习《No Air》with VigoVan', + 'timestamp': 1508893551, + 'upload_date': '20171025', + 'description': '@ZERO-G伯远\n声乐练习 《No Air》with Vigo Van', + 'duration': 80.384, + 'uploader': '伯远', + 'uploader_id': '10584494', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'tags': list, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 15700301_part1'], + }, + }, { + 'note': 'interactive/split-path video', + 'url': 'https://www.bilibili.com/video/BV1af4y1H7ga/', + 'info_dict': { + 'id': 'BV1af4y1H7ga', + 'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!!', + 'timestamp': 1630500414, + 'upload_date': '20210901', + 'description': 'md5:01113e39ab06e28042d74ac356a08786', + 'tags': list, + 'uploader': '钉宫妮妮Ninico', + 'duration': 1503, + 'uploader_id': '8881297', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 292734508_part1'], + }, + 'playlist_count': 33, + 'playlist': [{ + 'info_dict': { + 'id': 'BV1af4y1H7ga_400950101', + 'ext': 'mp4', + 'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!! 
- 听见猫猫叫~', + 'timestamp': 1630500414, + 'upload_date': '20210901', + 'description': 'md5:db66ac7a2813a94b8291dbce990cc5b2', + 'tags': list, + 'uploader': '钉宫妮妮Ninico', + 'duration': 11.605, + 'uploader_id': '8881297', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 292734508_part1'], + }, + }], + }, { + 'note': '301 redirect to bangumi link', + 'url': 'https://www.bilibili.com/video/BV1TE411f7f1', + 'info_dict': { + 'id': '288525', + 'title': '李永乐老师 钱学森弹道和乘波体飞行器是什么?', + 'ext': 'mp4', + 'series': '我和我的祖国', + 'series_id': '4780', + 'season': '幕后纪实', + 'season_id': '28609', + 'season_number': 1, + 'episode': '钱学森弹道和乘波体飞行器是什么?', + 'episode_id': '288525', + 'episode_number': 105, + 'duration': 1183.957, + 'timestamp': 1571648124, + 'upload_date': '20191021', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { + 'note': 'video has subtitles, which requires login', + 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh', + 'info_dict': { + 'id': 'BV12N4y1M7rh', + 'ext': 'mp4', + 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1', + 'tags': list, + 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', + 'duration': 313.557, + 'upload_date': '20220709', + 'uploader': '小夫太渴', + 'timestamp': 1657347907, + 'uploader_id': '1326814124', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'subtitles': 'count:2', # login required for CC subtitle + '_old_archive_ids': ['bilibili 898179753_part1'], + }, + 'params': {'listsubtitles': True}, + 'skip': 'login required for subtitle', + }, { + 'url': 'https://www.bilibili.com/video/BV1jL41167ZG/', + 'info_dict': { + 'id': 'BV1jL41167ZG', + 'title': '一场大火引发的离奇死亡!古典推理经典短篇集《不可能犯罪诊断书》!', + 'ext': 'mp4', + }, + 'skip': 'supporter-only video', + }, { + 'url': 'https://www.bilibili.com/video/BV1Ks411f7aQ/', + 'info_dict': { + 'id': 'BV1Ks411f7aQ', + 'title': '【BD1080P】狼与香辛料I【华盟】', + 'ext': 'mp4', + }, + 'skip': 'login required', + }, { + 'url': 'https://www.bilibili.com/video/BV1GJ411x7h7/', + 'info_dict': { + 'id': 'BV1GJ411x7h7', + 'title': '【官方 MV】Never Gonna Give You Up - Rick Astley', + 'ext': 'mp4', + }, + 'skip': 'geo-restricted', + }, { + 'note': 'has - in the last path segment of the url', + 'url': 'https://www.bilibili.com/festival/bh3-7th?bvid=BV1tr4y1f7p2&', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + headers = self.geo_verification_headers() + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers) + if not self._match_valid_url(urlh.url): + return self.url_result(urlh.url) + + headers['Referer'] = url + + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + is_festival = 'videoData' not in initial_state + if is_festival: + video_data = initial_state['videoInfo'] + else: + play_info_obj = self._search_json( + r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False) + if not play_info_obj: + if traverse_obj(initial_state, ('error', 'trueCode')) == -403: + self.raise_login_required() + if traverse_obj(initial_state, ('error', 'trueCode')) == -404: + raise ExtractorError( + 'This video may be 
deleted or geo-restricted. ' + 'You might want to try a VPN or a proxy server (with --proxy)', expected=True) + play_info = traverse_obj(play_info_obj, ('data', {dict})) + if not play_info: + if traverse_obj(play_info_obj, 'code') == 87007: + toast = get_element_by_class('tips-toast', webpage) or '' + msg = clean_html( + f'{get_element_by_class("belongs-to", toast) or ""},' + + (get_element_by_class('level', toast) or '')) + raise ExtractorError( + f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True) + raise ExtractorError('Failed to extract play info') + video_data = initial_state['videoData'] - video_data = initial_state['videoData'] video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. - page_list_json = traverse_obj( + page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, - note='Extracting videos in anthology'), + note='Extracting videos in anthology', headers=headers), 'data', expected_type=list) or [] is_anthology = len(page_list_json) > 1 @@ -312,127 +686,473 @@ def _real_extract(self, url): aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') - cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') - return { + festival_info = {} + if is_festival: + play_info = self._download_playinfo(video_id, cid, headers=headers) + + festival_info = traverse_obj(initial_state, { + 'uploader': ('videoInfo', 'upName'), + 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), + 'like_count': ('videoStatus', 'like', {int_or_none}), + 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), + }, get_all=False) + + metainfo = { + **traverse_obj(initial_state, { + 'uploader': ('upData', 'name'), + 'uploader_id': ('upData', 'mid', {str_or_none}), + 'like_count': ('videoData', 'stat', 'like', {int_or_none}), + 'tags': ('tags', ..., 'tag_name'), + 'thumbnail': ('videoData', 'pic', {url_or_none}), + }), + **festival_info, + **traverse_obj(video_data, { + 'description': 'desc', + 'timestamp': ('pubdate', {int_or_none}), + 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), + 'comment_count': ('stat', 'reply', {int_or_none}), + }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', - 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'description': traverse_obj(initial_state, ('videoData', 'desc')), - 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), - 'uploader': traverse_obj(initial_state, ('upData', 'name')), - 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), - 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), - 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), - 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), - 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), - 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), + 'http_headers': {'Referer': url}, + } + + is_interactive = traverse_obj(video_data, ('rights', 'is_stein_gate')) + if is_interactive: + return self.playlist_result( + self._get_interactive_entries(video_id, cid, metainfo, headers=headers), **metainfo, + 
duration=traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})), + __post_extractor=self.extract_comments(aid)) + else: + formats = self.extract_formats(play_info) + + if not traverse_obj(play_info, ('dash')): + # we only have legacy formats and need additional work + has_qn = lambda x: x in traverse_obj(formats, (..., 'quality')) + for qn in traverse_obj(play_info, ('accept_quality', lambda _, v: not has_qn(v), {int})): + formats.extend(traverse_obj( + self.extract_formats(self._download_playinfo(video_id, cid, headers=headers, qn=qn)), + lambda _, v: not has_qn(v['quality']))) + self._check_missing_formats(play_info, formats) + flv_formats = traverse_obj(formats, lambda _, v: v['fragments']) + if flv_formats and len(flv_formats) < len(formats): + # Flv and mp4 are incompatible due to `multi_video` workaround, so drop one + if not self._configuration_arg('prefer_multi_flv'): + dropped_fmts = ', '.join( + f'{f.get("format_note")} ({f.get("format_id")})' for f in flv_formats) + formats = traverse_obj(formats, lambda _, v: not v.get('fragments')) + if dropped_fmts: + self.to_screen( + f'Dropping incompatible flv format(s) {dropped_fmts} since mp4 is available. ' + 'To extract flv, pass --extractor-args "bilibili:prefer_multi_flv"') + else: + formats = traverse_obj( + # XXX: Filtering by extractor-arg is for testing purposes + formats, lambda _, v: v['quality'] == int(self._configuration_arg('prefer_multi_flv')[0]), + ) or [max(flv_formats, key=lambda x: x['quality'])] + + if traverse_obj(formats, (0, 'fragments')): + # We have flv formats, which are individual short videos with their own timestamps and metainfo + # Binary concatenation corrupts their timestamps, so we need a `multi_video` workaround + return { + **metainfo, + '_type': 'multi_video', + 'entries': [{ + 'id': f'{metainfo["id"]}_{idx}', + 'title': metainfo['title'], + 'http_headers': metainfo['http_headers'], + 'formats': [{ + **fragment, + 'format_id': formats[0].get('format_id'), + }], + 'subtitles': self.extract_subtitles(video_id, cid) if idx == 0 else None, + '__post_extractor': self.extract_comments(aid) if idx == 0 else None, + } for idx, fragment in enumerate(formats[0]['fragments'])], + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + } + else: + return { + **metainfo, + 'formats': formats, + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'chapters': self._get_chapters(aid, cid), + 'subtitles': self.extract_subtitles(video_id, cid), + '__post_extractor': self.extract_comments(aid), + } + + +class BiliBiliBangumiIE(BilibiliBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/ep(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ep21495/', + 'info_dict': { + 'id': '21495', + 'ext': 'mp4', + 'series': '悠久之翼', + 'series_id': '774', + 'season': '第二季', + 'season_id': '1182', + 'season_number': 2, + 'episode': 'forever/ef', + 'episode_id': '21495', + 'episode_number': 12, + 'title': '12 forever/ef', + 'duration': 1420.791, + 'timestamp': 1320412200, + 'upload_date': '20111104', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { + 'url': 'https://www.bilibili.com/bangumi/play/ep267851', + 'info_dict': { + 'id': '267851', + 'ext': 'mp4', + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '立志篇', + 'season_id': '26801', + 'season_number': 1, + 'episode': '残酷', + 'episode_id': '267851', + 'episode_number': 1, + 'title': '1 残酷', + 'duration': 1425.256, + 'timestamp': 1554566400, + 'upload_date': '20190406', + 
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'skip': 'Geo-restricted', + }, { + 'note': 'a making-of which falls outside main section', + 'url': 'https://www.bilibili.com/bangumi/play/ep345120', + 'info_dict': { + 'id': '345120', + 'ext': 'mp4', + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '立志篇', + 'season_id': '26801', + 'season_number': 1, + 'episode': '炭治郎篇', + 'episode_id': '345120', + 'episode_number': 27, + 'title': '#1 炭治郎篇', + 'duration': 1922.129, + 'timestamp': 1602853860, + 'upload_date': '20201016', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + headers = self.geo_verification_headers() + webpage = self._download_webpage(url, episode_id, headers=headers) + + if '您所在的地区无法观看本片' in webpage: + raise GeoRestrictedError('This video is restricted') + elif '正在观看预览,大会员免费看全片' in webpage: + self.raise_login_required('This video is for premium members only') + + headers['Referer'] = url + play_info = self._download_json( + 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id, + 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, + headers=headers) + premium_only = play_info.get('code') == -10403 + play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} + + formats = self.extract_formats(play_info) + if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): + self.raise_login_required('This video is for premium members only') + + bangumi_info = self._download_json( + 'https://api.bilibili.com/pgc/view/web/season', episode_id, 'Get episode details', + query={'ep_id': episode_id}, headers=headers)['result'] + + episode_number, episode_info = next(( + (idx, ep) for idx, ep in enumerate(traverse_obj( + bangumi_info, (('episodes', ('section', ..., 'episodes')), ..., {dict})), 1) + if str_or_none(ep.get('id')) == episode_id), (1, {})) + + season_id = bangumi_info.get('season_id') + season_number, season_title = season_id and next(( + (idx + 1, e.get('season_title')) for idx, e in enumerate( + traverse_obj(bangumi_info, ('seasons', ...))) + if e.get('season_id') == season_id + ), (None, None)) + + aid = episode_info.get('aid') + + return { + 'id': episode_id, + 'formats': formats, + **traverse_obj(bangumi_info, { + 'series': ('series', 'series_title', {str}), + 'series_id': ('series', 'series_id', {str_or_none}), + 'thumbnail': ('square_cover', {url_or_none}), + }), + **traverse_obj(episode_info, { + 'episode': ('long_title', {str}), + 'episode_number': ('title', {int_or_none}, {lambda x: x or episode_number}), + 'timestamp': ('pub_time', {int_or_none}), + 'title': {lambda v: v and join_nonempty('title', 'long_title', delim=' ', from_dict=v)}, + }), + 'episode_id': episode_id, + 'season': str_or_none(season_title), + 'season_id': str_or_none(season_id), + 'season_number': season_number, 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'chapters': self._get_chapters(aid, cid), - 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } -class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)' - - _TESTS = [{ - 'url': 'https://www.bilibili.com/bangumi/play/ss897', - 'info_dict': { - 'id': 'ss897', - 'ext': 'mp4', - 'series': '神的记事本', - 'season': '神的记事本', - 
'season_id': 897, - 'season_number': 1, - 'episode': '你与旅行包', - 'episode_number': 2, - 'title': '神的记事本:第2话 你与旅行包', - 'duration': 1428.487, - 'timestamp': 1310809380, - 'upload_date': '20110716', - 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', - }, - }, { - 'url': 'https://www.bilibili.com/bangumi/play/ep508406', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if '您所在的地区无法观看本片' in webpage: - raise GeoRestrictedError('This video is restricted') - elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage - or '正在观看预览,大会员免费看全片' in webpage): - self.raise_login_required('This video is for premium members only') - - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - formats = self.extract_formats(play_info) - if (not formats and '成为大会员抢先看' in webpage - and play_info.get('durl') and not play_info.get('dash')): - self.raise_login_required('This video is for premium members only') - - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - - season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) - season_number = season_id and next(( - idx + 1 for idx, e in enumerate( - traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) - if e.get('season_id') == season_id - ), None) - - return { - 'id': video_id, - 'formats': formats, - 'title': traverse_obj(initial_state, 'h1Title'), - 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), - 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), - 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), - 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), - 'season_id': season_id, - 'season_number': season_number, - 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), - 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), - 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles( - video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), - '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), - 'http_headers': {'Referer': url, **self.geo_verification_headers()}, - } - - -class BiliBiliBangumiMediaIE(InfoExtractor): - _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' +class BiliBiliBangumiMediaIE(BilibiliBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { 'id': '24097891', + 'title': 'CAROLE & TUESDAY', + 'description': 'md5:42417ad33d1eaa1c93bfd2dd1626b829', }, 'playlist_mincount': 25, + }, { + 'url': 'https://www.bilibili.com/bangumi/media/md1565/', + 'info_dict': { + 'id': '1565', + 'title': '攻壳机动队 S.A.C. 
2nd GIG', + 'description': 'md5:46cac00bafd645b97f4d6df616fc576d', + }, + 'playlist_count': 26, + 'playlist': [{ + 'info_dict': { + 'id': '68540', + 'ext': 'mp4', + 'series': '攻壳机动队', + 'series_id': '1077', + 'season': '第二季', + 'season_id': '1565', + 'season_number': 2, + 'episode': '再启动 REEMBODY', + 'episode_id': '68540', + 'episode_number': 1, + 'title': '1 再启动 REEMBODY', + 'duration': 1525.777, + 'timestamp': 1425074413, + 'upload_date': '20150227', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }], }] def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) - episode_list = self._download_json( - 'https://api.bilibili.com/pgc/web/season/section', media_id, - query={'season_id': initial_state['mediaInfo']['season_id']}, - note='Downloading season info')['result']['main_section']['episodes'] + initial_state = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) + ss_id = initial_state['mediaInfo']['season_id'] - return self.playlist_result(( - self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) - for entry in episode_list), media_id) + return self.playlist_result( + self._get_episodes_from_season(ss_id, url), media_id, + **traverse_obj(initial_state, ('mediaInfo', { + 'title': ('title', {str}), + 'description': ('evaluate', {str}), + }))) -class BilibiliSpaceBaseIE(InfoExtractor): +class BiliBiliBangumiSeasonIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss26801', + 'info_dict': { + 'id': '26801', + 'title': '鬼灭之刃', + 'description': 'md5:e2cc9848b6f69be6db79fc2a82d9661b', + }, + 'playlist_mincount': 26, + }, { + 'url': 'https://www.bilibili.com/bangumi/play/ss2251', + 'info_dict': { + 'id': '2251', + 'title': '玲音', + 'description': 'md5:1fd40e3df4c08d4d9d89a6a34844bdc4', + }, + 'playlist_count': 13, + 'playlist': [{ + 'info_dict': { + 'id': '50188', + 'ext': 'mp4', + 'series': '玲音', + 'series_id': '1526', + 'season': 'TV', + 'season_id': '2251', + 'season_number': 1, + 'episode': 'WEIRD', + 'episode_id': '50188', + 'episode_number': 1, + 'title': '1 WEIRD', + 'duration': 1436.992, + 'timestamp': 1343185080, + 'upload_date': '20120725', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }], + }] + + def _real_extract(self, url): + ss_id = self._match_id(url) + webpage = self._download_webpage(url, ss_id) + metainfo = traverse_obj( + self._search_json(r'<script[^>]+type="application/ld\+json"[^>]*>', webpage, 'info', ss_id), + ('itemListElement', ..., { + 'title': ('name', {str}), + 'description': ('description', {str}), + }), get_all=False) + + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo) + + +class BilibiliCheeseBaseIE(BilibiliBaseIE): + _HEADERS = {'Referer': 'https://www.bilibili.com/'} + + def _extract_episode(self, season_info, ep_id): + episode_info = traverse_obj(season_info, ( + 'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False) + aid, cid = episode_info['aid'], episode_info['cid'] + + if traverse_obj(episode_info, 'ep_status') == -1: + raise ExtractorError('This course episode is not yet available.', expected=True) + if not traverse_obj(episode_info, 'playable'): + self.raise_login_required('You need to purchase the course to download this episode') + + play_info = 
self._download_json( + 'https://api.bilibili.com/pugv/player/web/playurl', ep_id, + query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1}, + headers=self._HEADERS, note='Downloading playinfo')['data'] + + return { + 'id': str_or_none(ep_id), + 'episode_id': str_or_none(ep_id), + 'formats': self.extract_formats(play_info), + 'extractor_key': BilibiliCheeseIE.ie_key(), + 'extractor': BilibiliCheeseIE.IE_NAME, + 'webpage_url': f'https://www.bilibili.com/cheese/play/ep{ep_id}', + **traverse_obj(episode_info, { + 'episode': ('title', {str}), + 'title': {lambda v: v and join_nonempty('index', 'title', delim=' - ', from_dict=v)}, + 'alt_title': ('subtitle', {str}), + 'duration': ('duration', {int_or_none}), + 'episode_number': ('index', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + 'timestamp': ('release_date', {int_or_none}), + 'view_count': ('play', {int_or_none}), + }), + **traverse_obj(season_info, { + 'uploader': ('up_info', 'uname', {str}), + 'uploader_id': ('up_info', 'mid', {str_or_none}), + }), + 'subtitles': self.extract_subtitles(ep_id, cid, aid=aid), + '__post_extractor': self.extract_comments(aid), + 'http_headers': self._HEADERS, + } + + def _download_season_info(self, query_key, video_id): + return self._download_json( + f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id, + headers=self._HEADERS, note='Downloading season info')['data'] + + +class BilibiliCheeseIE(BilibiliCheeseBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/cheese/play/ep229832', + 'info_dict': { + 'id': '229832', + 'ext': 'mp4', + 'title': '1 - 课程先导片', + 'alt_title': '视频课 · 3分41秒', + 'uploader': '马督工', + 'uploader_id': '316568752', + 'episode': '课程先导片', + 'episode_id': '229832', + 'episode_number': 1, + 'duration': 221, + 'timestamp': 1695549606, + 'upload_date': '20230924', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'view_count': int, + }, + }] + + def _real_extract(self, url): + ep_id = self._match_id(url) + return self._extract_episode(self._download_season_info('ep_id', ep_id), ep_id) + + +class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/cheese/play/ss5918', + 'info_dict': { + 'id': '5918', + 'title': '【限时五折】新闻系学不到:马督工教你做自媒体', + 'description': '帮普通人建立世界模型,降低人与人的沟通门槛', + }, + 'playlist': [{ + 'info_dict': { + 'id': '229832', + 'ext': 'mp4', + 'title': '1 - 课程先导片', + 'alt_title': '视频课 · 3分41秒', + 'uploader': '马督工', + 'uploader_id': '316568752', + 'episode': '课程先导片', + 'episode_id': '229832', + 'episode_number': 1, + 'duration': 221, + 'timestamp': 1695549606, + 'upload_date': '20230924', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'view_count': int, + }, + }], + 'params': {'playlist_items': '1'}, + }, { + 'url': 'https://www.bilibili.com/cheese/play/ss5918', + 'info_dict': { + 'id': '5918', + 'title': '【限时五折】新闻系学不到:马督工教你做自媒体', + 'description': '帮普通人建立世界模型,降低人与人的沟通门槛', + }, + 'playlist_mincount': 5, + 'skip': 'paid video in list', + }] + + def _get_cheese_entries(self, season_info): + for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')): + yield self._extract_episode(season_info, ep_id) + + def _real_extract(self, url): + season_id = self._match_id(url) + season_info = self._download_season_info('season_id', season_id) + + return self.playlist_result( + 
self._get_cheese_entries(season_info), season_id, + **traverse_obj(season_info, { + 'title': ('title', {str}), + 'description': ('subtitle', {str}), + })) + + +class BilibiliSpaceBaseIE(BilibiliBaseIE): def _extract_playlist(self, fetch_page, get_metadata, get_entries): first_page = fetch_page(0) metadata = get_metadata(first_page) @@ -452,6 +1172,14 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): 'id': '3985676', }, 'playlist_mincount': 178, + 'skip': 'login required', + }, { + 'url': 'https://space.bilibili.com/313580179/video', + 'info_dict': { + 'id': '313580179', + }, + 'playlist_mincount': 92, + 'skip': 'login required', }] def _real_extract(self, url): @@ -461,18 +1189,36 @@ def _real_extract(self, url): 'To download audios, add a "/audio" to the URL') def fetch_page(page_idx): + query = { + 'keyword': '', + 'mid': playlist_id, + 'order': traverse_obj(parse_qs(url), ('order', 0)) or 'pubdate', + 'order_avoided': 'true', + 'platform': 'web', + 'pn': page_idx + 1, + 'ps': 30, + 'tid': 0, + 'web_location': 1550101, + } + try: - response = self._download_json('https://api.bilibili.com/x/space/arc/search', - playlist_id, note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + response = self._download_json( + 'https://api.bilibili.com/x/space/wbi/arc/search', playlist_id, + query=self._sign_wbi(query, playlist_id), + note=f'Downloading space page {page_idx}', headers={'Referer': url}) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise - if response['code'] == -401: + status_code = response['code'] + if status_code == -401: raise ExtractorError( 'Request is blocked by server (401), please add cookies, wait and try later.', expected=True) + elif status_code == -352 and not self.is_logged_in: + self.raise_login_required('Request is rejected, you need to login to access playlist') + elif status_code != 0: + raise ExtractorError(f'Request failed ({status_code}): {response.get("message") or "Unknown error"}') return response['data'] def get_metadata(page_data): @@ -494,9 +1240,9 @@ def get_entries(page_data): class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio' _TESTS = [{ - 'url': 'https://space.bilibili.com/3985676/audio', + 'url': 'https://space.bilibili.com/313580179/audio', 'info_dict': { - 'id': '3985676', + 'id': '313580179', }, 'playlist_mincount': 1, }] @@ -524,13 +1270,35 @@ def get_entries(page_data): return self.playlist_result(paged_list, playlist_id) -class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): - _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)' +class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE): + def _get_entries(self, page_data, bvid_keys, ending_key='bvid'): + for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})): + yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid) + + def _get_uploader(self, uid, playlist_id): + webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False) + return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False) + + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + metadata, 
page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries) + metadata.pop('page_count', None) + metadata.pop('page_size', None) + return metadata, page_list + + +class BilibiliCollectionListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)' _TESTS = [{ 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', 'info_dict': { 'id': '2142762_57445', - 'title': '《底特律 变人》' + 'title': '【完结】《底特律 变人》全结局流程解说', + 'description': '', + 'uploader': '老戴在此', + 'uploader_id': '2142762', + 'timestamp': int, + 'upload_date': str, + 'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg', }, 'playlist_mincount': 31, }] @@ -551,46 +1319,308 @@ def get_metadata(page_data): return { 'page_count': math.ceil(entry_count / page_size), 'page_size': page_size, - 'title': traverse_obj(page_data, ('meta', 'name')) + 'uploader': self._get_uploader(mid, playlist_id), + **traverse_obj(page_data, { + 'title': ('meta', 'name', {str}), + 'description': ('meta', 'description', {str}), + 'uploader_id': ('meta', 'mid', {str_or_none}), + 'timestamp': ('meta', 'ptime', {int_or_none}), + 'thumbnail': ('meta', 'cover', {url_or_none}), + }), } def get_entries(page_data): - for entry in page_data.get('archives', []): - yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', - BiliBiliIE, entry['bvid']) + return self._get_entries(page_data, 'archives') metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) - return self.playlist_result(paged_list, playlist_id, metadata['title']) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliSeriesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0', + 'info_dict': { + 'id': '1958703906_547718', + 'title': '直播回放', + 'description': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + 'modified_timestamp': int, + 'modified_date': str, + }, + 'playlist_mincount': 513, + }] + + def _real_extract(self, url): + mid, sid = self._match_valid_url(url).group('mid', 'sid') + playlist_id = f'{mid}_{sid}' + playlist_meta = traverse_obj(self._download_json( + f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False, + ), { + 'title': ('data', 'meta', 'name', {str}), + 'description': ('data', 'meta', 'description', {str}), + 'uploader_id': ('data', 'meta', 'mid', {str_or_none}), + 'timestamp': ('data', 'meta', 'ctime', {int_or_none}), + 'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}), + }) + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/series/archives', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data'] + + def get_metadata(page_data): + page_size = page_data['page']['size'] + entry_count = page_data['page']['total'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + 'uploader': self._get_uploader(mid, playlist_id), + **playlist_meta, + } + + def get_entries(page_data): + return self._get_entries(page_data, 'archives') + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + 
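
The space/collection/series extractors above all follow the same three-callback contract fed into `_extract_playlist`. Here is a rough, self-contained sketch of that contract driving `OnDemandPagedList` with fake in-memory pages instead of the Bilibili API; it assumes the `getslice()` behavior of `yt_dlp.utils.PagedList`, which stops once a page returns fewer entries than the page size.

```python
import math

from yt_dlp.utils import OnDemandPagedList

# Two fake API pages, two items per page; stand-ins for the real JSON.
FAKE_PAGES = [
    {'page': {'size': 2, 'total': 3}, 'archives': [{'bvid': 'BV1x'}, {'bvid': 'BV2x'}]},
    {'page': {'size': 2, 'total': 3}, 'archives': [{'bvid': 'BV3x'}]},
]

def fetch_page(page_idx):      # downloads one page of results
    return FAKE_PAGES[page_idx]

def get_metadata(first_page):  # derives paging info from page 0
    page = first_page['page']
    return {'page_count': math.ceil(page['total'] / page['size']),
            'page_size': page['size']}

def get_entries(page_data):    # yields the entries of one page
    yield from page_data['archives']

metadata = get_metadata(fetch_page(0))
paged_list = OnDemandPagedList(
    lambda idx: get_entries(fetch_page(idx)), metadata['page_size'])
assert [e['bvid'] for e in paged_list.getslice()] == ['BV1x', 'BV2x', 'BV3x']
```
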
return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create', + 'info_dict': { + 'id': '1103407912', + 'title': '【V2】(旧)', + 'description': '', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'modified_timestamp': int, + 'modified_date': str, + 'thumbnail': r're:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg', + 'view_count': int, + 'like_count': int, + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/detail/ml1103407912', + 'only_matching': True, + }] + + def _real_extract(self, url): + fid = self._match_id(url) + + list_info = self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20', + fid, note='Downloading favlist metadata') + if list_info['code'] == -403: + self.raise_login_required(msg='This is a private favorites list. You need to log in as its owner') + + entries = self._get_entries(self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}', + fid, note='Download favlist entries'), 'data') + + return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', { + 'title': ('title', {str}), + 'description': ('intro', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'modified_timestamp': ('mtime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + 'view_count': ('cnt_info', 'play', {int_or_none}), + 'like_count': ('cnt_info', 'thumb_up', {int_or_none}), + }))) + + +class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/watchlater/#/list', + 'info_dict': { + 'id': r're:\d+', + 'title': '稍后再看', + }, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _real_extract(self, url): + list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater') + watchlater_info = self._download_json( + 'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id) + if watchlater_info['code'] == -101: + self.raise_login_required(msg='You need to login to access your watchlater list') + entries = self._get_entries(watchlater_info, ('data', 'list')) + return self.playlist_result(entries, id=list_id, title='稍后再看') + + +class BilibiliPlaylistIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/list/1958703906?sid=547718', + 'info_dict': { + 'id': '5_547718', + 'title': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + }, + 'playlist_mincount': 513, + }, { + 'url': 'https://www.bilibili.com/list/1958703906?sid=547718&oid=687146339&bvid=BV1DU4y1r7tz', + 'info_dict': { + 'id': 'BV1DU4y1r7tz', + 'ext': 'mp4', + 'title': '【直播回放】8.20晚9:30 3d发布喵 2022年8月20日21点场', + 'upload_date': '20220820', + 'description': '', + 'timestamp': 1661016330, + 'uploader_id': '1958703906', + 'uploader': '靡烟miya', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'duration': 9552.903, + 'tags': list, + 
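
`BilibiliFavoritesListIE` above maps API JSON into metadata with `traverse_obj` dict templates and one-element-set coercers. A runnable sketch of that idiom against a stubbed fav-list payload follows; the payload values are invented.

```python
from yt_dlp.utils import int_or_none, str_or_none, traverse_obj, url_or_none

list_info = {'data': {'info': {  # stubbed API response
    'title': '【V2】(旧)',
    'upper': {'name': '晓月春日', 'mid': 84912},
    'ctime': 1604905176,
    'cover': 'http://i2.hdslb.com/bfs/archive/example.jpg',
    'cnt_info': {'play': '123', 'thumb_up': 45},
}}}

# A dict path element builds a result dict; a one-element set is either a
# type filter ({str}) or a converter ({int_or_none}); None values drop out.
metadata = traverse_obj(list_info, ('data', 'info', {
    'title': ('title', {str}),
    'uploader': ('upper', 'name', {str}),
    'uploader_id': ('upper', 'mid', {str_or_none}),     # int -> '84912'
    'timestamp': ('ctime', {int_or_none}),
    'thumbnail': ('cover', {url_or_none}),
    'view_count': ('cnt_info', 'play', {int_or_none}),  # '123' -> 123
}))
assert metadata['uploader_id'] == '84912' and metadata['view_count'] == 123
```
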
'comment_count': int, + 'view_count': int, + 'like_count': int, + '_old_archive_ids': ['bilibili 687146339_part1'], + }, + 'params': {'noplaylist': True}, + }, { + 'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1', + 'info_dict': { + 'id': '5_547718', + }, + 'playlist_mincount': 513, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + 'title': '【V2】(旧)', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'thumbnail': r're:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg', + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/play/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + }, + 'playlist_mincount': 22, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/watchlater', + 'info_dict': { + 'id': r're:2_\d+', + 'title': '稍后再看', + 'uploader': str, + 'uploader_id': str, + }, + 'playlist_mincount': 0, + 'skip': 'login required', + }, { + 'url': 'https://www.bilibili.com/medialist/play/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'redirect url & login required', + }] + + def _extract_medialist(self, query, list_id): + for page_num in itertools.count(1): + page_data = self._download_json( + 'https://api.bilibili.com/x/v2/medialist/resource/list', + list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}', + )['data'] + yield from self._get_entries(page_data, 'media_list', ending_key='bv_id') + query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id')) + if not page_data.get('has_more', False): + break + + def _real_extract(self, url): + list_id = self._match_id(url) + + bvid = traverse_obj(parse_qs(url), ('bvid', 0)) + if not self._yes_playlist(list_id, bvid): + return self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE) + + webpage = self._download_webpage(url, list_id) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id) + if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200: + error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none})) + error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none})) + if error_code == -400 and list_id == 'watchlater': + self.raise_login_required('You need to login to access your watchlater playlist') + elif error_code == -403: + self.raise_login_required('This is a private playlist. 
You need to login as its owner') + elif error_code == 11010: + raise ExtractorError('Playlist is no longer available', expected=True) + raise ExtractorError(f'Could not access playlist: {error_code} {error_message}') + + query = { + 'ps': 20, + 'with_current': False, + **traverse_obj(initial_state, { + 'type': ('playlist', 'type', {int_or_none}), + 'biz_id': ('playlist', 'id', {int_or_none}), + 'tid': ('tid', {int_or_none}), + 'sort_field': ('sortFiled', {int_or_none}), + 'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}), + }), + } + metadata = { + 'id': f'{query["type"]}_{query["biz_id"]}', + **traverse_obj(initial_state, ('mediaListInfo', { + 'title': ('title', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}, {lambda x: x or None}), + 'thumbnail': ('cover', {url_or_none}), + })), + } + return self.playlist_result(self._extract_medialist(query, list_id), **metadata) class BilibiliCategoryIE(InfoExtractor): IE_NAME = 'Bilibili category extractor' _MAX_RESULTS = 1000000 - _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' _TESTS = [{ 'url': 'https://www.bilibili.com/v/kichiku/mad', 'info_dict': { 'id': 'kichiku: mad', - 'title': 'kichiku: mad' + 'title': 'kichiku: mad', }, 'playlist_mincount': 45, 'params': { - 'playlistend': 45 - } + 'playlistend': 45, + }, }] def _fetch_page(self, api_url, num_pages, query, page_num): parsed_json = self._download_json( api_url, query, query={'Search_key': query, 'pn': page_num}, - note='Extracting results from page %s of %s' % (page_num, num_pages)) + note=f'Extracting results from page {page_num} of {num_pages}') video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list) if not video_list: - raise ExtractorError('Failed to retrieve video list for page %d' % page_num) + raise ExtractorError(f'Failed to retrieve video list for page {page_num}') for video in video_list: yield self.url_result( - 'https://www.bilibili.com/video/%s' % video['bvid'], 'BiliBili', video['bvid']) + 'https://www.bilibili.com/video/{}'.format(video['bvid']), 'BiliBili', video['bvid']) def _entries(self, category, subcategory, query): # map of categories : subcategories : RIDs @@ -600,7 +1630,7 @@ def _entries(self, category, subcategory, query): 'manual_vocaloid': 126, 'guide': 22, 'theatre': 216, - 'course': 127 + 'course': 127, }, } @@ -626,7 +1656,7 @@ def _entries(self, category, subcategory, query): def _real_extract(self, url): category, subcategory = urllib.parse.urlparse(url).path.split('/')[2:4] - query = '%s: %s' % (category, subcategory) + query = f'{category}: {subcategory}' return self.playlist_result(self._entries(category, subcategory, query), query, query) @@ -635,8 +1665,37 @@ class BiliBiliSearchIE(SearchInfoExtractor): IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' + _TESTS = [{ + 'url': 'bilisearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + 'playlist_count': 3, + 'info_dict': { + 'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + 'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'BV1n44y1Q7sc', + 'ext': 'mp4', + 'title': '“出道一年,我怎么还在等你单推的女人睡觉后开播啊?”【一分钟了解靡烟miya】', + 'timestamp': 1669889987, + 'upload_date': '20221201', + 'description': 'md5:43343c0973defff527b5a4b403b4abf9', + 'tags': list, + 'uploader': '靡烟miya', + 'duration': 123.156, + 'uploader_id': '1958703906', + 'comment_count': int, + 'view_count': 
int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + '_old_archive_ids': ['bilibili 988222410_part1'], + }, + }], + }] def _search_results(self, query): + if not self._get_cookies('https://api.bilibili.com').get('buvid3'): + self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc') for page_num in itertools.count(1): videos = self._download_json( 'https://api.bilibili.com/x/web-interface/search/type', query, @@ -700,7 +1759,7 @@ def _real_extract(self, url): formats = [{ 'url': play_data['cdns'][0], 'filesize': int_or_none(play_data.get('size')), - 'vcodec': 'none' + 'vcodec': 'none', }] for a_format in formats: @@ -718,7 +1777,7 @@ def _real_extract(self, url): subtitles = { 'origin': [{ 'url': lyric, - }] + }], } return { @@ -786,13 +1845,14 @@ class BiliBiliPlayerIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( - 'http://www.bilibili.tv/video/av%s/' % video_id, + f'http://www.bilibili.tv/video/av{video_id}/', ie=BiliBiliIE.ie_key(), video_id=video_id) class BiliIntlBaseIE(InfoExtractor): _API_URL = 'https://api.bilibili.tv/intl/gateway' _NETRC_MACHINE = 'biliintl' + _HEADERS = {'Referer': 'https://www.bilibili.com/'} def _call_api(self, endpoint, *args, **kwargs): json = self._download_json(self._API_URL + endpoint, *args, **kwargs) @@ -813,11 +1873,10 @@ def _call_api(self, endpoint, *args, **kwargs): return json.get('data') def json2srt(self, json): - data = '\n\n'.join( + return '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' for i, line in enumerate(traverse_obj(json, ( 'body', lambda _, l: l['content'] and l['from'] and l['to'])))) - return data def _get_subtitles(self, *, ep_id=None, aid=None): sub_json = self._call_api( @@ -830,19 +1889,34 @@ def _get_subtitles(self, *, ep_id=None, aid=None): 'aid': aid, })) or {} subtitles = {} - for sub in sub_json.get('subtitles') or []: - sub_url = sub.get('url') - if not sub_url: - continue - sub_data = self._download_json( - sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, - note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') - if not sub_data: - continue - subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ - 'ext': 'srt', - 'data': self.json2srt(sub_data) - }) + fetched_urls = set() + for sub in traverse_obj(sub_json, (('subtitles', 'video_subtitle'), ..., {dict})): + for url in traverse_obj(sub, ((None, 'ass', 'srt'), 'url', {url_or_none})): + if url in fetched_urls: + continue + fetched_urls.add(url) + sub_ext = determine_ext(url) + sub_lang = sub.get('lang_key') or 'en' + + if sub_ext == 'ass': + subtitles.setdefault(sub_lang, []).append({ + 'ext': 'ass', + 'url': url, + }) + elif sub_ext == 'json': + sub_data = self._download_json( + url, ep_id or aid, fatal=False, + note=f'Downloading subtitles{format_field(sub, "lang", " for %s")} ({sub_lang})', + errnote='Unable to download subtitles') + + if sub_data: + subtitles.setdefault(sub_lang, []).append({ + 'ext': 'srt', + 'data': self.json2srt(sub_data), + }) + else: + self.report_warning('Unexpected subtitle extension', ep_id or aid) + return subtitles def _get_formats(self, *, ep_id=None, aid=None): @@ -888,28 +1962,31 @@ def _get_formats(self, *, ep_id=None, aid=None): def _parse_video_metadata(self, video_data): return { 'title': video_data.get('title_display') or video_data.get('title'), + 'description': video_data.get('desc'), 'thumbnail': 
video_data.get('cover'), + 'timestamp': unified_timestamp(video_data.get('formatted_pub_date')), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), } def _perform_login(self, username, password): - if not Cryptodome: + if not Cryptodome.RSA: raise ExtractorError('pycryptodomex not found. Please install', expected=True) key_data = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None, note='Downloading login key', errnote='Unable to download login key')['data'] - public_key = Cryptodome.PublicKey.RSA.importKey(key_data['key']) - password_hash = Cryptodome.Cipher.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) + public_key = Cryptodome.RSA.importKey(key_data['key']) + password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode()) login_post = self._download_json( - 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({ + 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, + data=urlencode_postdata({ 'username': username, 'password': base64.b64encode(password_hash).decode('ascii'), 'keep_me': 'true', 's_locale': 'en_US', - 'isTrusted': 'true' + 'isTrusted': 'true', }), note='Logging in', errnote='Unable to log in') if login_post.get('code'): if login_post.get('message'): @@ -936,17 +2013,17 @@ class BiliIntlIE(BiliIntlBaseIE): 'chapters': [{ 'start_time': 0, 'end_time': 76.242, - 'title': '<Untitled Chapter 1>' + 'title': '<Untitled Chapter 1>', }, { 'start_time': 76.242, 'end_time': 161.161, - 'title': 'Intro' + 'title': 'Intro', }, { 'start_time': 1325.742, 'end_time': 1403.903, - 'title': 'Outro' + 'title': 'Outro', }], - } + }, }, { # Non-Bstation page 'url': 'https://www.bilibili.tv/en/play/1033760/11005006', @@ -963,17 +2040,17 @@ class BiliIntlIE(BiliIntlBaseIE): 'chapters': [{ 'start_time': 0, 'end_time': 88.0, - 'title': '<Untitled Chapter 1>' + 'title': '<Untitled Chapter 1>', }, { 'start_time': 88.0, 'end_time': 156.0, - 'title': 'Intro' + 'title': 'Intro', }, { 'start_time': 1173.0, 'end_time': 1259.535, - 'title': 'Outro' + 'title': 'Outro', }], - } + }, }, { # Subtitle with empty content 'url': 'https://www.bilibili.tv/en/play/1005144/10131790', @@ -984,18 +2061,54 @@ class BiliIntlIE(BiliIntlBaseIE): 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 140, }, - 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.' 
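
`_perform_login` above encrypts `hash + password` with the RSA key served by passport.bilibili.tv. Below is a self-contained sketch of just that PKCS#1 v1.5 step, calling pycryptodomex directly rather than through yt-dlp's `Cryptodome` dependency shim; the locally generated key and `key_hash` stand in for the real `key_data` response.

```python
import base64

from Cryptodome.Cipher import PKCS1_v1_5
from Cryptodome.PublicKey import RSA

server_key = RSA.generate(2048)  # stand-in for key_data['key'] (a PEM string)
key_hash = 'deadbeef'            # stand-in for key_data['hash']
password = 'hunter2'

public_key = RSA.importKey(server_key.publickey().export_key())
password_hash = PKCS1_v1_5.new(public_key).encrypt((key_hash + password).encode())
# This base64 string is what goes into the login POST's 'password' field.
payload = base64.b64encode(password_hash).decode('ascii')
assert len(payload) == 344  # 2048-bit RSA -> 256-byte ciphertext -> 344 b64 chars
```
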
+ 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.', }, { - 'url': 'https://www.bilibili.tv/en/video/2041863208', + # episode comment extraction + 'url': 'https://www.bilibili.tv/en/play/34580/340317', 'info_dict': { - 'id': '2041863208', + 'id': '340317', 'ext': 'mp4', - 'timestamp': 1670874843, - 'description': 'Scheduled for April 2023.\nStudio: ufotable', - 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', - 'upload_date': '20221212', - 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', - } + 'timestamp': 1604057820, + 'upload_date': '20201030', + 'episode_number': 5, + 'title': 'E5 - My Own Steel', + 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', + 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode': 'Episode 5', + 'comment_count': int, + 'chapters': [{ + 'start_time': 0, + 'end_time': 61.0, + 'title': '<Untitled Chapter 1>', + }, { + 'start_time': 61.0, + 'end_time': 134.0, + 'title': 'Intro', + }, { + 'start_time': 1290.0, + 'end_time': 1379.0, + 'title': 'Outro', + }], + }, + 'params': { + 'getcomments': True, + }, + }, { + # user generated content comment extraction + 'url': 'https://www.bilibili.tv/en/video/2045730385', + 'info_dict': { + 'id': '2045730385', + 'ext': 'mp4', + 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', + 'timestamp': 1667891924, + 'upload_date': '20221108', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan', + 'comment_count': int, + 'thumbnail': r're:https://pic\.bstarstatic\.(?:com|net)/ugc/f6c363659efd2eabe5683fbb906b1582\.jpg', + }, + 'params': { + 'getcomments': True, + }, }, { # episode id without intro and outro 'url': 'https://www.bilibili.tv/en/play/1048837/11246489', @@ -1027,6 +2140,7 @@ class BiliIntlIE(BiliIntlBaseIE): 'only_matching': True, }] + @staticmethod def _make_url(video_id, series_id=None): if series_id: return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}' @@ -1049,16 +2163,76 @@ def _extract_video_metadata(self, url, video_id, season_id): # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) video_data = traverse_obj(season_json, ( - 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id + 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id, ), expected_type=dict, get_all=False) # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found return merge_dicts( - self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), { - 'title': self._html_search_meta('og:title', webpage), - 'description': self._html_search_meta('og:description', webpage) + self._parse_video_metadata(video_data), { + 'title': get_element_by_class( + 'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage), + 'description': get_element_by_class( + 'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage), + }, self._search_json_ld(webpage, video_id, default={})) + + def _get_comments_reply(self, root_id, next_id=0, display_id=None): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/detail', display_id, + note=f'Downloading reply comment of {root_id} - {next_id}', + query={ + 'platform': 'web', + 'ps': 20, # comment's reply per page (default: 3) + 'root': root_id, + 'next': next_id, }) + for replies in 
traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'parent': replies.get('parent'), + 'timestamp': unified_timestamp(replies.get('ctime_text')), + } + + if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + yield from self._get_comments_reply( + root_id, comment_api_raw_data['data']['cursor']['next'], display_id) + + def _get_comments(self, video_id, ep_id): + for i in itertools.count(0): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/root', video_id, + note=f'Downloading comment page {i + 1}', + query={ + 'platform': 'web', + 'pn': i, # page number + 'ps': 20, # comment per page (default: 20) + 'oid': video_id, + 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content + 'sort_type': 1, # 1: best, 2: recent + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'timestamp': unified_timestamp(replies.get('ctime_text')), + 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))), + } + if replies.get('count'): + yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id) + + if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + break + def _real_extract(self, url): season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') video_id = ep_id or aid @@ -1074,11 +2248,11 @@ def _real_extract(self, url): chapters = [{ 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_start_time')), 1000), 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_end_time')), 1000), - 'title': 'Intro' + 'title': 'Intro', }, { 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_start_time')), 1000), 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_end_time')), 1000), - 'title': 'Outro' + 'title': 'Outro', }] return { @@ -1086,7 +2260,9 @@ def _real_extract(self, url): **self._extract_video_metadata(url, video_id, season_id), 'formats': self._get_formats(ep_id=ep_id, aid=aid), 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), - 'chapters': chapters + 'chapters': chapters, + '__post_extractor': self.extract_comments(video_id, ep_id), + 'http_headers': self._HEADERS, } @@ -1132,12 +2308,13 @@ def _entries(self, series_id): episode_id = str(episode['episode_id']) yield self.url_result(smuggle_url( BiliIntlIE._make_url(episode_id, series_id), - self._parse_video_metadata(episode) + self._parse_video_metadata(episode), ), BiliIntlIE, episode_id) def _real_extract(self, url): series_id = self._match_id(url) - series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {} + series_info = self._call_api( + f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {} return 
self.playlist_result( self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'), categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none), @@ -1145,25 +2322,25 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)' + _VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', 'info_dict': { 'id': '33989', - 'description': "周六杂谈回,其他时候随机游戏。 | \n录播:@下播型泛式录播组。 | \n直播通知群(全员禁言):666906670,902092584,59971⑧481 (功能一样,别多加)", + 'description': '周六杂谈回,其他时候随机游戏。 | \n录播:@下播型泛式录播组。 | \n直播通知群(全员禁言):666906670,902092584,59971⑧481 (功能一样,别多加)', 'ext': 'flv', - 'title': "太空狼人杀联动,不被爆杀就算赢", - 'thumbnail': "https://i0.hdslb.com/bfs/live/new_room_cover/e607bc1529057ef4b332e1026e62cf46984c314d.jpg", + 'title': '太空狼人杀联动,不被爆杀就算赢', + 'thumbnail': 'https://i0.hdslb.com/bfs/live/new_room_cover/e607bc1529057ef4b332e1026e62cf46984c314d.jpg', 'timestamp': 1650802769, }, - 'skip': 'not live' + 'skip': 'not live', }, { 'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://live.bilibili.com/blanc/196', - 'only_matching': True + 'only_matching': True, }] _FORMATS = { @@ -1204,7 +2381,7 @@ def _real_extract(self, url): raise ExtractorError('Streamer is not live', expected=True) formats = [] - for qn in self._FORMATS.keys(): + for qn in self._FORMATS: stream_data = self._call_api('xlive/web-room/v2/index/getRoomPlayInfo', room_id, { 'room_id': room_id, 'qn': qn, diff --git a/yt_dlp/extractor/biqle.py b/yt_dlp/extractor/biqle.py deleted file mode 100644 index 027753503b..0000000000 --- a/yt_dlp/extractor/biqle.py +++ /dev/null @@ -1,110 +0,0 @@ -from .common import InfoExtractor -from .vk import VKIE -from ..compat import compat_b64decode -from ..utils import ( - int_or_none, - js_to_json, - traverse_obj, - unified_timestamp, -) - - -class BIQLEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' - _TESTS = [{ - 'url': 'https://biqle.ru/watch/-2000421746_85421746', - 'md5': 'ae6ef4f04d19ac84e4658046d02c151c', - 'info_dict': { - 'id': '-2000421746_85421746', - 'ext': 'mp4', - 'title': 'Forsaken By Hope Studio Clip', - 'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн', - 'upload_date': '19700101', - 'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb', - 'timestamp': 0, - }, - }, { - 'url': 'http://biqle.org/watch/-44781847_168547604', - 'md5': '7f24e72af1db0edf7c1aaba513174f97', - 'info_dict': { - 'id': '-44781847_168547604', - 'ext': 'mp4', - 'title': 'Ребенок в шоке от автоматической мойки', - 'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн', - 'timestamp': 1396633454, - 'upload_date': '20140404', - 'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta('name', webpage, 'Title', fatal=False) - timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) - description = self._html_search_meta('description', webpage, 
'Description', default=None) - - global_embed_url = self._search_regex( - r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'', - webpage, 'global Embed url') - hash = self._search_regex( - r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash') - - embed_url = global_embed_url + hash - - if VKIE.suitable(embed_url): - return self.url_result(embed_url, VKIE.ie_key(), video_id) - - embed_page = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url}) - - glob_params = self._parse_json(self._search_regex( - r'<script id="globParams">[^<]*window.globParams = ([^;]+);[^<]+</script>', - embed_page, 'Global Parameters'), video_id, transform_source=js_to_json) - host_name = compat_b64decode(glob_params['server'][::-1]).decode() - - item = self._download_json( - f'https://{host_name}/method/video.get/{video_id}', video_id, - headers={'Referer': url}, query={ - 'token': glob_params['video']['access_token'], - 'videos': video_id, - 'ckey': glob_params['c_key'], - 'credentials': glob_params['video']['credentials'], - })['response']['items'][0] - - formats = [] - for f_id, f_url in item.get('files', {}).items(): - if f_id == 'external': - return self.url_result(f_url) - ext, height = f_id.split('_') - height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height)) - if height_extra_key: - formats.append({ - 'format_id': f'{height}p', - 'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', - 'height': int_or_none(height), - 'ext': ext, - }) - - thumbnails = [] - for k, v in item.items(): - if k.startswith('photo_') and v: - width = k.replace('photo_', '') - thumbnails.append({ - 'id': width, - 'url': v, - 'width': int_or_none(width), - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'comment_count': int_or_none(item.get('comments')), - 'description': description, - 'duration': int_or_none(item.get('duration')), - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'view_count': int_or_none(item.get('views')), - } diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index 10e7b0b2bb..c83222ea5b 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -2,13 +2,15 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, OnDemandPagedList, clean_html, + extract_attributes, get_element_by_class, get_element_by_id, + get_element_html_by_class, get_elements_html_by_class, int_or_none, orderedSet, @@ -17,11 +19,12 @@ traverse_obj, unified_strdate, urlencode_postdata, + urljoin, ) class BitChuteIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', @@ -34,6 +37,25 @@ class BitChuteIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', + 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', + 'channel': 'BitChute', + 'channel_url': 'https://www.bitchute.com/channel/bitchute/', + }, + }, { + # test case: video with different channel and uploader + 'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/', + 'md5': 
'f10e6a8e787766235946d0868703f1d0', + 'info_dict': { + 'id': 'Yti_j9A-UZ4', + 'ext': 'mp4', + 'title': 'Israel at War | Full Measure', + 'description': 'md5:38cf7bc6f42da1a877835539111c69ef', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'sharylattkisson', + 'upload_date': '20231106', + 'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/', + 'channel': 'Full Measure with Sharyl Attkisson', + 'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/', }, }, { # video not downloadable in browser, but we can recover it @@ -48,6 +70,9 @@ class BitChuteIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'BitChute', 'upload_date': '20181113', + 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', + 'channel': 'BitChute', + 'channel_url': 'https://www.bitchute.com/channel/bitchute/', }, 'params': {'check_formats': None}, }, { @@ -66,6 +91,9 @@ class BitChuteIE(InfoExtractor): }, { 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', 'only_matching': True, + }, { + 'url': 'https://old.bitchute.com/video/UGlrF9o9b-Q/', + 'only_matching': True, }] _GEO_BYPASS = False @@ -77,7 +105,10 @@ class BitChuteIE(InfoExtractor): def _check_format(self, video_url, video_id): urls = orderedSet( re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) - for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128', + 'seed132', 'seed150', 'seed151', 'seed152', 'seed153', + 'seed167', 'seed171', 'seed177', 'seed305', 'seed307', + 'seedp29xb', 'zb10-7gsop1v78')) for url in urls: try: response = self._request_webpage( @@ -87,7 +118,7 @@ def _check_format(self, video_url, video_id): continue return { 'url': url, - 'filesize': int_or_none(response.headers.get('Content-Length')) + 'filesize': int_or_none(response.headers.get('Content-Length')), } def _raise_if_restricted(self, webpage): @@ -96,10 +127,15 @@ def _raise_if_restricted(self, webpage): reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title self.raise_geo_restricted(reason) + @staticmethod + def _make_url(html): + path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href') + return urljoin('https://www.bitchute.com', path) + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) + f'https://old.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) self._raise_if_restricted(webpage) publish_date = clean_html(get_element_by_class('video-publish-date', webpage)) @@ -118,12 +154,19 @@ def _real_extract(self, url): 'Video is unavailable. 
Please make sure this video is playable in the browser ' 'before reporting this issue.', expected=True, video_id=video_id) + details = get_element_by_class('details', webpage) or '' + uploader_html = get_element_html_by_class('creator', details) or '' + channel_html = get_element_html_by_class('name', details) or '' + return { 'id': video_id, 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': clean_html(get_element_by_class('owner', webpage)), + 'uploader': clean_html(uploader_html), + 'uploader_url': self._make_url(uploader_html), + 'channel': clean_html(channel_html), + 'channel_url': self._make_url(channel_html), 'upload_date': unified_strdate(self._search_regex( r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)), 'formats': formats, @@ -131,13 +174,13 @@ def _real_extract(self, url): class BitChuteChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bitchute.com/channel/bitchute/', 'info_dict': { 'id': 'bitchute', 'title': 'BitChute', - 'description': 'md5:5329fb3866125afa9446835594a9b138', + 'description': 'md5:2134c37d64fc3a4846787c402956adac', }, 'playlist': [ { @@ -145,16 +188,18 @@ class BitChuteChannelIE(InfoExtractor): 'info_dict': { 'id': 'UGlrF9o9b-Q', 'ext': 'mp4', - 'filesize': None, 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', + 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', + 'channel': 'BitChute', + 'channel_url': 'https://www.bitchute.com/channel/bitchute/', 'duration': 16, 'view_count': int, }, - } + }, ], 'params': { 'skip_download': True, @@ -166,8 +211,11 @@ class BitChuteChannelIE(InfoExtractor): 'info_dict': { 'id': 'wV9Imujxasw9', 'title': 'Bruce MacDonald and "The Light of Darkness"', - 'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', - } + 'description': 'md5:747724ef404eebdfc04277714f81863e', + }, + }, { + 'url': 'https://old.bitchute.com/playlist/wV9Imujxasw9/', + 'only_matching': True, }] _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' @@ -182,13 +230,13 @@ class BitChuteChannelIE(InfoExtractor): 'container': 'playlist-video', 'title': 'title', 'description': 'description', - } + }, } @staticmethod def _make_url(playlist_id, playlist_type): - return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/' + return f'https://old.bitchute.com/{playlist_type}/{playlist_id}/' def _fetch_page(self, playlist_id, playlist_type, page_num): playlist_url = self._make_url(playlist_id, playlist_type) diff --git a/yt_dlp/extractor/bitwave.py b/yt_dlp/extractor/bitwave.py deleted file mode 100644 index a82cd263a7..0000000000 --- a/yt_dlp/extractor/bitwave.py +++ /dev/null @@ -1,58 +0,0 @@ -from .common import InfoExtractor - - -class BitwaveReplayIE(InfoExtractor): - IE_NAME = 'bitwave:replay' - _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$' - _TEST = { - 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr', - 'only_matching': True - } - - def _real_extract(self, url): - replay_id = self._match_id(url) - replay = self._download_json( - 
'https://api.bitwave.tv/v1/replays/' + replay_id, - replay_id - ) - - return { - 'id': replay_id, - 'title': replay['data']['title'], - 'uploader': replay['data']['name'], - 'uploader_id': replay['data']['name'], - 'url': replay['data']['url'], - 'thumbnails': [ - {'url': x} for x in replay['data']['thumbnails'] - ], - } - - -class BitwaveStreamIE(InfoExtractor): - IE_NAME = 'bitwave:stream' - _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$' - _TEST = { - 'url': 'https://bitwave.tv/doomtube', - 'only_matching': True - } - - def _real_extract(self, url): - username = self._match_id(url) - channel = self._download_json( - 'https://api.bitwave.tv/v1/channels/' + username, - username) - - formats = self._extract_m3u8_formats( - channel['data']['url'], username, - 'mp4') - - return { - 'id': username, - 'title': channel['data']['title'], - 'uploader': username, - 'uploader_id': username, - 'formats': formats, - 'thumbnail': channel['data']['thumbnail'], - 'is_live': True, - 'view_count': channel['data']['viewCount'] - } diff --git a/yt_dlp/extractor/blackboardcollaborate.py b/yt_dlp/extractor/blackboardcollaborate.py index 8f41c897ad..535890979b 100644 --- a/yt_dlp/extractor/blackboardcollaborate.py +++ b/yt_dlp/extractor/blackboardcollaborate.py @@ -47,7 +47,7 @@ def _real_extract(self, url): region = mobj.group('region') video_id = mobj.group('id') info = self._download_json( - 'https://{}.bbcollab.com/collab/api/csa/recordings/{}/data'.format(region, video_id), video_id) + f'https://{region}.bbcollab.com/collab/api/csa/recordings/{video_id}/data', video_id) duration = info.get('duration') title = info['name'] upload_date = info.get('created') diff --git a/yt_dlp/extractor/bleacherreport.py b/yt_dlp/extractor/bleacherreport.py index 8d8fabe331..71b237d4b2 100644 --- a/yt_dlp/extractor/bleacherreport.py +++ b/yt_dlp/extractor/bleacherreport.py @@ -1,13 +1,15 @@ -from .common import InfoExtractor from .amp import AMPIE +from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, parse_iso8601, + str_or_none, ) class BleacherReportIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' _TESTS = [{ 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', @@ -16,13 +18,13 @@ class BleacherReportIE(InfoExtractor): 'id': '2496438', 'ext': 'mp4', 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', - 'uploader_id': 3992341, + 'uploader_id': '3992341', 'description': 'CFB, ACC, Florida State', 'timestamp': 1434380212, 'upload_date': '20150615', 'uploader': 'Team Stream Now ', }, - 'add_ie': ['Ooyala'], + 'skip': 'Video removed', }, { 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', 'md5': '6a5cd403418c7b01719248ca97fb0692', @@ -33,7 +35,7 @@ class BleacherReportIE(InfoExtractor): 'timestamp': 1446839961, 'uploader': 'Sean Fay', 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', - 'uploader_id': 6466954, + 'uploader_id': '6466954', 'upload_date': '20151011', }, 'add_ie': ['Youtube'], @@ -42,7 +44,7 @@ class BleacherReportIE(InfoExtractor): def _real_extract(self, url): article_id = self._match_id(url) - article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + article_data = 
self._download_json(f'http://api.bleacherreport.com/api/v1/articles/{article_id}', article_id)['article'] thumbnails = [] primary_photo = article_data.get('primaryPhoto') @@ -58,7 +60,7 @@ def _real_extract(self, url): 'id': article_id, 'title': article_data['title'], 'uploader': article_data.get('author', {}).get('name'), - 'uploader_id': article_data.get('authorId'), + 'uploader_id': str_or_none(article_data.get('authorId')), 'timestamp': parse_iso8601(article_data.get('createdAt')), 'thumbnails': thumbnails, 'comment_count': int_or_none(article_data.get('commentsCount')), @@ -69,13 +71,11 @@ def _real_extract(self, url): if video: video_type = video['type'] if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'): - info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] - elif video_type == 'ooyala.com': - info['url'] = 'ooyala:%s' % video['id'] + info['url'] = 'http://bleacherreport.com/video_embed?id={}'.format(video['id']) elif video_type == 'youtube.com': info['url'] = video['id'] elif video_type == 'vine.co': - info['url'] = 'https://vine.co/v/%s' % video['id'] + info['url'] = 'https://vine.co/v/{}'.format(video['id']) else: info['url'] = video_type + video['id'] return info @@ -84,6 +84,7 @@ def _real_extract(self, url): class BleacherReportCMSIE(AMPIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', @@ -98,12 +99,12 @@ class BleacherReportCMSIE(AMPIE): }, 'expected_warnings': [ - 'Unable to download f4m manifest' - ] + 'Unable to download f4m manifest', + ], }] def _real_extract(self, url): video_id = self._match_id(url) - info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id) + info = self._extract_feed_info(f'http://vid.bleacherreport.com/videos/{video_id}.akamai') info['id'] = video_id return info diff --git a/yt_dlp/extractor/blerp.py b/yt_dlp/extractor/blerp.py index 4631ad2e97..f4f22488e9 100644 --- a/yt_dlp/extractor/blerp.py +++ b/yt_dlp/extractor/blerp.py @@ -16,7 +16,7 @@ class BlerpIE(InfoExtractor): 'uploader_id': '5fb81e51aa66ae000c395478', 'ext': 'mp3', 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], - } + }, }, { 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', 'info_dict': { @@ -25,11 +25,11 @@ class BlerpIE(InfoExtractor): 'uploader': '179617322678353920', 'uploader_id': '5ba99cf71386730004552c42', 'ext': 'mp3', - 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'] - } + 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'], + }, }] - _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_OPERATIONNAME = 'webBitePageGetBite' _GRAPHQL_QUERY = ( '''query webBitePageGetBite($_id: MongoID!) 
{ web { @@ -141,27 +141,26 @@ def _real_extract(self, url): 'operationName': self._GRAPHQL_OPERATIONNAME, 'query': self._GRAPHQL_QUERY, 'variables': { - '_id': audio_id - } + '_id': audio_id, + }, } headers = { - 'Content-Type': 'application/json' + 'Content-Type': 'application/json', } - json_result = self._download_json('https://api.blerp.com/graphql', - audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + json_result = self._download_json( + 'https://api.blerp.com/graphql', audio_id, + data=json.dumps(data).encode(), headers=headers) bite_json = json_result['data']['web']['biteById'] - info_dict = { + return { 'id': bite_json['_id'], 'url': bite_json['audio']['mp3']['url'], 'title': bite_json['title'], 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), 'ext': 'mp3', - 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) + 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None), } - - return info_dict diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py index 3d6e033042..1614b6f947 100644 --- a/yt_dlp/extractor/blogger.py +++ b/yt_dlp/extractor/blogger.py @@ -1,3 +1,4 @@ +from .common import InfoExtractor from ..utils import ( mimetype2ext, parse_duration, @@ -5,7 +6,6 @@ str_or_none, traverse_obj, ) -from .common import InfoExtractor class BloggerIE(InfoExtractor): @@ -21,14 +21,14 @@ class BloggerIE(InfoExtractor): 'ext': 'mp4', 'thumbnail': r're:^https?://.*', 'duration': 76.068, - } + }, }] def _real_extract(self, url): token_id = self._match_id(url) webpage = self._download_webpage(url, token_id) data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data') - data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id) + data = self._parse_json(data_json.encode().decode('unicode_escape'), token_id) streams = data['streams'] formats = [{ 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))), diff --git a/yt_dlp/extractor/bloomberg.py b/yt_dlp/extractor/bloomberg.py index 792155e51a..ec6b7a86eb 100644 --- a/yt_dlp/extractor/bloomberg.py +++ b/yt_dlp/extractor/bloomberg.py @@ -55,7 +55,7 @@ def _real_extract(self, url): title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( - 'http://www.bloomberg.com/multimedia/api/embed?id=%s' % video_id, video_id) + f'http://www.bloomberg.com/multimedia/api/embed?id={video_id}', video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') diff --git a/yt_dlp/extractor/bokecc.py b/yt_dlp/extractor/bokecc.py index ca326f25fa..5fe937a6ac 100644 --- a/yt_dlp/extractor/bokecc.py +++ b/yt_dlp/extractor/bokecc.py @@ -1,5 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import compat_parse_qs from ..utils import ExtractorError @@ -9,20 +10,18 @@ def _extract_bokecc_formats(self, webpage, video_id, format_id=None): r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)', webpage, 'player params', group='query') - player_params = compat_parse_qs(player_params_str) + player_params = urllib.parse.parse_qs(player_params_str) info_xml = self._download_xml( - 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + 
'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( +
'http://p.bokecc.com/servlet/playinfo?uid={}&vid={}&m=1'.format( player_params['siteid'][0], player_params['vid'][0]), video_id) - formats = [{ + return [{ 'format_id': format_id, 'url': quality.find('./copy').attrib['playurl'], 'quality': int(quality.attrib['value']), } for quality in info_xml.findall('./video/quality')] - return formats - class BokeCCIE(BokeCCBaseIE): _IE_DESC = 'CC视频' @@ -38,11 +37,11 @@ class BokeCCIE(BokeCCBaseIE): }] def _real_extract(self, url): - qs = compat_parse_qs(self._match_valid_url(url).group('query')) + qs = urllib.parse.parse_qs(self._match_valid_url(url).group('query')) if not qs.get('vid') or not qs.get('uid'): raise ExtractorError('Invalid URL', expected=True) - video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + video_id = '{}_{}'.format(qs['uid'][0], qs['vid'][0]) webpage = self._download_webpage(url, video_id) diff --git a/yt_dlp/extractor/bongacams.py b/yt_dlp/extractor/bongacams.py index bf955668df..ab85477de4 100644 --- a/yt_dlp/extractor/bongacams.py +++ b/yt_dlp/extractor/bongacams.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, try_get, @@ -38,7 +37,7 @@ def _real_extract(self, url): channel_id = mobj.group('id') amf = self._download_json( - 'https://%s/tools/amf.php' % host, channel_id, + f'https://{host}/tools/amf.php', channel_id, data=urlencode_postdata(( ('method', 'getRoomData'), ('args[]', channel_id), @@ -48,14 +47,14 @@ def _real_extract(self, url): server_url = amf['localData']['videoServerUrl'] uploader_id = try_get( - amf, lambda x: x['performerData']['username'], compat_str) or channel_id + amf, lambda x: x['performerData']['username'], str) or channel_id uploader = try_get( - amf, lambda x: x['performerData']['displayName'], compat_str) + amf, lambda x: x['performerData']['displayName'], str) like_count = int_or_none(try_get( amf, lambda x: x['performerData']['loversCount'])) formats = self._extract_m3u8_formats( - '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + f'{server_url}/hls/stream_{uploader_id}/playlist.m3u8', channel_id, 'mp4', m3u8_id='hls', live=True) return { diff --git a/yt_dlp/extractor/boosty.py b/yt_dlp/extractor/boosty.py new file mode 100644 index 0000000000..d3aab7a1a8 --- /dev/null +++ b/yt_dlp/extractor/boosty.py @@ -0,0 +1,225 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + ExtractorError, + bug_reports_message, + int_or_none, + qualities, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class BoostyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?boosty\.to/(?P<user>[^/#?]+)/posts/(?P<post_id>[^/#?]+)' + _TESTS = [{ + # single ok_video + 'url': 'https://boosty.to/kuplinov/posts/e55d050c-e3bb-4873-a7db-ac7a49b40c38', + 'info_dict': { + 'id': 'd7473824-352e-48e2-ae53-d4aa39459968', + 'title': 'phasma_3', + 'channel': 'Kuplinov', + 'channel_id': '7958701', + 'timestamp': 1655031975, + 'upload_date': '20220612', + 'release_timestamp': 1655049000, + 'release_date': '20220612', + 'modified_timestamp': 1668680993, + 'modified_date': '20221117', + 'tags': ['куплинов', 'phasmophobia'], + 'like_count': int, + 'ext': 'mp4', + 'duration': 105, + 'view_count': int, + 'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?', + }, + }, { + # multiple ok_video + 'url': 'https://boosty.to/maddyson/posts/0c652798-3b35-471f-8b48-a76a0b28736f', + 'info_dict': { + 'id': '0c652798-3b35-471f-8b48-a76a0b28736f', + 'title': 'то 
что не пропустил юта6', + 'channel': 'Илья Давыдов', + 'channel_id': '6808257', + 'timestamp': 1694017040, + 'upload_date': '20230906', + 'release_timestamp': 1694017040, + 'release_date': '20230906', + 'modified_timestamp': 1694071178, + 'modified_date': '20230907', + 'like_count': int, + }, + 'playlist_count': 3, + 'playlist': [{ + 'info_dict': { + 'id': 'cc325a9f-a563-41c6-bf47-516c1b506c9a', + 'title': 'то что не пропустил юта6', + 'channel': 'Илья Давыдов', + 'channel_id': '6808257', + 'timestamp': 1694017040, + 'upload_date': '20230906', + 'release_timestamp': 1694017040, + 'release_date': '20230906', + 'modified_timestamp': 1694071178, + 'modified_date': '20230907', + 'like_count': int, + 'ext': 'mp4', + 'duration': 31204, + 'view_count': int, + 'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?', + }, + }, { + 'info_dict': { + 'id': 'd07b0a72-9493-4512-b54e-55ce468fd4b7', + 'title': 'то что не пропустил юта6', + 'channel': 'Илья Давыдов', + 'channel_id': '6808257', + 'timestamp': 1694017040, + 'upload_date': '20230906', + 'release_timestamp': 1694017040, + 'release_date': '20230906', + 'modified_timestamp': 1694071178, + 'modified_date': '20230907', + 'like_count': int, + 'ext': 'mp4', + 'duration': 25704, + 'view_count': int, + 'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?', + }, + }, { + 'info_dict': { + 'id': '4a3bba32-78c8-422a-9432-2791aff60b42', + 'title': 'то что не пропустил юта6', + 'channel': 'Илья Давыдов', + 'channel_id': '6808257', + 'timestamp': 1694017040, + 'upload_date': '20230906', + 'release_timestamp': 1694017040, + 'release_date': '20230906', + 'modified_timestamp': 1694071178, + 'modified_date': '20230907', + 'like_count': int, + 'ext': 'mp4', + 'duration': 31867, + 'view_count': int, + 'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?', + }, + }], + }, { + # single external video (youtube) + 'url': 'https://boosty.to/denischuzhoy/posts/6094a487-bcec-4cf8-a453-43313b463c38', + 'info_dict': { + 'id': 'EXelTnve5lY', + 'title': 'Послание Президента Федеральному Собранию | Класс народа', + 'upload_date': '20210425', + 'channel': 'Денис Чужой', + 'tags': 'count:10', + 'like_count': int, + 'ext': 'mp4', + 'duration': 816, + 'view_count': int, + 'thumbnail': r're:^https://i\.ytimg\.com/', + 'age_limit': 0, + 'availability': 'public', + 'categories': list, + 'channel_follower_count': int, + 'channel_id': 'UCCzVNbWZfYpBfyofCCUD_0w', + 'channel_is_verified': bool, + 'channel_url': r're:^https://www\.youtube\.com/', + 'comment_count': int, + 'description': str, + 'heatmap': 'count:100', + 'live_status': str, + 'playable_in_embed': bool, + 'uploader': str, + 'uploader_id': str, + 'uploader_url': r're:^https://www\.youtube\.com/', + }, + }] + + _MP4_TYPES = ('tiny', 'lowest', 'low', 'medium', 'high', 'full_hd', 'quad_hd', 'ultra_hd') + + def _extract_formats(self, player_urls, video_id): + formats = [] + quality = qualities(self._MP4_TYPES) + for player_url in traverse_obj(player_urls, lambda _, v: url_or_none(v['url'])): + url = player_url['url'] + format_type = player_url.get('type') + if format_type in ('hls', 'hls_live', 'live_ondemand_hls', 'live_playback_hls'): + formats.extend(self._extract_m3u8_formats(url, video_id, m3u8_id='hls', fatal=False)) + elif format_type in ('dash', 'dash_live', 'live_playback_dash'): + formats.extend(self._extract_mpd_formats(url, video_id, mpd_id='dash', fatal=False)) + elif format_type in self._MP4_TYPES: + formats.append({ + 'url': url, + 'ext': 'mp4', + 'format_id': format_type, + 'quality': quality(format_type), + 
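# `qualities()` (used by BoostyIE's _extract_formats above) turns an ordered
# worst-to-best list into a ranking callable, so format preference falls out
# of list position. A small self-contained illustration:
from yt_dlp.utils import qualities

rank = qualities(('tiny', 'low', 'high', 'ultra_hd'))
assert rank('high') > rank('low')  # later entries rank higher
assert rank('unknown') == -1       # unlisted names sort below everything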
}) + else: + self.report_warning(f'Unknown format type: {format_type!r}') + return formats + + def _real_extract(self, url): + user, post_id = self._match_valid_url(url).group('user', 'post_id') + + auth_headers = {} + auth_cookie = self._get_cookies('https://boosty.to/').get('auth') + if auth_cookie is not None: + try: + auth_data = json.loads(urllib.parse.unquote(auth_cookie.value)) + auth_headers['Authorization'] = f'Bearer {auth_data["accessToken"]}' + except (json.JSONDecodeError, KeyError): + self.report_warning(f'Failed to extract token from auth cookie{bug_reports_message()}') + + post = self._download_json( + f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id, + note='Downloading post data', errnote='Unable to download post data', headers=auth_headers) + + post_title = post.get('title') + if not post_title: + self.report_warning('Unable to extract post title. Falling back to parsing html page') + webpage = self._download_webpage(url, video_id=post_id) + post_title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) + + common_metadata = { + 'title': post_title, + **traverse_obj(post, { + 'channel': ('user', 'name', {str}), + 'channel_id': ('user', 'id', {str_or_none}), + 'timestamp': ('createdAt', {int_or_none}), + 'release_timestamp': ('publishTime', {int_or_none}), + 'modified_timestamp': ('updatedAt', {int_or_none}), + 'tags': ('tags', ..., 'title', {str}), + 'like_count': ('count', 'likes', {int_or_none}), + }), + } + entries = [] + for item in traverse_obj(post, ('data', ..., {dict})): + item_type = item.get('type') + if item_type == 'video' and url_or_none(item.get('url')): + entries.append(self.url_result(item['url'], YoutubeIE)) + elif item_type == 'ok_video': + video_id = item.get('id') or post_id + entries.append({ + 'id': video_id, + 'formats': self._extract_formats(item.get('playerUrls'), video_id), + **common_metadata, + **traverse_obj(item, { + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('viewsCounter', {int_or_none}), + 'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}), + }, get_all=False)}) + + if not entries and not post.get('hasAccess'): + self.raise_login_required('This post requires a subscription', metadata_available=True) + elif not entries: + raise ExtractorError('No videos found', expected=True) + if len(entries) == 1: + return entries[0] + return self.playlist_result(entries, post_id, post_title, **common_metadata) diff --git a/yt_dlp/extractor/booyah.py b/yt_dlp/extractor/booyah.py deleted file mode 100644 index 5c55f2c765..0000000000 --- a/yt_dlp/extractor/booyah.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none, str_or_none, traverse_obj - - -class BooyahBaseIE(InfoExtractor): - _BOOYAH_SESSION_KEY = None - - def _real_initialize(self): - BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage( - 'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key') - - def _get_comments(self, video_id): - comment_json = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id, - headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {} - - return [{ - 'id': comment.get('comment_id'), - 'author': comment.get('from_nickname'), - 'author_id': comment.get('from_uid'), - 'author_thumbnail': comment.get('from_thumbnail'), - 'text': comment.get('content'), - 'timestamp': comment.get('create_time'), - 'like_count': 
comment.get('like_cnt'), - } for comment in comment_json.get('comment_list') or ()] - - -class BooyahClipsIE(BooyahBaseIE): - _VALID_URL = r'https?://booyah.live/clips/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://booyah.live/clips/13887261322952306617', - 'info_dict': { - 'id': '13887261322952306617', - 'ext': 'mp4', - 'view_count': int, - 'duration': 30, - 'channel_id': 90565760, - 'like_count': int, - 'title': 'Cayendo con estilo 😎', - 'uploader': '♡LɪꜱGΛ​MER​', - 'comment_count': int, - 'uploader_id': '90565760', - 'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg', - 'upload_date': '20220617', - 'timestamp': 1655490556, - 'modified_timestamp': 1655490556, - 'modified_date': '20220617', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json( - f'https://booyah.live/api/v3/playbacks/{video_id}', video_id, - headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY}) - - formats = [] - for video_data in json_data['playback']['endpoint_list']: - formats.extend(({ - 'url': video_data.get('stream_url'), - 'ext': 'mp4', - 'height': video_data.get('resolution'), - }, { - 'url': video_data.get('download_url'), - 'ext': 'mp4', - 'format_note': 'Watermarked', - 'height': video_data.get('resolution'), - 'preference': -10, - })) - - return { - 'id': video_id, - 'title': traverse_obj(json_data, ('playback', 'name')), - 'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')), - 'formats': formats, - 'view_count': traverse_obj(json_data, ('playback', 'views')), - 'like_count': traverse_obj(json_data, ('playback', 'likes')), - 'duration': traverse_obj(json_data, ('playback', 'duration')), - 'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')), - 'channel_id': traverse_obj(json_data, ('playback', 'channel_id')), - 'uploader': traverse_obj(json_data, ('user', 'nickname')), - 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), - 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), - 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), - '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), - } diff --git a/yt_dlp/extractor/bostonglobe.py b/yt_dlp/extractor/bostonglobe.py index 92f8ea2cb4..f5b8196788 100644 --- a/yt_dlp/extractor/bostonglobe.py +++ b/yt_dlp/extractor/bostonglobe.py @@ -1,7 +1,6 @@ import re from .common import InfoExtractor - from ..utils import ( extract_attributes, ) @@ -58,8 +57,7 @@ def _real_extract(self, url): if video_id and account_id and player_id and embed: entries.append( - 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' - % (account_id, player_id, embed, video_id)) + f'http://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}') if len(entries) == 0: return self.url_result(url, 'Generic') diff --git a/yt_dlp/extractor/box.py b/yt_dlp/extractor/box.py index 8ab149626b..f06339f701 100644 --- a/yt_dlp/extractor/box.py +++ b/yt_dlp/extractor/box.py @@ -1,45 +1,82 @@ import json +import urllib.parse from .common import InfoExtractor from ..utils import ( - determine_ext, + ExtractorError, parse_iso8601, - # try_get, update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class BoxIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' - _TEST = { + _VALID_URL = 
r'https?://(?:[^.]+\.)?(?P<service>app|ent)\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?' + _TESTS = [{ 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', 'info_dict': { 'id': '510727257538', 'ext': 'mp4', 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', - 'uploader': 'MLS Video', + 'uploader': '', 'timestamp': 1566320259, 'upload_date': '20190820', 'uploader_id': '235196876', - } - } + }, + 'params': {'skip_download': 'dash fragment too small'}, + }, { + 'url': 'https://utexas.app.box.com/s/2x6vanv85fdl8j2eqlcxmv0gp1wvps6e', + 'info_dict': { + 'id': '787379022466', + 'ext': 'mp4', + 'title': 'Webinar recording: Take the Leap!.mp4', + 'uploader': 'Patricia Mosele', + 'timestamp': 1615824864, + 'upload_date': '20210315', + 'uploader_id': '239068974', + }, + 'params': {'skip_download': 'dash fragment too small'}, + }, { + 'url': 'https://thejacksonlaboratory.ent.box.com/s/2x09dm6vcg6y28o0oox1so4l0t8wzt6l/file/1536173056065', + 'info_dict': { + 'id': '1536173056065', + 'ext': 'mp4', + 'uploader_id': '18523128264', + 'uploader': 'Lexi Hennigan', + 'title': 'iPSC Symposium recording part 1.mp4', + 'timestamp': 1716228343, + 'upload_date': '20240520', + }, + 'params': {'skip_download': 'dash fragment too small'}, + }] def _real_extract(self, url): - shared_name, file_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, file_id) - request_token = self._parse_json(self._search_regex( - r'Box\.config\s*=\s*({.+?});', webpage, - 'Box config'), file_id)['requestToken'] + shared_name, file_id, service = self._match_valid_url(url).group('shared_name', 'id', 'service') + webpage = self._download_webpage(url, file_id or shared_name) + + if not file_id: + post_stream_data = self._search_json( + r'Box\.postStreamData\s*=', webpage, 'Box post-stream data', shared_name) + shared_item = traverse_obj( + post_stream_data, ('/app-api/enduserapp/shared-item', {dict})) or {} + if shared_item.get('itemType') != 'file': + raise ExtractorError('The requested resource is not a file', expected=True) + + file_id = str(shared_item['itemID']) + + request_token = self._search_json( + r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken'] access_token = self._download_json( - 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + f'https://{service}.box.com/app-api/enduserapp/elements/tokens', file_id, 'Downloading token JSON metadata', data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ 'Content-Type': 'application/json', 'X-Request-Token': request_token, 'X-Box-EndUser-API': 'sharedName=' + shared_name, })[file_id]['read'] - shared_link = 'https://app.box.com/s/' + shared_name + shared_link = f'https://{service}.box.com/s/{shared_name}' f = self._download_json( 'https://api.box.com/2.0/files/' + file_id, file_id, 'Downloading file JSON metadata', headers={ @@ -47,37 +84,26 @@ def _real_extract(self, url): 'BoxApi': 'shared_link=' + shared_link, 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats }, query={ - 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size', }) title = f['name'] query = { 'access_token': access_token, - 'shared_link': shared_link + 'shared_link': shared_link, } formats = [] - # for entry in (try_get(f, lambda x: 
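# BoxIE above resolves a share page to a file ID via Box.postStreamData, then
# trades the page's requestToken for a read token scoped to that file. The
# token request, reduced to its shape (token and name values are placeholders,
# not real credentials):
import json

payload = json.dumps({'fileIDs': ['510727257538']}).encode()
headers = {
    'Content-Type': 'application/json',
    'X-Request-Token': 'REQUEST_TOKEN',            # from Box.config on the page
    'X-Box-EndUser-API': 'sharedName=SHARED_NAME',  # from the share URL
}
# The extractor POSTs this to https://{service}.box.com/app-api/enduserapp/elements/tokens
# and indexes the JSON response as [file_id]['read'] to get the access token.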
x['representations']['entries'], list) or []): - # entry_url_template = try_get( - # entry, lambda x: x['content']['url_template']) - # if not entry_url_template: - # continue - # representation = entry.get('representation') - # if representation == 'dash': - # TODO: append query to every fragment URL - # formats.extend(self._extract_mpd_formats( - # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), - # file_id, query=query)) - - authenticated_download_url = f.get('authenticated_download_url') - if authenticated_download_url and f.get('is_download_available'): - formats.append({ - 'ext': f.get('extension') or determine_ext(title), - 'filesize': f.get('size'), - 'format_id': 'download', - 'url': update_url_query(authenticated_download_url, query), - }) + for url_tmpl in traverse_obj(f, ( + 'representations', 'entries', lambda _, v: v['representation'] == 'dash', + 'content', 'url_template', {url_or_none}, + )): + manifest_url = update_url_query(url_tmpl.replace('{+asset_path}', 'manifest.mpd'), query) + fmts = self._extract_mpd_formats(manifest_url, file_id) + for fmt in fmts: + fmt['extra_param_to_segment_url'] = urllib.parse.urlparse(manifest_url).query + formats.extend(fmts) creator = f.get('created_by') or {} diff --git a/yt_dlp/extractor/boxcast.py b/yt_dlp/extractor/boxcast.py index 51f9eb7873..efa66994aa 100644 --- a/yt_dlp/extractor/boxcast.py +++ b/yt_dlp/extractor/boxcast.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - js_to_json, - traverse_obj, - unified_timestamp -) +from ..utils import js_to_json, traverse_obj, unified_timestamp class BoxCastVideoIE(InfoExtractor): @@ -25,7 +21,7 @@ class BoxCastVideoIE(InfoExtractor): 'release_date': '20221210', 'uploader_id': 're8w0v8hohhvpqtbskpe', 'uploader': 'Children\'s Health Defense', - } + }, }, { 'url': 'https://boxcast.tv/video-portal/vctwevwntun3o0ikq7af/rvyblnn0fxbfjx5nwxhl/otbpltj2kzkveo2qz3ad', 'info_dict': { @@ -34,8 +30,8 @@ class BoxCastVideoIE(InfoExtractor): 'uploader_id': 'vctwevwntun3o0ikq7af', 'uploader': 'Legacy Christian Church', 'title': 'The Quest | 1: Beginner\'s Bay | Jamie Schools', - 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg' - } + 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg', + }, }, { 'url': 'https://boxcast.tv/channel/z03fqwaeaby5lnaawox2?b=ssihlw5gvfij2by8tkev', 'info_dict': { @@ -48,7 +44,7 @@ class BoxCastVideoIE(InfoExtractor): 'uploader': 'Lighthouse Ministries International - Beltsville, Maryland', 'description': 'md5:ac23e3d01b0b0be592e8f7fe0ec3a340', 'title': 'New Year\'s Eve CROSSOVER Service at LHMI | December 31, 2022', - } + }, }] _WEBPAGE_TESTS = [{ 'url': 'https://childrenshealthdefense.eu/live-stream/', @@ -61,7 +57,7 @@ class BoxCastVideoIE(InfoExtractor): 'release_date': '20221210', 'uploader_id': 're8w0v8hohhvpqtbskpe', 'uploader': 'Children\'s Health Defense', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py index f28e581b87..7fe0899449 100644 --- a/yt_dlp/extractor/bpb.py +++ b/yt_dlp/extractor/bpb.py @@ -1,56 +1,170 @@ +import functools import re from .common import InfoExtractor from ..utils import ( + clean_html, + extract_attributes, + get_element_text_and_html_by_tag, + get_elements_by_class, + join_nonempty, js_to_json, - determine_ext, + mimetype2ext, + unified_strdate, + url_or_none, + urljoin, + variadic, ) +from ..utils.traversal import traverse_obj + + +def html_get_element(tag=None, cls=None): + assert tag or cls, 'One of tag or 
class is required' + + if cls: + func = functools.partial(get_elements_by_class, cls, tag=tag) + else: + func = functools.partial(get_element_text_and_html_by_tag, tag) + + def html_get_element_wrapper(html): + return variadic(func(html))[0] + + return html_get_element_wrapper class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P<id>\d+)(?:[/?#]|$)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', - 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 'info_dict': { 'id': '297', 'ext': 'mp4', + 'creator': 'Kooperative Berlin', + 'description': 'md5:f4f75885ba009d3e2b156247a8941ce6', + 'release_date': '20160115', + 'series': 'Interview auf dem Geschichtsforum 1989 | 2009', + 'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'], + 'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D', 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', - 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/', + 'info_dict': { + 'id': '522184', + 'ext': 'mp4', + 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)', + 'description': 'md5:f83c795ff8f825a69456a9e51fc15903', + 'release_date': '20230621', + 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'], + 'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB', + 'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/', + 'info_dict': { + 'id': '518789', + 'ext': 'mp4', + 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)', + 'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8', + 'release_date': '20230302', + 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'], + 'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D', + 'title': 'md5:3e956f264bb501f6383f10495a401da4', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/', + 'only_matching': True, + }, { + 'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/', + 'info_dict': { + 'id': '315813', + 'ext': 'mp3', + 'creator': 'Axel Schröder', + 'description': 'md5:eda9d1af34e5912efef5baf54fba4427', + 'release_date': '20200921', + 'series': 'Auf Endlagersuche. 
Der deutsche Weg zu einem sicheren Atommülllager', + 'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'], + 'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94', + 'title': 'Folge 1: Eine Einführung', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/', + 'info_dict': { + 'id': '517806', + 'ext': 'mp3', + 'creator': 'Bundeszentrale für politische Bildung', + 'description': 'md5:594689600e919912aade0b2871cc3fed', + 'release_date': '20230127', + 'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"', + 'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'], + 'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0', + 'title': 'Die Weltanschauung der "Neuen Rechten"', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/', + 'only_matching': True, + }] + + _TITLE_RE = re.compile('(?P<title>[^<]*)<[^>]+>(?P<series>[^<]*)') + + def _parse_vue_attributes(self, name, string, video_id): + attributes = extract_attributes(self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name)) + + for key, value in attributes.items(): + if key.startswith(':'): + attributes[key] = self._parse_json(value, video_id, transform_source=js_to_json, fatal=False) + + return attributes + + @staticmethod + def _process_source(source): + url = url_or_none(source['src']) + if not url: + return None + + source_type = source.get('type', '') + extension = mimetype2ext(source_type) + is_video = source_type.startswith('video') + note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None + + return { + 'url': url, + 'ext': extension, + 'vcodec': None if is_video else 'none', + 'quality': 10 if note == 'high' else 0, + 'format_note': note, + 'format_id': join_nonempty(extension, note), } - } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<h2 class="white">(.*?)</h2>', webpage, 'title') - video_info_dicts = re.findall( - r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) - - formats = [] - for video_info in video_info_dicts: - video_info = self._parse_json( - video_info, video_id, transform_source=js_to_json, fatal=False) - if not video_info: - continue - video_url = video_info.get('src') - if not video_url: - continue - quality = 'high' if '_high' in video_url else 'low' - formats.append({ - 'url': video_url, - 'quality': 10 if quality == 'high' else 0, - 'format_note': quality, - 'format_id': '%s-%s' % (quality, determine_ext(video_url)), - }) + title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match})) + json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False)) return { 'id': video_id, - 'formats': formats, - 'title': title, - 'description': self._og_search_description(webpage), + 'title': traverse_obj(title_result, ('title', {str.strip})) or None, + # This metadata could be interpreted otherwise, but it fits "series" the most + 'series': traverse_obj(title_result, ('series', {str.strip})) or None, + 
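# _parse_vue_attributes above reads Vue-style bound attributes (':sources' on
# <bpb-player>), whose values are JS literals rather than strict JSON. A
# reduced sketch of the idea, using a made-up tag string:
import json
from yt_dlp.utils import extract_attributes, js_to_json

tag = '<bpb-player :sources=\'[{src: "a.mp4", type: "video/mp4"}]\' poster="p.jpg">'
attrs = extract_attributes(tag)
sources = json.loads(js_to_json(attrs[':sources']))  # unquoted JS keys -> valid JSON
assert sources[0]['src'] == 'a.mp4'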
'description': join_nonempty(*traverse_obj(webpage, [( + {html_get_element(cls='opening-intro')}, + [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}], + ), {clean_html}]), delim='\n\n') or None, + 'creator': self._html_search_meta('author', webpage), + 'uploader': self._html_search_meta('publisher', webpage), + 'release_date': unified_strdate(self._html_search_meta('date', webpage)), + 'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)), + **traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), { + 'formats': (':sources', ..., {self._process_source}), + 'thumbnail': ('poster', {lambda x: urljoin(url, x)}), + }), } diff --git a/yt_dlp/extractor/br.py b/yt_dlp/extractor/br.py index 309452d23e..0568e06f68 100644 --- a/yt_dlp/extractor/br.py +++ b/yt_dlp/extractor/br.py @@ -1,18 +1,15 @@ -import json - from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, int_or_none, parse_duration, - parse_iso8601, xpath_element, xpath_text, ) class BRIE(InfoExtractor): + _WORKING = False IE_DESC = 'Bayerischer Rundfunk' _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' @@ -64,7 +61,7 @@ class BRIE(InfoExtractor): 'title': 'Umweltbewusster Häuslebauer', 'description': 'md5:d52dae9792d00226348c1dbb13c9bae2', 'duration': 116, - } + }, }, { 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', @@ -77,7 +74,7 @@ class BRIE(InfoExtractor): 'duration': 893, 'uploader': 'Eva Maria Steimle', 'upload_date': '20170208', - } + }, }, ] @@ -145,7 +142,7 @@ def _extract_formats(self, assets, media_id): http_format_info = format_info.copy() http_format_info.update({ 'url': format_url, - 'format_id': 'http-%s' % asset_type, + 'format_id': f'http-{asset_type}', }) formats.append(http_format_info) server_prefix = xpath_text(asset, 'serverPrefix') @@ -154,7 +151,7 @@ def _extract_formats(self, assets, media_id): rtmp_format_info.update({ 'url': server_prefix, 'play_path': xpath_text(asset, 'fileName'), - 'format_id': 'rtmp-%s' % asset_type, + 'format_id': f'rtmp-{asset_type}', }) formats.append(rtmp_format_info) return formats @@ -167,142 +164,3 @@ def _extract_thumbnails(self, variants, base_url): } for variant in variants.findall('variant') if xpath_text(variant, 'url')] thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails - - -class BRMediathekIE(InfoExtractor): - IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})' - - _TESTS = [{ - 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', - 'md5': 'fdc3d485835966d1622587d08ba632ec', - 'info_dict': { - 'id': 'av:5a1e6a6e8fce6d001871cc8e', - 'ext': 'mp4', - 'title': 'Die Sendung vom 28.11.2017', - 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', - 'timestamp': 1511942766, - 'upload_date': '20171129', - } - }, { - 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12', - 'only_matching': True, - }] - - def _real_extract(self, url): - clip_id = self._match_id(url) - - clip = self._download_json( - 'https://proxy-base.master.mango.express/graphql', - clip_id, data=json.dumps({ - "query": """{ - viewer { - clip(id: "%s") { - title - description - duration - createdAt - ageRestriction - videoFiles { - edges { - node { - 
publicLocation - fileSize - videoProfile { - width - height - bitrate - encoding - } - } - } - } - captionFiles { - edges { - node { - publicLocation - } - } - } - teaserImages { - edges { - node { - imageFiles { - edges { - node { - publicLocation - width - height - } - } - } - } - } - } - } - } -}""" % clip_id}).encode(), headers={ - 'Content-Type': 'application/json', - })['data']['viewer']['clip'] - title = clip['title'] - - formats = [] - for edge in clip.get('videoFiles', {}).get('edges', []): - node = edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - ext = determine_ext(n_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - n_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - video_profile = node.get('videoProfile', {}) - tbr = int_or_none(video_profile.get('bitrate')) - format_id = 'http' - if tbr: - format_id += '-%d' % tbr - formats.append({ - 'format_id': format_id, - 'url': n_url, - 'width': int_or_none(video_profile.get('width')), - 'height': int_or_none(video_profile.get('height')), - 'tbr': tbr, - 'filesize': int_or_none(node.get('fileSize')), - }) - - subtitles = {} - for edge in clip.get('captionFiles', {}).get('edges', []): - node = edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - subtitles.setdefault('de', []).append({ - 'url': n_url, - }) - - thumbnails = [] - for edge in clip.get('teaserImages', {}).get('edges', []): - for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []): - node = image_edge.get('node', {}) - n_url = node.get('publicLocation') - if not n_url: - continue - thumbnails.append({ - 'url': n_url, - 'width': int_or_none(node.get('width')), - 'height': int_or_none(node.get('height')), - }) - - return { - 'id': clip_id, - 'title': title, - 'description': clip.get('description'), - 'duration': int_or_none(clip.get('duration')), - 'timestamp': parse_iso8601(clip.get('createdAt')), - 'age_limit': int_or_none(clip.get('ageRestriction')), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - } diff --git a/yt_dlp/extractor/brainpop.py b/yt_dlp/extractor/brainpop.py new file mode 100644 index 0000000000..df10299a0c --- /dev/null +++ b/yt_dlp/extractor/brainpop.py @@ -0,0 +1,318 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + classproperty, + int_or_none, + traverse_obj, + urljoin, +) + + +class BrainPOPBaseIE(InfoExtractor): + _NETRC_MACHINE = 'brainpop' + _ORIGIN = '' # So that _VALID_URL doesn't crash + _LOGIN_ERRORS = { + 1502: 'The username and password you entered did not match.', # LOGIN_FAILED + 1503: 'Payment method is expired.', # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE + 1506: 'Your BrainPOP plan has expired.', # LOGIN_FAILED_ACCOUNT_EXPIRED + 1507: 'Terms not accepted.', # LOGIN_FAILED_TERMS_NOT_ACCEPTED + 1508: 'Account not activated.', # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE + 1512: 'The maximum number of devices permitted are logged in with your account right now.', # LOGIN_FAILED_LOGIN_LIMIT_REACHED + 1513: 'You are trying to access your account from outside of its allowed IP range.', # LOGIN_FAILED_INVALID_IP + 1514: 'Individual accounts are not included in your plan. 
Try again with your shared username and password.',  # LOGIN_FAILED_MBP_DISABLED
+        1515: 'Account not activated.',  # LOGIN_FAILED_TEACHER_NOT_ACTIVE
+        1523: 'That username and password won\'t work on this BrainPOP site.',  # LOGIN_FAILED_NO_ACCESS
+        1524: 'You\'ll need to join a class before you can login.',  # LOGIN_FAILED_STUDENT_NO_PERIOD
+        1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.',  # LOGIN_FAILED_ACCOUNT_LOCKED
+    }
+
+    @classproperty
+    def _VALID_URL(cls):
+        root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?')
+        return rf'{root}/(?P<slug>[^/]+/[^/]+/(?P<id>[^/?#&]+))'
+
+    def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}):
+        formats = self._extract_m3u8_formats(
+            f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}',
+            display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False)
+        formats.append({
+            'format_id': format_id,
+            'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}',
+        })
+        for f in formats:
+            f.update(extra_fields)
+        return formats
+
+    def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}):
+        formats = []
+        additional_key_formats = {
+            '%s': {},
+            'ad_%s': {
+                'format_note': 'Audio description',
+                'source_preference': -2,
+            },
+        }
+        for additional_key_format, additional_key_fields in additional_key_formats.items():
+            for key_quality, key_index in enumerate(('high', 'low')):
+                full_key_index = additional_key_format % (key_format % key_index)
+                if data.get(full_key_index):
+                    formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, {
+                        'quality': -1 - key_quality,
+                        **additional_key_fields,
+                        **extra_fields,
+                    }))
+        return formats
+
+    def _perform_login(self, username, password):
+        login_res = self._download_json(
+            'https://api.brainpop.com/api/login', None,
+            data=json.dumps({'username': username, 'password': password}).encode(),
+            headers={
+                'Content-Type': 'application/json',
+                'Referer': self._ORIGIN,
+            }, note='Logging in', errnote='Unable to log in', expected_status=400)
+        status_code = int_or_none(login_res['status_code'])
+        if status_code != 1505:
+            self.report_warning('Unable to login: {}'.format(
+                self._LOGIN_ERRORS.get(status_code) or login_res.get('message')
+                or f'got status code {status_code}'))
+
+
+class BrainPOPIE(BrainPOPBaseIE):
+    _ORIGIN = 'https://www.brainpop.com'
+    _VIDEO_URL = 'https://svideos.brainpop.com'
+    _HLS_URL = 'https://hls.brainpop.com'
+    _CDN_URL = 'https://cdn.brainpop.com'
+    _TESTS = [{
+        'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null',
+        'md5': '3ead374233ae74c7f1b0029a01c972f0',
+        'info_dict': {
+            'id': '1f3259fa457292b4',
+            'ext': 'mp4',
+            'title': 'Martin Luther King, Jr.',
+            'display_id': 'martinlutherkingjr',
+            'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349',
+        },
+    }, {
+        'url': 'https://www.brainpop.com/science/space/bigbang/',
+        'md5': '9a1ff0e77444dd9e437354eb669c87ec',
+        'info_dict': {
+            'id': 'acae52cd48c99acf',
+            'ext': 'mp4',
+            'title': 'Big Bang',
+            'display_id': 'bigbang',
+            'description': 'md5:3e53b766b0f116f631b13f4cae185d38',
+        },
+        'skip': 'Requires login',
+    }]
+
+    def _real_extract(self, url):
+        slug, display_id = self._match_valid_url(url).group('slug', 'id')
+        movie_data = self._download_json(
+            f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id,
+            'Downloading movie data JSON', 'Unable to download movie data')['data']
+
topic_data = traverse_obj(self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id, + 'Downloading topic data JSON', 'Unable to download topic data', fatal=False), + ('data', 'topic'), expected_type=dict) or movie_data['topic'] + + if not traverse_obj(movie_data, ('access', 'allow')): + reason = traverse_obj(movie_data, ('access', 'reason')) + if 'logged' in reason: + self.raise_login_required(reason, metadata_available=True) + else: + self.raise_no_formats(reason, video_id=display_id) + movie_feature = movie_data['feature'] + movie_feature_data = movie_feature['data'] + + formats, subtitles = [], {} + formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', { + 'language': movie_feature.get('language') or 'en', + 'language_preference': 10, + })) + for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items(): + formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', { + 'language': lang, + 'language_preference': -10, + })) + + # TODO: Do localization fields also have subtitles? + for name, url in movie_feature_data.items(): + lang = self._search_regex( + r'^subtitles_(?P<lang>\w+)$', name, 'subtitle metadata', default=None) + if lang and url: + subtitles.setdefault(lang, []).append({ + 'url': urljoin(self._CDN_URL, url), + }) + + return { + 'id': topic_data['topic_id'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BrainPOPLegacyBaseIE(BrainPOPBaseIE): + def _parse_js_topic_data(self, topic_data, display_id, token): + movie_data = topic_data['movies'] + # TODO: Are there non-burned subtitles? 
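# The subtitles loop in BrainPOPIE above buckets tracks by language with
# dict.setdefault, the shape yt-dlp expects ({lang: [{'url': ...}, ...]}).
# A reduced illustration with made-up data (the real code matches keys with
# a ^subtitles_(?P<lang>\w+)$ regex):
subtitles = {}
for name, sub_url in {'subtitles_en': '/en.vtt', 'subtitles_fr': '/fr.vtt'}.items():
    lang = name.rpartition('_')[2]
    subtitles.setdefault(lang, []).append({'url': sub_url})
assert subtitles['en'] == [{'url': '/en.vtt'}]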
+ formats = self._extract_adaptive_formats(movie_data, token, display_id) + + return { + 'id': topic_data['EntryID'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'alt_title': topic_data.get('title'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + } + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + webpage = self._download_webpage(url, display_id) + topic_data = self._search_json( + r'var\s+content\s*=\s*', webpage, 'content data', + display_id, end_pattern=';')['category']['unit']['topic'] + token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token') + return self._parse_js_topic_data(topic_data, display_id, token) + + +class BrainPOPJrIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://jr.brainpop.com' + _VIDEO_URL = 'https://svideos-jr.brainpop.com' + _HLS_URL = 'https://hls-jr.brainpop.com' + _CDN_URL = 'https://cdn-jr.brainpop.com' + _TESTS = [{ + 'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/', + 'md5': '04e0561bb21770f305a0ce6cf0d869ab', + 'info_dict': { + 'id': '347', + 'ext': 'mp4', + 'title': 'Emotions', + 'display_id': 'emotions', + }, + }, { + 'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/', + 'md5': 'b0ed063bbd1910df00220ee29340f5d6', + 'info_dict': { + 'id': '29', + 'ext': 'mp4', + 'title': 'Arctic Habitats', + 'display_id': 'arctichabitats', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPELLIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://ell.brainpop.com' + _VIDEO_URL = 'https://svideos-esl.brainpop.com' + _HLS_URL = 'https://hls-esl.brainpop.com' + _CDN_URL = 'https://cdn-esl.brainpop.com' + _TESTS = [{ + 'url': 'https://ell.brainpop.com/level1/unit1/lesson1/', + 'md5': 'a2012700cfb774acb7ad2e8834eed0d0', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': 'Lesson 1', + 'display_id': 'lesson1', + 'alt_title': 'Personal Pronouns', + }, + }, { + 'url': 'https://ell.brainpop.com/level3/unit6/lesson5/', + 'md5': 'be19c8292c87b24aacfb5fda2f3f8363', + 'info_dict': { + 'id': '101', + 'ext': 'mp4', + 'title': 'Lesson 5', + 'display_id': 'lesson5', + 'alt_title': 'Review: Unit 6', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPEspIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Español' + _ORIGIN = 'https://esp.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/mx' + _TESTS = [{ + 'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/', + 'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9', + 'info_dict': { + 'id': '3893', + 'ext': 'mp4', + 'title': 'Ecosistemas', + 'display_id': 'ecosistemas', + 'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3', + }, + }, { + 'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/', + 'md5': '98c1b9559e0e33777209c425cda7dac4', + 'info_dict': { + 'id': '7146', + 'ext': 'mp4', + 'title': 'Emily Dickinson', + 'display_id': 'emily_dickinson', + 'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPFrIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Français' + _ORIGIN = 'https://fr.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/fr' + _TESTS = [{ + 'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/', + 'md5': '97e7f48af8af93f8a2be11709f239371', + 'info_dict': { + 'id': '1651', + 'ext': 'mp4', + 
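# BrainPOPLegacyBaseIE above pulls `var content = {...};` out of the page with
# _search_json and an explicit end_pattern. The equivalent parse in plain
# stdlib terms, with a hypothetical page snippet (InfoExtractor._search_json
# does this more robustly, balancing braces and allowing transforms):
import json
import re

page = 'var content = {"category": {"unit": 1}};'
raw = re.search(r'var\s+content\s*=\s*(\{.*?\})\s*;', page).group(1)
assert json.loads(raw)['category']['unit'] == 1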
'title': 'Sources d\'énergie', + 'display_id': 'sourcesdenergie', + 'description': 'md5:7eece350f019a21ef9f64d4088b2d857', + }, + }, { + 'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/', + 'md5': '0cf2b4f89804d0dd4a360a51310d445a', + 'info_dict': { + 'id': '5803', + 'ext': 'mp4', + 'title': 'Plagiat', + 'display_id': 'plagiat', + 'description': 'md5:4496d87127ace28e8b1eda116e77cd2b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPIlIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Hebrew' + _ORIGIN = 'https://il.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/he' + _TESTS = [{ + 'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/', + 'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641', + 'info_dict': { + 'id': '3782', + 'ext': 'mp4', + 'title': 'md5:e993632fcda0545d9205602ec314ad67', + 'display_id': 'subjects_3782', + 'description': 'md5:4cc084a8012beb01f037724423a4d4ed', + }, + }] diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py index d4895848e0..ec72f0d884 100644 --- a/yt_dlp/extractor/bravotv.py +++ b/yt_dlp/extractor/bravotv.py @@ -1,117 +1,189 @@ -import re - from .adobepass import AdobePassIE +from ..networking import HEADRequest from ..utils import ( - smuggle_url, - update_url_query, - int_or_none, + extract_attributes, float_or_none, - try_get, - dict_get, + get_element_html_by_class, + int_or_none, + merge_dicts, + parse_age_limit, + remove_end, + str_or_none, + traverse_obj, + unescapeHTML, + unified_timestamp, + update_url_query, + url_or_none, ) class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', 'info_dict': { - 'id': 'epL0pmK1kQlT', + 'id': '3923059', 'ext': 'mp4', 'title': 'The Top Chef Season 16 Winner Is...', 'description': 'Find out who takes the title of Top Chef!', - 'uploader': 'NBCU-BRAV', 'upload_date': '20190314', 'timestamp': 1552591860, 'season_number': 16, 'episode_number': 15, 'series': 'Top Chef', 'episode': 'The Top Chef Season 16 Winner Is...', - 'duration': 190.0, - } + 'duration': 190.357, + 'season': 'Season 16', + 'thumbnail': r're:^https://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, + 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling', + 'info_dict': { + 'id': '9000234570', + 'ext': 'mp4', + 'title': 'London Calling', + 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759', + 'upload_date': '20230310', + 'timestamp': 1678410000, + 'season_number': 20, + 'episode_number': 1, + 'series': 'Top Chef', + 'episode': 'London Calling', + 'duration': 3266.03, + 'season': 'Season 20', + 'chapters': 'count:7', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night', + 'info_dict': { + 'id': '3692045', + 'ext': 'mp4', + 'title': 'Closing Night', + 'description': 'md5:3170065c5c2f19548d72a4cbc254af63', + 'upload_date': '20180401', + 'timestamp': 1522623600, + 'season_number': 1, + 
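# BravoTVIE's reworked _VALID_URL above captures the brand as a named `site`
# group and later unpacks both groups at once. The same multi-group unpack in
# plain `re`, using the pattern and test URL from the diff:
import re

m = re.match(
    r'https?://(?:www\.)?(?P<site>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)',
    'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is')
site, display_id = m.group('site', 'id')
assert (site, display_id) == ('bravotv', 'the-top-chef-season-16-winner-is')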
'episode_number': 1, + 'series': 'In Ice Cold Blood', + 'episode': 'Closing Night', + 'duration': 2629.051, + 'season': 'Season 1', + 'chapters': 'count:6', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', }, { 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'info_dict': { + 'id': '3974019', + 'ext': 'mp4', + 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5', + 'upload_date': '20190617', + 'timestamp': 1560790800, + 'season_number': 2, + 'episode_number': 16, + 'series': 'In Ice Cold Blood', + 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', + 'duration': 68.235, + 'season': 'Season 2', + 'thumbnail': r're:^https://.+\.jpg', + 'age_limit': 14, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, }] def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() + site, display_id = self._match_valid_url(url).group('site', 'id') webpage = self._download_webpage(url, display_id) - settings = self._parse_json(self._search_regex( - r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), - display_id) - info = {} + settings = self._search_json( + r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) + tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') query = { - 'mbr': 'true', + 'manifest': 'm3u', + 'formats': 'm3u,mpeg4', } - account_pid, release_pid = [None] * 2 - tve = settings.get('ls_tve') + if tve: - query['manifest'] = 'm3u' - mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) - if mobj: - account_pid, tp_path = mobj.groups() - release_pid = tp_path.strip('/').split('/')[-1] - else: - account_pid = 'HNK2IC' - tp_path = release_pid = tve['release_pid'] - if tve.get('entitlement') == 'auth': - adobe_pass = settings.get('tve_adobe_auth', {}) - if site == 'bravotv': - site = 'bravo' + account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC' + account_id = tve['data-mpx-media-account-id'] + metadata = self._parse_json( + tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML) + video_id = tve.get('data-guid') or metadata['guid'] + if tve.get('data-entitlement') == 'auth': + auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {} + site = remove_end(site, 'tv') + release_pid = tve['data-release-pid'] resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId') or site, - tve['title'], release_pid, tve.get('rating')) - query['auth'] = self._extract_mvpd_auth( - url, release_pid, - adobe_pass.get('adobePassRequestorId') or site, resource) + tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site, + tve['data-title'], release_pid, tve.get('data-rating')) + query.update({ + 'switch': 'HLSServiceSecure', + 'auth': self._extract_mvpd_auth( + url, release_pid, auth.get('adobePassRequestorId') or site, resource), + }) + else: - shared_playlist = settings['ls_playlist'] - account_pid = shared_playlist['account_pid'] - metadata = 
shared_playlist['video_metadata'][shared_playlist['default_clip']] - tp_path = release_pid = metadata.get('release_pid') - if not release_pid: - release_pid = metadata['guid'] - tp_path = 'media/guid/2140479951/' + release_pid - info.update({ - 'title': metadata['title'], - 'description': metadata.get('description'), - 'season_number': int_or_none(metadata.get('season_num')), - 'episode_number': int_or_none(metadata.get('episode_num')), - }) - query['switch'] = 'progressive' - - tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path) + ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {} + account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B' + account_id = ls_playlist['mpxMediaAccountId'] + video_id = ls_playlist['defaultGuid'] + metadata = traverse_obj( + ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False) + tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}' tp_metadata = self._download_json( - update_url_query(tp_url, {'format': 'preview'}), - display_id, fatal=False) - if tp_metadata: - info.update({ - 'title': tp_metadata.get('title'), - 'description': tp_metadata.get('description'), - 'duration': float_or_none(tp_metadata.get('duration'), 1000), - 'season_number': int_or_none( - dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))), - 'episode_number': int_or_none( - dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))), - # For some reason the series is sometimes wrapped into a single element array. - 'series': try_get( - dict_get(tp_metadata, ('pl1$show', 'nbcu$show')), - lambda x: x[0] if isinstance(x, list) else x, - expected_type=str), - 'episode': dict_get( - tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')), - }) + update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'id': release_pid, - 'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}), - 'ie_key': 'ThePlatform', - }) - return info + seconds_or_none = lambda x: float_or_none(x, 1000) + chapters = traverse_obj(tp_metadata, ('chapters', ..., { + 'start_time': ('startTime', {seconds_or_none}), + 'end_time': ('endTime', {seconds_or_none}), + })) + # prune pointless single chapters that span the entire duration from short videos + if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): + chapters = None + + m3u8_url = self._request_webpage(HEADRequest( + update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url + if 'mpeg_cenc' in m3u8_url: + self.report_drm(video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + **merge_dicts(traverse_obj(tp_metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {seconds_or_none}), + 'timestamp': ('pubDate', {seconds_or_none}), + 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), + 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), + 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}), + 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}), + 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}), + }, get_all=False), traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 
'duration': ('durationInSeconds', {int_or_none}), + 'timestamp': ('airDate', {unified_timestamp}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode': 'episodeTitle', + 'series': 'show', + })), + } diff --git a/yt_dlp/extractor/breakcom.py b/yt_dlp/extractor/breakcom.py deleted file mode 100644 index 00cf308c7a..0000000000 --- a/yt_dlp/extractor/breakcom.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..utils import ( - int_or_none, - url_or_none, -) - - -class BreakIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' - _TESTS = [{ - 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', - 'info_dict': { - 'id': '2468056', - 'ext': 'mp4', - 'title': 'When Girls Act Like D-Bags', - 'age_limit': 13, - }, - }, { - # youtube embed - 'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work', - 'info_dict': { - 'id': 'RrrDLdeL2HQ', - 'ext': 'mp4', - 'title': 'Whale Watching Boat Crashing Into San Diego Dock', - 'description': 'md5:afc1b2772f0a8468be51dd80eb021069', - 'upload_date': '20160331', - 'uploader': 'Steve Holden', - 'uploader_id': 'sdholden07', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - - webpage = self._download_webpage(url, display_id) - - youtube_url = YoutubeIE._extract_url(webpage) - if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - - content = self._parse_json( - self._search_regex( - r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage, - 'content'), - display_id) - - formats = [] - for video in content: - video_url = url_or_none(video.get('url')) - if not video_url: - continue - bitrate = int_or_none(self._search_regex( - r'(\d+)_kbps', video_url, 'tbr', default=None)) - formats.append({ - 'url': video_url, - 'format_id': 'http-%d' % bitrate if bitrate else 'http', - 'tbr': bitrate, - }) - - title = self._search_regex( - (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value') - - def get(key, name): - return int_or_none(self._search_regex( - r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name, - default=None)) - - age_limit = get('ratings', 'age limit') - video_id = video_id or get('pid', 'video id') or display_id - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index ea0a59c866..fedf4772a9 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -2,7 +2,7 @@ class BreitBartIE(InfoExtractor): - _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?breitbart\.com/videos/v/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', @@ -13,7 +13,7 @@ class BreitBartIE(InfoExtractor): 'description': 'md5:bac35eb0256d1cb17f517f54c79404d5', 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg', 'age_limit': 0, - } + }, }, { 'url': 
'https://www.breitbart.com/videos/v/eaiZjVOn/', 'only_matching': True, @@ -30,5 +30,5 @@ def _real_extract(self, url): 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': self._rta_search(webpage), - 'formats': formats + 'formats': formats, } diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 2b7ddcae8d..2526f25dac 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -1,25 +1,24 @@ import base64 import re import struct +import urllib.parse import xml.etree.ElementTree from .adobepass import AdobePassIE from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_HTTPError, - compat_parse_qs, - compat_urlparse, -) +from ..compat import compat_etree_fromstring +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, + UnsupportedError, clean_html, dict_get, extract_attributes, - ExtractorError, find_xpath_attr, fix_xml_ampersands, float_or_none, int_or_none, + join_nonempty, js_to_json, mimetype2ext, parse_iso8601, @@ -29,7 +28,6 @@ try_get, unescapeHTML, unsmuggle_url, - UnsupportedError, update_url_query, url_or_none, ) @@ -142,7 +140,7 @@ class BrightcoveLegacyIE(InfoExtractor): # from http://www.un.org/chinese/News/story.asp?NewsID=27724 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', 'only_matching': True, # Tested in GenericIE - } + }, ] _WEBPAGE_TESTS = [{ @@ -315,7 +313,7 @@ def _build_brightcove_url(cls, object_str): object_str = fix_xml_ampersands(object_str) try: - object_doc = compat_etree_fromstring(object_str.encode('utf-8')) + object_doc = compat_etree_fromstring(object_str.encode()) except xml.etree.ElementTree.ParseError: return @@ -323,7 +321,7 @@ def _build_brightcove_url(cls, object_str): if fv_el is not None: flashvars = dict( (k, v[0]) - for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + for k, v in urllib.parse.parse_qs(fv_el.attrib['value']).items()) else: flashvars = {} @@ -340,32 +338,32 @@ def find_param(name): params = {} - playerID = find_param('playerID') or find_param('playerId') - if playerID is None: + player_id = find_param('playerID') or find_param('playerId') + if player_id is None: raise ExtractorError('Cannot find player ID') - params['playerID'] = playerID + params['playerID'] = player_id - playerKey = find_param('playerKey') + player_key = find_param('playerKey') # Not all pages define this value - if playerKey is not None: - params['playerKey'] = playerKey + if player_key is not None: + params['playerKey'] = player_key # These fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') - if videoPlayer is not None: - if isinstance(videoPlayer, list): - videoPlayer = videoPlayer[0] - videoPlayer = videoPlayer.strip() + video_player = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') + if video_player is not None: + if isinstance(video_player, list): + video_player = video_player[0] + video_player = video_player.strip() # UUID is also possible for videoPlayer (e.g. 
# http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd # or http://www8.hp.com/cn/zh/home.html) if not (re.match( r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', - videoPlayer) or videoPlayer.startswith('ref:')): + video_player) or video_player.startswith('ref:')): return None - params['@videoPlayer'] = videoPlayer - linkBase = find_param('linkBaseURL') - if linkBase is not None: - params['linkBaseURL'] = linkBase + params['@videoPlayer'] = video_player + link_base = find_param('linkBaseURL') + if link_base is not None: + params['linkBaseURL'] = link_base return cls._make_brightcove_url(params) @classmethod @@ -389,7 +387,7 @@ def _build_brightcove_url_from_js(cls, object_js): @classmethod def _make_brightcove_url(cls, params): return update_url_query( - 'http://c.brightcove.com/services/viewer/htmlFederated', params) + 'https://c.brightcove.com/services/viewer/htmlFederated', params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -448,13 +446,13 @@ def _real_extract(self, url): url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) mobj = self._match_valid_url(url) query_str = mobj.group('query') - query = compat_urlparse.parse_qs(query_str) + query = urllib.parse.parse_qs(query_str) - videoPlayer = query.get('@videoPlayer') - if videoPlayer: + video_player = query.get('@videoPlayer') + if video_player: # We set the original url as the default 'Referer' header referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url) - video_id = videoPlayer[0] + video_id = video_player[0] if 'playerID' not in query: mobj = re.search(r'/bcpid(\d+)', url) if mobj is not None: @@ -473,7 +471,7 @@ def _real_extract(self, url): if referer: headers['Referer'] = referer player_page = self._download_webpage( - 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + 'https://link.brightcove.com/services/player/bcpid' + player_id[0], video_id, headers=headers, fatal=False) if player_page: player_key = self._search_regex( @@ -483,7 +481,7 @@ def _real_extract(self, url): enc_pub_id = player_key.split(',')[1].replace('~', '=') publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] if publisher_id: - brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + brightcove_new_url = f'https://players.brightcove.net/{publisher_id}/default_default/index.html?videoId={video_id}' if referer: brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer}) return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) @@ -541,12 +539,7 @@ def _parse_brightcove_metadata(self, json_data, video_id, headers={}): }) def build_format_id(kind): - format_id = kind - if tbr: - format_id += '-%dk' % int(tbr) - if height: - format_id += '-%dp' % height - return format_id + return join_nonempty(kind, tbr and f'{int(tbr)}k', height and f'{height}p') if src or streaming_src: f.update({ @@ -575,6 +568,7 @@ def build_format_id(kind): self.raise_no_formats( error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + headers.pop('Authorization', None) # or else http formats will give error 400 for f in formats: f.setdefault('http_headers', {}).update(headers) @@ -653,7 +647,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): 'params': { # m3u8 download 'skip_download': True, - } + }, }, { # playlist stream 'url': 
'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', @@ -665,7 +659,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): 'params': { # m3u8 download 'skip_download': True, - } + }, }, { 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001', 'only_matching': True, @@ -803,7 +797,7 @@ def _extract_brightcove_urls(ie, webpage): # Look for iframe embeds [1] for _, url in re.findall( r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url if url.startswith('http') else 'http:' + url) + entries.append(url if url.startswith(('http:', 'https:')) else 'https:' + url) # Look for <video> tags [2] and embed_in_page embeds [3] # [2] looks like: @@ -832,8 +826,7 @@ def _extract_brightcove_urls(ie, webpage): player_id = player_id or attrs.get('data-player') or 'default' embed = embed or attrs.get('data-embed') or 'default' - bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( - account_id, player_id, embed, video_id) + bc_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}' # Some brightcove videos may be embedded with video tag only and # without script tag or any mentioning of brightcove at all. Such @@ -864,13 +857,13 @@ def _real_extract(self, url): account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups() - policy_key_id = '%s_%s' % (account_id, player_id) + policy_key_id = f'{account_id}_{player_id}' policy_key = self.cache.load('brightcove', policy_key_id) policy_key_extracted = False store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): - base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) + base_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/' config = self._download_json( base_url + 'config.json', video_id, fatal=False) or {} policy_key = try_get( @@ -895,8 +888,9 @@ def extract_policy_key(): store_pk(policy_key) return policy_key - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) - headers = {} + token = smuggled_data.get('token') + api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}' + headers = {'Authorization': f'Bearer {token}'} if token else {} referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key if referrer: headers.update({ @@ -908,13 +902,13 @@ def extract_policy_key(): if not policy_key: policy_key = extract_policy_key() policy_key_extracted = True - headers['Accept'] = 'application/json;pk=%s' % policy_key + headers['Accept'] = f'application/json;pk={policy_key}' try: json_data = self._download_json(api_url, video_id, headers=headers) break except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - json_data = self._parse_json(e.cause.read().decode(), video_id)[0] + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): + json_data = self._parse_json(e.cause.response.read().decode(), video_id)[0] message = json_data.get('message') or json_data['error_code'] if json_data.get('error_subcode') == 'CLIENT_GEO': self.raise_geo_restricted(msg=message) @@ -934,7 +928,7 @@ def extract_policy_key(): custom_fields['bcadobepassresourceid']) json_data = self._download_json( api_url, video_id, 
headers={ - 'Accept': 'application/json;pk=%s' % policy_key + 'Accept': f'application/json;pk={policy_key}', }, query={ 'tveToken': tve_token, }) diff --git a/yt_dlp/extractor/brilliantpala.py b/yt_dlp/extractor/brilliantpala.py new file mode 100644 index 0000000000..950a70a5e1 --- /dev/null +++ b/yt_dlp/extractor/brilliantpala.py @@ -0,0 +1,136 @@ +import hashlib + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + urlencode_postdata, +) + + +class BrilliantpalaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'brilliantpala' + _DOMAIN = '{subdomain}.brilliantpala.org' + + def _initialize_pre_login(self): + self._HOMEPAGE = f'https://{self._DOMAIN}' + self._LOGIN_API = f'{self._HOMEPAGE}/login/' + self._LOGOUT_DEVICES_API = f'{self._HOMEPAGE}/logout_devices/?next=/' + self._CONTENT_API = f'{self._HOMEPAGE}/api/v2.4/contents/{{content_id}}/' + self._HLS_AES_URI = f'{self._HOMEPAGE}/api/v2.5/video_contents/{{content_id}}/key/' + + def _get_logged_in_username(self, url, video_id): + webpage, urlh = self._download_webpage_handle(url, video_id) + if urlh.url.startswith(self._LOGIN_API): + self.raise_login_required() + return self._html_search_regex( + r'"username"\s*:\s*"(?P<username>[^"]+)"', webpage, 'logged-in username') + + def _perform_login(self, username, password): + login_page, urlh = self._download_webpage_handle( + self._LOGIN_API, None, 'Downloading login page', expected_status=401) + if urlh.status != 401 and not urlh.url.startswith(self._LOGIN_API): + self.write_debug('Cookies are valid, no login required.') + return + + if urlh.status == 401: + self.write_debug('Got HTTP Error 401; cookies have been invalidated') + login_page = self._download_webpage(self._LOGIN_API, None, 'Re-downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ + 'username': username, + 'password': password, + }) + self._set_cookie(self._DOMAIN, 'csrftoken', login_form['csrfmiddlewaretoken']) + + logged_page = self._download_webpage( + self._LOGIN_API, None, note='Logging in', headers={'Referer': self._LOGIN_API}, + data=urlencode_postdata(login_form)) + + if self._html_search_regex( + r'(Your username / email and password)', logged_page, 'auth fail', default=None): + raise ExtractorError('wrong username or password', expected=True) + + # the maximum number of logins is one + if self._html_search_regex( + r'(Logout Other Devices)', logged_page, 'logout devices button', default=None): + logout_device_form = self._hidden_inputs(logged_page) + self._download_webpage( + self._LOGOUT_DEVICES_API, None, headers={'Referer': self._LOGIN_API}, + note='Logging out other devices', data=urlencode_postdata(logout_device_form)) + + def _real_extract(self, url): + course_id, content_id = self._match_valid_url(url).group('course_id', 'content_id') + video_id = f'{course_id}-{content_id}' + + username = self._get_logged_in_username(url, video_id) + + content_json = self._download_json( + self._CONTENT_API.format(content_id=content_id), video_id, + note='Fetching content info', errnote='Unable to fetch content info') + + entries = [] + for stream in traverse_obj(content_json, ('video', 'streams', lambda _, v: v['id'] and v['url'])): + formats = self._extract_m3u8_formats(stream['url'], video_id, fatal=False) + if not formats: + continue + entries.append({ + 'id': str(stream['id']), + 'title': content_json.get('title'), + 'formats': formats, + 'hls_aes': {'uri': self._HLS_AES_URI.format(content_id=content_id)}, + 'http_headers': {'X-Key': 
hashlib.sha256(username.encode('ascii')).hexdigest()}, + 'thumbnail': content_json.get('cover_image'), + }) + + return self.playlist_result( + entries, playlist_id=video_id, playlist_title=content_json.get('title')) + + +class BrilliantpalaElearnIE(BrilliantpalaBaseIE): + IE_NAME = 'Brilliantpala:Elearn' + IE_DESC = 'VoD on elearn.brilliantpala.org' + _VALID_URL = r'https?://elearn\.brilliantpala\.org/courses/(?P<course_id>\d+)/contents/(?P<content_id>\d+)/?' + _TESTS = [{ + 'url': 'https://elearn.brilliantpala.org/courses/42/contents/12345/', + 'only_matching': True, + }, { + 'url': 'https://elearn.brilliantpala.org/courses/98/contents/36683/', + 'info_dict': { + 'id': '23577', + 'ext': 'mp4', + 'title': 'Physical World, Units and Measurements - 1', + 'thumbnail': 'https://d1j3vi2u94ebt0.cloudfront.net/institute/brilliantpalalms/chapter_contents/26237/e657f81b90874be19795c7ea081f8d5c.png', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + }, + }] + + _DOMAIN = BrilliantpalaBaseIE._DOMAIN.format(subdomain='elearn') + + +class BrilliantpalaClassesIE(BrilliantpalaBaseIE): + IE_NAME = 'Brilliantpala:Classes' + IE_DESC = 'VoD on classes.brilliantpala.org' + _VALID_URL = r'https?://classes\.brilliantpala\.org/courses/(?P<course_id>\d+)/contents/(?P<content_id>\d+)/?' + _TESTS = [{ + 'url': 'https://classes.brilliantpala.org/courses/42/contents/12345/', + 'only_matching': True, + }, { + 'url': 'https://classes.brilliantpala.org/courses/416/contents/25445/', + 'info_dict': { + 'id': '9128', + 'ext': 'mp4', + 'title': 'Motion in a Straight Line - Class 1', + 'thumbnail': 'https://d3e4y8hquds3ek.cloudfront.net/institute/brilliantpalaelearn/chapter_contents/ff5ba838d0ec43419f67387fe1a01fa8.png', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + }, + }] + + _DOMAIN = BrilliantpalaBaseIE._DOMAIN.format(subdomain='classes') diff --git a/yt_dlp/extractor/bundesliga.py b/yt_dlp/extractor/bundesliga.py index e76dd58ddb..29f8f94157 100644 --- a/yt_dlp/extractor/bundesliga.py +++ b/yt_dlp/extractor/bundesliga.py @@ -16,17 +16,17 @@ class BundesligaIE(InfoExtractor): 'upload_date': '20220928', 'duration': 146, 'timestamp': 1664366511, - 'description': 'md5:803d4411bd134140c774021dd4b7598b' - } + 'description': 'md5:803d4411bd134140c774021dd4b7598b', + }, }, { 'url': 'https://www.bundesliga.com/en/bundesliga/videos/latest-features/T8IKc8TX?vid=ROHjs06G', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.bundesliga.com/en/bundesliga/videos/goals?vid=mOG56vWA', - 'only_matching': True - } + 'only_matching': True, + }, ] def _real_extract(self, url): diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py new file mode 100644 index 0000000000..71f7726659 --- /dev/null +++ b/yt_dlp/extractor/bundestag.py @@ -0,0 +1,123 @@ +import functools +import re + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + bug_reports_message, + clean_html, + format_field, + get_element_text_and_html_by_tag, + int_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class BundestagIE(InfoExtractor): + _VALID_URL = [ + r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)', + r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)', + ] + _TESTS = [{ + 'url': 'https://dbtg.tv/cvid/7605304', + 'info_dict': { + 'id': '7605304', + 'ext': 'mp4', + 'title': '145. 
Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit', + 'description': 'md5:321a9dc6bdad201264c0045efc371561', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek', + 'info_dict': { + 'id': '7602120', + 'ext': 'mp4', + 'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung', + 'description': 'Befragung der Bundesregierung', + }, + }, { + 'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek', + 'only_matching': True, + }, { + 'url': 'http://dbtg.tv/fvid/3594346', + 'only_matching': True, + }] + + _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay' + _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8' + + _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId=' + _SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)' + _SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)' + + def _bt_extract_share_formats(self, video_id): + share_data = self._download_json( + f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON') + if traverse_obj(share_data, ('status', 'code', {int})) != 1: + self.report_warning(format_field( + share_data, [('status', 'message', {str})], + 'Share API response: %s', default='Unknown Share API Error') + + bug_reports_message()) + return + + for name, url in share_data.items(): + if not isinstance(name, str) or not url_or_none(url): + continue + + elif name.startswith('audio'): + match = re.search(self._SHARE_AUDIO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + 'vcodec': 'none', + **traverse_obj(match, { + 'acodec': 'codec', + 'audio_channels': ('channels', {{'mono': 1, 'stereo': 2}.get}), + 'abr': ('bitrate', {int_or_none}), + 'ext': 'ext', + }), + } + + elif name.startswith('download'): + match = re.search(self._SHARE_VIDEO_REGEX, url) + yield { + 'format_id': name, + 'url': url, + **traverse_obj(match, { + 'vcodec': 'codec', + 'tbr': ('bitrate', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'ext': 'ext', + }), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [] + result = {'id': video_id, 'formats': formats} + + try: + formats.extend(self._extract_m3u8_formats( + self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance')) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + raise ExtractorError('Could not find video id', expected=True) + self.report_warning(f'Error extracting hls formats: {error}', video_id) + formats.extend(self._bt_extract_share_formats(video_id)) + if not formats: + self.raise_no_formats('Could not find suitable formats', video_id=video_id) + + result.update(traverse_obj(self._download_webpage( + self._OVERLAY_URL, video_id, + query={'videoid': video_id, 'view': 'main'}, + note='Downloading metadata overlay', fatal=False, + ), { + 'title': ( + {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0, + {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}), + 'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + })) + + return result diff --git 
a/yt_dlp/extractor/businessinsider.py b/yt_dlp/extractor/businessinsider.py index 4b3f5e68b8..7cb9af692a 100644 --- a/yt_dlp/extractor/businessinsider.py +++ b/yt_dlp/extractor/businessinsider.py @@ -10,7 +10,7 @@ class BusinessInsiderIE(InfoExtractor): 'info_dict': { 'id': 'cjGDb0X9', 'ext': 'mp4', - 'title': "Bananas give you more radiation exposure than living next to a nuclear power plant", + 'title': 'Bananas give you more radiation exposure than living next to a nuclear power plant', 'description': 'md5:0175a3baf200dd8fa658f94cade841b3', 'upload_date': '20160611', 'timestamp': 1465675620, @@ -41,5 +41,5 @@ def _real_extract(self, url): r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'), webpage, 'jwplatform id') return self.url_result( - 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + f'jwplatform:{jwplatform_id}', ie=JWPlatformIE.ie_key(), video_id=video_id) diff --git a/yt_dlp/extractor/buzzfeed.py b/yt_dlp/extractor/buzzfeed.py index b30a3b7ae2..9847095bcf 100644 --- a/yt_dlp/extractor/buzzfeed.py +++ b/yt_dlp/extractor/buzzfeed.py @@ -23,8 +23,8 @@ class BuzzFeedIE(InfoExtractor): 'upload_date': '20141024', 'uploader_id': 'Buddhanz1', 'uploader': 'Angry Ram', - } - }] + }, + }], }, { 'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia', 'params': { @@ -45,7 +45,7 @@ class BuzzFeedIE(InfoExtractor): 'uploader_id': 'CindysMunchkin', 'uploader': 're:^Munchkin the', }, - }] + }], }, { 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', 'info_dict': { diff --git a/yt_dlp/extractor/byutv.py b/yt_dlp/extractor/byutv.py index 9ed6efe799..e9796f7dab 100644 --- a/yt_dlp/extractor/byutv.py +++ b/yt_dlp/extractor/byutv.py @@ -8,9 +8,9 @@ class BYUtvIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' 
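# A note on the hunk above (an annotation, not part of the recorded diff):
# `_WORKING = False` is how yt-dlp marks an extractor as currently broken, so
# users get a warning rather than a silent failure, and the `(?!event/)`
# negative lookahead in `_VALID_URL` keeps `/watch/event/...` pages from
# matching. A minimal sketch, assuming standard `re` semantics:
#
#   import re
#   pat = re.compile(r'byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)')
#   assert pat.search('https://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d')
#   assert not pat.search('https://www.byutv.org/watch/event/6587b9a3')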
_TESTS = [{ - # ooyalaVOD 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', @@ -24,7 +24,6 @@ class BYUtvIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { # dvr 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', @@ -37,7 +36,7 @@ class BYUtvIE(InfoExtractor): 'duration': 11645, }, 'params': { - 'skip_download': True + 'skip_download': True, }, }, { 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', @@ -63,19 +62,6 @@ def _real_extract(self, url): 'x-byutv-platformkey': 'xsaaw9c7y5', }) - ep = video.get('ooyalaVOD') - if ep: - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ep['providerId'], - 'id': video_id, - 'display_id': display_id, - 'title': ep.get('title'), - 'description': ep.get('description'), - 'thumbnail': ep.get('imageThumbnail'), - } - info = {} formats = [] subtitles = {} diff --git a/yt_dlp/extractor/c56.py b/yt_dlp/extractor/c56.py index e4b1c9a84c..6264803dd6 100644 --- a/yt_dlp/extractor/c56.py +++ b/yt_dlp/extractor/c56.py @@ -38,7 +38,7 @@ def _real_extract(self, url): return self.url_result(sohu_video_info['url'], 'Sohu') page = self._download_json( - 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') + f'http://vxml.56.com/json/{text_id}/', text_id, 'Downloading video info') info = page['info'] @@ -46,7 +46,7 @@ def _real_extract(self, url): { 'format_id': f['type'], 'filesize': int(f['filesize']), - 'url': f['url'] + 'url': f['url'], } for f in info['rfiles'] ] diff --git a/yt_dlp/extractor/cableav.py b/yt_dlp/extractor/cableav.py deleted file mode 100644 index 2e374e5eba..0000000000 --- a/yt_dlp/extractor/cableav.py +++ /dev/null @@ -1,32 +0,0 @@ -from .common import InfoExtractor - - -class CableAVIE(InfoExtractor): - _VALID_URL = r'https://cableav\.tv/(?P<id>[a-zA-Z0-9]+)' - _TESTS = [{ - 'url': 'https://cableav.tv/lS4iR9lWjN8/', - 'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18', - 'info_dict': { - 'id': 'lS4iR9lWjN8', - 'ext': 'mp4', - 'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV', - 'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._og_search_video_url(webpage, secure=False) - - formats = self._extract_m3u8_formats(video_url, video_id, 'mp4') - - return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats, - } diff --git a/yt_dlp/extractor/caffeinetv.py b/yt_dlp/extractor/caffeinetv.py new file mode 100644 index 0000000000..aa107f8585 --- /dev/null +++ b/yt_dlp/extractor/caffeinetv.py @@ -0,0 +1,74 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + traverse_obj, + urljoin, +) + + +class CaffeineTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?caffeine\.tv/[^/?#]+/video/(?P<id>[\da-f-]+)' + _TESTS = [{ + 'url': 'https://www.caffeine.tv/TsuSurf/video/cffc0a00-e73f-11ec-8080-80017d29f26e', + 'info_dict': { + 'id': 'cffc0a00-e73f-11ec-8080-80017d29f26e', + 'ext': 'mp4', + 'title': 'GOOOOD MORNINNNNN #highlights', + 'timestamp': 1654702180, + 'upload_date': '20220608', + 'uploader': 
'RahJON Wicc', + 'uploader_id': 'TsuSurf', + 'duration': 3145, + 'age_limit': 17, + 'thumbnail': 'https://www.caffeine.tv/broadcasts/776b6f84-9cd5-42e3-af1d-4a776eeed697/replay/lobby.jpg', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'tags': ['highlights', 'battlerap'], + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + f'https://api.caffeine.tv/social/public/activity/{video_id}', video_id) + broadcast_info = traverse_obj(json_data, ('broadcast_info', {dict})) or {} + + video_url = broadcast_info['video_url'] + ext = determine_ext(video_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(video_url, video_id, 'mp4') + else: + formats = [{'url': video_url}] + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(json_data, { + 'like_count': ('like_count', {int_or_none}), + 'view_count': ('view_count', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + 'tags': ('tags', ..., {str}, {lambda x: x or None}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': (((None, 'user'), 'username'), {str}, any), + 'is_live': ('is_live', {bool}), + }), + **traverse_obj(broadcast_info, { + 'title': ('broadcast_title', {str}), + 'duration': ('content_duration', {int_or_none}), + 'timestamp': ('broadcast_start_time', {parse_iso8601}), + 'thumbnail': ('preview_image_path', {lambda x: urljoin(url, x)}), + }), + 'age_limit': { + # assume Apple Store ratings: https://en.wikipedia.org/wiki/Mobile_software_content_rating_system + 'FOUR_PLUS': 0, + 'NINE_PLUS': 9, + 'TWELVE_PLUS': 12, + 'SEVENTEEN_PLUS': 17, + }.get(broadcast_info.get('content_rating'), 17), + } diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index c77179c7bb..b7061a7d14 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -29,8 +29,8 @@ class CallinIE(InfoExtractor): 'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553', 'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions', 'episode_number': 1, - 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd' - } + 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd', + }, }, { 'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', 'md5': '14ede27ee2c957b7e4db93140fc0745c', @@ -54,7 +54,7 @@ class CallinIE(InfoExtractor): 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png', 'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5', 'timestamp': 1662100688.005, - } + }, }, { 'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA', 'md5': '16f704ddbf82a27e3930533b12062f07', @@ -78,7 +78,7 @@ class CallinIE(InfoExtractor): 'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png', 'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c', 'timestamp': 1661476708.282, - } + }, }] def try_get_user_name(self, d): @@ -94,7 +94,7 @@ def _real_extract(self, url): next_data = self._search_nextjs_data(webpage, display_id) episode = next_data['props']['pageProps']['episode'] - id = episode['id'] + video_id = episode['id'] title = episode.get('title') or self._generic_title('', webpage) url = episode['m3u8'] 
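# The `id` -> `video_id` rename in the hunk above (repeated in canalalpha
# further down) avoids shadowing Python's built-in `id()`. A hypothetical
# illustration of the hazard, not code from this diff:
#
#   id = episode['id']   # the name `id` now hides the builtin in this scope
#   id(title)            # TypeError: 'str' object is not callable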
formats = self._extract_m3u8_formats(url, display_id, ext='ts') @@ -125,11 +125,11 @@ def _real_extract(self, url): episode_list = traverse_obj(show_json, ('pageProps', 'show', 'episodes')) or [] episode_number = next( - (len(episode_list) - i for (i, e) in enumerate(episode_list) if e.get('id') == id), + (len(episode_list) - i for i, e in enumerate(episode_list) if e.get('id') == video_id), None) return { - 'id': id, + 'id': video_id, '_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])], 'display_id': display_id, 'title': title, @@ -151,5 +151,5 @@ def _real_extract(self, url): 'series_id': show_id, 'episode': title, 'episode_number': episode_number, - 'episode_id': id + 'episode_id': video_id, } diff --git a/yt_dlp/extractor/caltrans.py b/yt_dlp/extractor/caltrans.py index f4a4a834b8..5513bb2dfa 100644 --- a/yt_dlp/extractor/caltrans.py +++ b/yt_dlp/extractor/caltrans.py @@ -11,7 +11,7 @@ class CaltransIE(InfoExtractor): 'title': 'US-50 : Sacramento : Hwy 50 at 24th', 'live_status': 'is_live', 'thumbnail': 'https://cwwp2.dot.ca.gov/data/d3/cctv/image/hwy50at24th/hwy50at24th.jpg', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/cam4.py b/yt_dlp/extractor/cam4.py index 2650cc1ef1..0d0dccb794 100644 --- a/yt_dlp/extractor/cam4.py +++ b/yt_dlp/extractor/cam4.py @@ -12,12 +12,12 @@ class CAM4IE(InfoExtractor): 'age_limit': 18, 'live_status': 'is_live', 'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss', - } + }, } def _real_extract(self, url): channel_id = self._match_id(url) - m3u8_playlist = self._download_json('https://www.cam4.com/rest/v1.0/profile/{}/streamInfo'.format(channel_id), channel_id).get('cdnURL') + m3u8_playlist = self._download_json(f'https://www.cam4.com/rest/v1.0/profile/{channel_id}/streamInfo', channel_id).get('cdnURL') formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) diff --git a/yt_dlp/extractor/camdemy.py b/yt_dlp/extractor/camdemy.py index c7079e4224..34dc095af8 100644 --- a/yt_dlp/extractor/camdemy.py +++ b/yt_dlp/extractor/camdemy.py @@ -1,10 +1,7 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils import ( clean_html, parse_duration, @@ -28,7 +25,7 @@ class CamdemyIE(InfoExtractor): 'duration': 1591, 'upload_date': '20130114', 'view_count': int, - } + }, }, { # With non-empty description # webpage returns "No permission or not login" @@ -42,7 +39,7 @@ class CamdemyIE(InfoExtractor): 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', 'duration': 318, - } + }, }, { # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', @@ -76,12 +73,12 @@ def _real_extract(self, url): title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] - video_folder = compat_urlparse.urljoin(thumb_url, 'video/') + video_folder = urllib.parse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( - compat_urlparse.urljoin(video_folder, 'fileList.xml'), + urllib.parse.urljoin(video_folder, 'fileList.xml'), video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text - video_url = compat_urlparse.urljoin(video_folder, file_name) + video_url = urllib.parse.urljoin(video_folder, file_name) # Some URLs return "No permission or not login" in a webpage despite being # freely available via oembed JSON URL (e.g. 
http://www.camdemy.com/media/13885) @@ -117,35 +114,35 @@ class CamdemyFolderIE(InfoExtractor): 'id': '450', 'title': '信號與系統 2012 & 2011 (Signals and Systems)', }, - 'playlist_mincount': 145 + 'playlist_mincount': 145, }, { # links without trailing slash # and multi-page 'url': 'http://www.camdemy.com/folder/853', 'info_dict': { 'id': '853', - 'title': '科學計算 - 使用 Matlab' + 'title': '科學計算 - 使用 Matlab', }, - 'playlist_mincount': 20 + 'playlist_mincount': 20, }, { # with displayMode parameter. For testing the codes to add parameters 'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg', 'info_dict': { 'id': '853', - 'title': '科學計算 - 使用 Matlab' + 'title': '科學計算 - 使用 Matlab', }, - 'playlist_mincount': 20 + 'playlist_mincount': 20, }] def _real_extract(self, url): folder_id = self._match_id(url) # Add displayMode=list so that all links are displayed in a single page - parsed_url = list(compat_urlparse.urlparse(url)) - query = dict(compat_urlparse.parse_qsl(parsed_url[4])) + parsed_url = list(urllib.parse.urlparse(url)) + query = dict(urllib.parse.parse_qsl(parsed_url[4])) query.update({'displayMode': 'list'}) - parsed_url[4] = compat_urllib_parse_urlencode(query) - final_url = compat_urlparse.urlunparse(parsed_url) + parsed_url[4] = urllib.parse.urlencode(query) + final_url = urllib.parse.urlunparse(parsed_url) page = self._download_webpage(final_url, folder_id) matches = re.findall(r"href='(/media/\d+/?)'", page) diff --git a/yt_dlp/extractor/camfm.py b/yt_dlp/extractor/camfm.py new file mode 100644 index 0000000000..6036f136fd --- /dev/null +++ b/yt_dlp/extractor/camfm.py @@ -0,0 +1,85 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + get_elements_by_class, + join_nonempty, + traverse_obj, + unified_timestamp, + urljoin, +) + + +class CamFMShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camfm\.co\.uk/shows/(?P<id>[^/]+)' + _TESTS = [{ + 'playlist_mincount': 5, + 'url': 'https://camfm.co.uk/shows/soul-mining/', + 'info_dict': { + 'id': 'soul-mining', + 'thumbnail': 'md5:6a873091f92c936f23bdcce80f75e66a', + 'title': 'Soul Mining', + 'description': 'Telling the stories of jazz, funk and soul from all corners of the world.', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + page = self._download_webpage(url, show_id) + + return { + '_type': 'playlist', + 'id': show_id, + 'entries': [self.url_result(urljoin('https://camfm.co.uk', i), CamFMEpisodeIE) + for i in re.findall(r"javascript:popup\('(/player/[^']+)', 'listen'", page)], + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r'<img[^>]+class="thumb-expand"[^>]+src="([^"]+)"', page, 'thumbnail', fatal=False)), + 'title': self._html_search_regex('<h1>([^<]+)</h1>', page, 'title', fatal=False), + 'description': clean_html(get_element_by_class('small-12 medium-8 cell', page)), + } + + +class CamFMEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camfm\.co\.uk/player/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://camfm.co.uk/player/43336', + 'skip': 'Episode will expire - don\'t actually know when, but it will go eventually', + 'info_dict': { + 'id': '43336', + 'title': 'AITAA: Am I the Agony Aunt? 
- 19:00 Tue 16/05/2023', + 'ext': 'mp3', + 'upload_date': '20230516', + 'description': 'md5:f165144f94927c0f1bfa2ee6e6ab7bbf', + 'timestamp': 1684263600, + 'series': 'AITAA: Am I the Agony Aunt?', + 'thumbnail': 'md5:5980a831360d0744c3764551be3d09c1', + 'categories': ['Entertainment'], + }, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + page = self._download_webpage(url, episode_id) + audios = self._parse_html5_media_entries('https://audio.camfm.co.uk', page, episode_id) + + caption = get_element_by_class('caption', page) + series = clean_html(re.sub(r'<span[^<]+<[^<]+>', '', caption)) + + card_section = get_element_by_class('card-section', page) + date = self._html_search_regex('>Aired at ([^<]+)<', card_section, 'air date', fatal=False) + + return { + 'id': episode_id, + 'title': join_nonempty(series, date, delim=' - '), + 'formats': traverse_obj(audios, (..., 'formats', ...)), + 'timestamp': unified_timestamp(date), # XXX: Does not account for UK's daylight savings + 'series': series, + 'description': clean_html(re.sub(r'<b>[^<]+</b><br[^>]+/>', '', card_section)), + 'thumbnail': urljoin('https://camfm.co.uk', self._search_regex( + r'<div[^>]+class="cover-art"[^>]+style="[^"]+url\(\'([^\']+)', + page, 'thumbnail', fatal=False)), + 'categories': get_elements_by_class('label', caption), + 'was_live': True, + } diff --git a/yt_dlp/extractor/cammodels.py b/yt_dlp/extractor/cammodels.py index 135b31529f..7388cfb6cd 100644 --- a/yt_dlp/extractor/cammodels.py +++ b/yt_dlp/extractor/cammodels.py @@ -7,14 +7,14 @@ class CamModelsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.cammodels.com/cam/AutumnKnight/', 'only_matching': True, - 'age_limit': 18 + 'age_limit': 18, }] def _real_extract(self, url): user_id = self._match_id(url) manifest = self._download_json( - 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) + f'https://manifest-server.naiadsystems.com/live/s:{user_id}.json', user_id) formats = [] thumbnails = [] @@ -36,7 +36,7 @@ def _real_extract(self, url): format_id_list = [format_id] height = int_or_none(media.get('videoHeight')) if height is not None: - format_id_list.append('%dp' % height) + format_id_list.append(f'{height}p') f = { 'url': media_url, 'format_id': '-'.join(format_id_list), @@ -73,5 +73,5 @@ def _real_extract(self, url): 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, - 'age_limit': 18 + 'age_limit': 18, } diff --git a/yt_dlp/extractor/camtasia.py b/yt_dlp/extractor/camtasia.py index 70ab6c62a1..326643175b 100644 --- a/yt_dlp/extractor/camtasia.py +++ b/yt_dlp/extractor/camtasia.py @@ -17,7 +17,7 @@ class CamtasiaEmbedIE(InfoExtractor): 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', 'ext': 'flv', 'duration': 2235.90, - } + }, }, { 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', 'info_dict': { @@ -25,12 +25,12 @@ class CamtasiaEmbedIE(InfoExtractor): 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', 'ext': 'flv', 'duration': 2235.93, - } + }, }], 'info_dict': { 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', }, - 'skip': 'webpage dead' + 'skip': 'webpage dead', }, ] diff --git a/yt_dlp/extractor/camwithher.py b/yt_dlp/extractor/camwithher.py deleted file mode 100644 index a0b3749edf..0000000000 --- a/yt_dlp/extractor/camwithher.py +++ /dev/null @@ -1,87 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - unified_strdate, -) - - -class CamWithHerIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)' - - _TESTS = [{ - 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', - 'info_dict': { - 'id': '5644', - 'ext': 'flv', - 'title': 'Periscope Tease', - 'description': 'In the clouds teasing on periscope to my favorite song', - 'duration': 240, - 'view_count': int, - 'comment_count': int, - 'uploader': 'MileenaK', - 'upload_date': '20160322', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', - 'only_matching': True, - }, { - 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', - 'only_matching': True, - }, { - 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - flv_id = self._html_search_regex( - r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id') - - # Video URL construction algorithm is reverse-engineered from cwhplayer.swf - rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( - ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) - - title = self._html_search_regex( - r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title') - description = self._html_search_regex( - r'>Description:</span>(.+?)</div>', webpage, 'description', default=None) - - runtime = self._search_regex( - r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None) - if runtime: - runtime = re.sub(r'[\s-]', '', runtime) - duration = parse_duration(runtime) - view_count = int_or_none(self._search_regex( - r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) - comment_count = int_or_none(self._search_regex( - r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) - - uploader = self._search_regex( - r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None) - upload_date = unified_strdate(self._search_regex( - r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) - - return { - 'id': flv_id, - 'url': rtmp_url, - 'ext': 'flv', - 'no_resume': True, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'uploader': uploader, - 'upload_date': upload_date, - 'age_limit': 18 - } diff --git a/yt_dlp/extractor/canal1.py b/yt_dlp/extractor/canal1.py new file mode 100644 index 0000000000..587a11ab8c --- /dev/null +++ b/yt_dlp/extractor/canal1.py @@ -0,0 +1,39 @@ +from .common import InfoExtractor + + +class Canal1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|noticias\.)?canal1\.com\.co/(?:[^?#&])+/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://canal1.com.co/noticias/napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco/', + 'info_dict': { + 'id': '63b39f6b354977084b85ab54', + 'display_id': 'napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco', + 'title': 'Ñapa I Una cadena de producción de arroz que se quedó en veremos y abandonada en el departamento del Chocó', + 'description': 'md5:bc49c6d64d20610ea1e7daf079a0d013', + 'thumbnail': r're:^https?://[^?#]+63b39f6b354977084b85ab54', + 'ext': 'mp4', + }, + }, { + 'url': 
'https://noticias.canal1.com.co/noticias/tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter/', + 'info_dict': { + 'id': '63b39e93f5fd223aa32250fb', + 'display_id': 'tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter', + 'title': 'Tres I El triste récord que impuso Elon Musk, el dueño de Tesla y de Twitter', + 'description': 'md5:d9f691f131a21ce6767ca6c05d17d791', + 'thumbnail': r're:^https?://[^?#]+63b39e93f5fd223aa32250fb', + 'ext': 'mp4', + }, + }, { + # Geo-restricted to Colombia + 'url': 'https://canal1.com.co/programas/guerreros-canal-1/video-inedito-guerreros-despedida-kewin-zarate/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self.url_result( + self._search_regex(r'"embedUrl"\s*:\s*"([^"]+)', webpage, 'embed url'), + display_id=display_id, url_transparent=True) diff --git a/yt_dlp/extractor/canalalpha.py b/yt_dlp/extractor/canalalpha.py index df5ca58187..3a0df95450 100644 --- a/yt_dlp/extractor/canalalpha.py +++ b/yt_dlp/extractor/canalalpha.py @@ -21,7 +21,7 @@ class CanalAlphaIE(InfoExtractor): 'upload_date': '20211028', 'duration': 1125, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', 'info_dict': { @@ -33,19 +33,19 @@ class CanalAlphaIE(InfoExtractor): 'upload_date': '20211028', 'duration': 138, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', 'info_dict': { 'id': '24484', 'ext': 'mp4', 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', - 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'description': 'md5:85d594a3b5dc6ccfc4a85aba6e73b129', 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', 'upload_date': '20211026', 'duration': 360, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', 'info_dict': { @@ -57,15 +57,26 @@ class CanalAlphaIE(InfoExtractor): 'upload_date': '20210726', 'duration': 360, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/33500/encore-des-mesures-deconomie-dans-le-jura', + 'info_dict': { + 'id': '33500', + 'ext': 'mp4', + 'title': 'Encore des mesures d\'économie dans le Jura', + 'description': 'md5:938b5b556592f2d1b9ab150268082a80', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_46665.jpg', + 'upload_date': '20240411', + 'duration': 105, + }, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) data_json = self._parse_json(self._search_regex( r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', - webpage, 'data_json'), id)['1']['data']['data'] + webpage, 'data_json'), video_id)['1']['data']['data'] manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} subtitles = {} formats = [{ @@ -75,15 +86,17 @@ def _real_extract(self, url): 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), } for 
video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] if manifests.get('hls'): - m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id) - formats.extend(m3u8_frmts) - subtitles = self._merge_subtitles(subtitles, m3u8_subs) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + manifests['hls'], video_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) if manifests.get('dash'): - dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) - formats.extend(dash_frmts) - subtitles = self._merge_subtitles(subtitles, dash_subs) + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifests['dash'], video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { - 'id': id, + 'id': video_id, 'title': data_json.get('title').strip(), 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), 'thumbnail': data_json.get('poster'), diff --git a/yt_dlp/extractor/canalc2.py b/yt_dlp/extractor/canalc2.py index 597cb2a6b0..c725545fa2 100644 --- a/yt_dlp/extractor/canalc2.py +++ b/yt_dlp/extractor/canalc2.py @@ -26,7 +26,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.canalc2.tv/video/%s' % video_id, video_id) + f'http://www.canalc2.tv/video/{video_id}', video_id) title = self._html_search_regex( r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>', diff --git a/yt_dlp/extractor/canalplus.py b/yt_dlp/extractor/canalplus.py index b7e2f9dd46..728b7a0472 100644 --- a/yt_dlp/extractor/canalplus.py +++ b/yt_dlp/extractor/canalplus.py @@ -53,7 +53,7 @@ def _real_extract(self, url): video_data = self._download_json(info_url, video_id, 'Downloading video JSON') if isinstance(video_data, list): - video_data = [video for video in video_data if video.get('ID') == video_id][0] + video_data = next(video for video in video_data if video.get('ID') == video_id) media = video_data['MEDIA'] infos = video_data['INFOS'] @@ -64,7 +64,7 @@ def _real_extract(self, url): # response = self._request_webpage( # HEADRequest(fmt_url), video_id, # 'Checking if the video is georestricted') - # if '/blocage' in response.geturl(): + # if '/blocage' in response.url: # raise ExtractorError( # 'The video is not available in your country', # expected=True) @@ -97,8 +97,7 @@ def _real_extract(self, url): return { 'id': video_id, 'display_id': display_id, - 'title': '%s - %s' % (titrage['TITRE'], - titrage['SOUS_TITRE']), + 'title': '{} - {}'.format(titrage['TITRE'], titrage['SOUS_TITRE']), 'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')), 'thumbnails': thumbnails, 'description': infos.get('DESCRIPTION'), diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py deleted file mode 100644 index ae6e03a4d5..0000000000 --- a/yt_dlp/extractor/canvas.py +++ /dev/null @@ -1,383 +0,0 @@ -import json - - -from .common import InfoExtractor -from .gigya import GigyaBaseIE -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - clean_html, - extract_attributes, - float_or_none, - get_element_by_class, - int_or_none, - merge_dicts, - str_or_none, - strip_or_none, - url_or_none, - urlencode_postdata -) - - -class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' - _TESTS = [{ - 
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'mp4', - 'title': 'Nachtwacht: De Greystook', - 'description': 'Nachtwacht: De Greystook', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'only_matching': True, - }] - _GEO_BYPASS = False - _HLS_ENTRY_PROTOCOLS_MAP = { - 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8_native', - } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, video_id = mobj.group('site_id'), mobj.group('id') - - data = None - if site_id != 'vrtvideo': - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) - - # New API endpoint - if not data: - vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', - video_id, note='refreshtoken: Retrieve vrtnutoken', - errnote='refreshtoken failed')['vrtnutoken'] - headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json; charset=utf-8'}) - vrtPlayerToken = self._download_json( - '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', headers=headers, data=json.dumps({ - 'identityToken': vrtnutoken - }).encode('utf-8'))['vrtPlayerToken'] - data = self._download_json( - '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': vrtPlayerToken, - 'client': 'null', - }, expected_status=400) - if 'title' not in data: - code = data.get('code') - if code == 'AUTHENTICATION_REQUIRED': - self.raise_login_required() - elif code == 'INVALID_LOCATION': - self.raise_geo_restricted(countries=['BE']) - raise ExtractorError(data.get('message') or code, expected=True) - - # Note: The title may be an empty string - title = data['title'] or f'{site_id} {video_id}' - description = data.get('description') - - formats = [] - subtitles = {} - for target in data['targetUrls']: - format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type')) - if not format_url or not format_type: - continue - format_type = format_type.upper() - if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: - fmts, subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], - m3u8_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - fmts, subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=format_type, fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - elif format_type == 'HSS': - fmts, subs = self._extract_ism_formats_and_subtitles( - format_url, video_id, ism_id='mss', fatal=False) - formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) 
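# A minimal, dependency-free sketch of the dispatch pattern in the surrounding
# deleted loop: each target URL is routed to a parser keyed by its format type,
# and per-language subtitle track lists are merged as they arrive. `parsers`,
# `collect_formats` and the merge helper are illustrative stand-ins written for
# this sketch, not yt-dlp APIs.
def merge_subtitles(target, new):
    # Extend the per-language track lists in place (mirrors what the
    # extractor's subtitle merging achieves).
    for lang, tracks in new.items():
        target.setdefault(lang, []).extend(tracks)
    return target

def collect_formats(targets, parsers):
    formats, subtitles = [], {}
    for target in targets:
        url, kind = target.get('url'), (target.get('type') or '').upper()
        if not url or not kind:
            continue
        parser = parsers.get(kind)
        if parser is None:
            # Unknown manifest type: keep the raw URL as a last-resort format.
            formats.append({'format_id': kind, 'url': url})
            continue
        fmts, subs = parser(url)
        formats.extend(fmts)
        merge_subtitles(subtitles, subs)
    return formats, subtitles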
- else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, - } - - -class CanvasEenIE(InfoExtractor): - IE_DESC = 'canvas.be and een.be' - _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', - 'md5': 'ed66976748d12350b118455979cca293', - 'info_dict': { - 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', - 'display_id': 'de-afspraak-veilt-voor-de-warmste-week', - 'ext': 'flv', - 'title': 'De afspraak veilt voor de Warmste Week', - 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 49.02, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - # with subtitles - 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167', - 'info_dict': { - 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625', - 'display_id': 'pieter-0167', - 'ext': 'mp4', - 'title': 'Pieter 0167', - 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2553.08, - 'subtitles': { - 'nl': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Pagina niet gevonden', - }, { - 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan', - 'info_dict': { - 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8', - 'display_id': 'emma-pakt-thilly-aan', - 'ext': 'mp4', - 'title': 'Emma pakt Thilly aan', - 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 118.24, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['is not a supported codec'], - }, { - 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - site_id, display_id = mobj.group('site_id'), mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(self._search_regex( - r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None)) - - video_id = self._html_search_regex( - r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': self._og_search_description(webpage), - } - - -class VrtNUIE(GigyaBaseIE): - IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' - _TESTS = [{ - # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', - 'info_dict': { - 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', - 'ext': 'mp4', - 
'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', - 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', - 'duration': 1457.04, - 'thumbnail': r're:^https?://.*\.jpg$', - 'series': 'Postbus X', - 'season': 'Seizoen 1989', - 'season_number': 1989, - 'episode': 'De zwarte weduwe', - 'episode_number': 1, - 'timestamp': 1595822400, - 'upload_date': '20200727', - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['is not a supported codec'], - }, { - # Only available via new API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/', - 'info_dict': { - 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1', - 'ext': 'mp4', - 'title': 'Aflevering 5', - 'description': 'Wie valt door de mand tijdens een missie?', - 'duration': 2967.06, - 'season': 'Season 1', - 'season_number': 1, - 'episode_number': 5, - }, - 'skip': 'This video is only available for registered users', - 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], - }] - _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' - _CONTEXT_ID = 'R3595707040' - - def _perform_login(self, username, password): - auth_info = self._gigya_login({ - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - 'loginID': username, - 'password': password, - 'authMode': 'cookie', - }) - - if auth_info.get('errorDetails'): - raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) - - # Sometimes authentication fails for no good reason, retry - login_attempt = 1 - while login_attempt <= 3: - try: - self._request_webpage('https://token.vrt.be/vrtnuinitlogin', - None, note='Requesting XSRF Token', errnote='Could not get XSRF Token', - query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'}) - - post_data = { - 'UID': auth_info['UID'], - 'UIDSignature': auth_info['UIDSignature'], - 'signatureTimestamp': auth_info['signatureTimestamp'], - '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, - } - - self._request_webpage( - 'https://login.vrt.be/perform_login', - None, note='Performing login', errnote='perform login failed', - headers={}, query={ - 'client_id': 'vrtnu-site' - }, data=urlencode_postdata(post_data)) - - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - login_attempt += 1 - self.report_warning('Authentication failed') - self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') - else: - raise e - else: - break - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - attrs = extract_attributes(self._search_regex( - r'(<nui-media[^>]+>)', webpage, 'media element')) - video_id = attrs['videoid'] - publication_id = attrs.get('publicationid') - if publication_id: - video_id = publication_id + '$' + video_id - - page = (self._parse_json(self._search_regex( - r'digitalData\s*=\s*({.+?});', webpage, 'digial data', - default='{}'), video_id, fatal=False) or {}).get('page') or {} - - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts(info, { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'season_number': int_or_none(page.get('episode_season')), - }) - - -class 
DagelijkseKostIE(InfoExtractor): - IE_DESC = 'dagelijksekost.een.be' - _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', - 'md5': '30bfffc323009a3e5f689bef6efa2365', - 'info_dict': { - 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', - 'display_id': 'hachis-parmentier-met-witloof', - 'ext': 'mp4', - 'title': 'Hachis parmentier met witloof', - 'description': 'md5:9960478392d87f63567b5b117688cdc5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 283.02, - }, - 'expected_warnings': ['is not a supported codec'], - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - title = strip_or_none(get_element_by_class( - 'dish-metadata__title', webpage - ) or self._html_search_meta( - 'twitter:title', webpage)) - - description = clean_html(get_element_by_class( - 'dish-description', webpage) - ) or self._html_search_meta( - ('description', 'twitter:description', 'og:description'), - webpage) - - video_id = self._html_search_regex( - r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', - group='id') - - return { - '_type': 'url_transparent', - 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, - 'ie_key': CanvasIE.ie_key(), - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - } diff --git a/yt_dlp/extractor/caracoltv.py b/yt_dlp/extractor/caracoltv.py new file mode 100644 index 0000000000..493ffdae5e --- /dev/null +++ b/yt_dlp/extractor/caracoltv.py @@ -0,0 +1,136 @@ +import base64 +import json +import uuid + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + traverse_obj, + urljoin, +) + + +class CaracolTvPlayIE(InfoExtractor): + _VALID_URL = r'https?://play\.caracoltv\.com/videoDetails/(?P<id>[^/?#]+)' + _NETRC_MACHINE = 'caracoltv-play' + + _TESTS = [{ + 'url': 'https://play.caracoltv.com/videoDetails/OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==', + 'info_dict': { + 'id': 'OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==', + 'title': 'La teoría del promedio', + 'description': 'md5:1cdd6d2c13f19ef0d9649ab81a023ac3', + }, + 'playlist_count': 6, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==/ella?season=0', + 'info_dict': { + 'id': 'OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==', + 'title': 'Ella', + 'description': 'md5:a639b1feb5ddcc0cff92a489b4e544b8', + }, + 'playlist_count': 10, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==/la-vuelta-al-mundo-en-80-risas-2022?season=0', + 'info_dict': { + 'id': 'OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==', + 'title': 'La vuelta al mundo en 80 risas 2022', + 'description': 'md5:e97aac36106e5c37ebf947b3350106a4', + }, + 'playlist_count': 17, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/MzoxX3BwbjRmNjB1', + 'only_matching': True, + }] + + _USER_TOKEN = None + + def _extract_app_token(self, webpage): + config_js_path = self._search_regex( + r'<script[^>]+src\s*=\s*"([^"]+coreConfig.js[^"]+)', webpage, 'config js url', fatal=False) + + mediation_config = {} if not config_js_path else self._search_json( + r'mediation\s*:', self._download_webpage( + urljoin('https://play.caracoltv.com/', config_js_path), None, fatal=False, note='Extracting JS config'), + 'mediation_config', None, transform_source=js_to_json, fatal=False) + + 
key = traverse_obj( + mediation_config, ('live', 'key')) or '795cd9c089a1fc48094524a5eba85a3fca1331817c802f601735907c8bbb4f50' + secret = traverse_obj( + mediation_config, ('live', 'secret')) or '64dec00a6989ba83d087621465b5e5d38bdac22033b0613b659c442c78976fa0' + + return base64.b64encode(f'{key}:{secret}'.encode()).decode() + + def _perform_login(self, email, password): + webpage = self._download_webpage('https://play.caracoltv.com/', None, fatal=False) + app_token = self._extract_app_token(webpage) + + bearer_token = self._download_json( + 'https://eu-gateway.inmobly.com/applications/oauth', None, data=b'', note='Retrieving bearer token', + headers={'Authorization': f'Basic {app_token}'})['token'] + + self._USER_TOKEN = self._download_json( + 'https://eu-gateway.inmobly.com/user/login', None, note='Performing login', headers={ + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {bearer_token}', + }, data=json.dumps({ + 'device_data': { + 'device_id': str(uuid.uuid4()), + 'device_token': '', + 'device_type': 'web', + }, + 'login_data': { + 'enabled': True, + 'email': email, + 'password': password, + }, + }).encode())['user_token'] + + def _extract_video(self, video_data, series_id=None, season_id=None, season_number=None): + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['stream_url'], series_id, 'mp4') + + return { + 'id': video_data['id'], + 'title': video_data.get('name'), + 'description': video_data.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': traverse_obj( + video_data, ('extra_thumbs', ..., {'url': 'thumb_url', 'height': 'height', 'width': 'width'})), + 'series_id': series_id, + 'season_id': season_id, + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(video_data.get('item_order')), + 'is_live': video_data.get('entry_type') == 3, + } + + def _extract_series_seasons(self, seasons, series_id): + for season in seasons: + api_response = self._download_json( + 'https://eu-gateway.inmobly.com/feed', series_id, query={'season_id': season['id']}, + headers={'Authorization': f'Bearer {self._USER_TOKEN}'}) + + season_number = season.get('order') + for episode in api_response['items']: + yield self._extract_video(episode, series_id, season['id'], season_number) + + def _real_extract(self, url): + series_id = self._match_id(url) + + if self._USER_TOKEN is None: + self._perform_login('guest@inmobly.com', 'Test@gus1') + + api_response = self._download_json( + 'https://eu-gateway.inmobly.com/feed', series_id, query={'include_ids': series_id}, + headers={'Authorization': f'Bearer {self._USER_TOKEN}'})['items'][0] + + if not api_response.get('seasons'): + return self._extract_video(api_response) + + return self.playlist_result( + self._extract_series_seasons(api_response['seasons'], series_id), + series_id, **traverse_obj(api_response, { + 'title': 'name', + 'description': 'description', + })) diff --git a/yt_dlp/extractor/carambatv.py b/yt_dlp/extractor/carambatv.py deleted file mode 100644 index d6044a3193..0000000000 --- a/yt_dlp/extractor/carambatv.py +++ /dev/null @@ -1,105 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - format_field, - float_or_none, - int_or_none, - try_get, -) - -from .videomore import VideomoreIE - - -class CarambaTVIE(InfoExtractor): - _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://video1.carambatv.ru/v/191910501', - 'md5': 
'2f4a81b7cfd5ab866ee2d7270cb34a2a', - 'info_dict': { - 'id': '191910501', - 'ext': 'mp4', - 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2678.31, - }, - }, { - 'url': 'carambatv:191910501', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, - video_id) - - title = video['title'] - - base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id - - formats = [{ - 'url': base_url + f['fn'], - 'height': int_or_none(f.get('height')), - 'format_id': format_field(f, 'height', '%sp'), - } for f in video['qualities'] if f.get('fn')] - - thumbnail = video.get('splash') - duration = float_or_none(try_get( - video, lambda x: x['annotations'][0]['end_time'], compat_str)) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } - - -class CarambaTVPageIE(InfoExtractor): - _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', - 'md5': 'a49fb0ec2ad66503eeb46aac237d3c86', - 'info_dict': { - 'id': '475222', - 'ext': 'flv', - 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', - 'thumbnail': r're:^https?://.*\.jpg', - # duration reported by videomore is incorrect - 'duration': int, - }, - 'add_ie': [VideomoreIE.ie_key()], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - videomore_url = VideomoreIE._extract_url(webpage) - if not videomore_url: - videomore_id = self._search_regex( - r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', - default=None) - if videomore_id: - videomore_url = 'videomore:%s' % videomore_id - if videomore_url: - title = self._og_search_title(webpage) - return { - '_type': 'url_transparent', - 'url': videomore_url, - 'ie_key': VideomoreIE.ie_key(), - 'title': title, - } - - video_url = self._og_search_property('video:iframe', webpage, default=None) - - if not video_url: - video_id = self._search_regex( - r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', - webpage, 'video id') - video_url = 'carambatv:%s' % video_id - - return self.url_result(video_url, CarambaTVIE.ie_key()) diff --git a/yt_dlp/extractor/cartoonnetwork.py b/yt_dlp/extractor/cartoonnetwork.py index 4dd7ac46d4..1749a008a2 100644 --- a/yt_dlp/extractor/cartoonnetwork.py +++ b/yt_dlp/extractor/cartoonnetwork.py @@ -27,7 +27,7 @@ def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): if content_re: metadata_re = r'|video_metadata\.content_' + content_re return self._search_regex( - r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re), + rf'(?:_cnglobal\.currentVideo\.{global_re}{metadata_re})\s*=\s*"({value_re})";', webpage, name, fatal=fatal) media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 210f5f8eea..40224f63f5 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -1,21 +1,30 @@ -import re -import json import base64 +import functools +import json +import re import time +import urllib.parse +import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import ( - compat_str, -) +from ..networking import HEADRequest from ..utils import ( + ExtractorError, + 
float_or_none, int_or_none, join_nonempty, js_to_json, + mimetype2ext, orderedSet, + parse_iso8601, + replace_extension, smuggle_url, strip_or_none, + traverse_obj, try_get, - ExtractorError, + update_url, + url_basename, + url_or_none, ) @@ -63,6 +72,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', 'timestamp': 255977160, }, + 'skip': '404 Not Found', }, { # multiple iframes 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', @@ -94,7 +104,7 @@ class CBCIE(InfoExtractor): # multiple CBC.APP.Caffeine.initInstance(...) 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', 'info_dict': { - 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', # FIXME: actual title includes " | CBC News" 'id': 'dog-indoor-exercise-winter-1.3928238', 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, @@ -103,7 +113,7 @@ class CBCIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) + return False if CBCPlayerIE.suitable(url) else super().suitable(url) def _extract_player_init(self, player_init, display_id): player_info = self._parse_json(player_init, display_id, js_to_json) @@ -111,15 +121,15 @@ def _extract_player_init(self, player_init, display_id): if not media_id: clip_id = player_info['clipId'] feed = self._download_json( - 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + f'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={{:mpsReleases}}{{{clip_id}}}', clip_id, fatal=False) if feed: - media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], str) if not media_id: media_id = self._download_json( 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, clip_id)['entries'][0]['id'].split('/')[-1] - return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + return self.url_result(f'cbcplayer:{media_id}', 'CBCPlayer', media_id) def _real_extract(self, url): display_id = self._match_id(url) @@ -137,7 +147,7 @@ def _real_extract(self, url): r'guid["\']\s*:\s*["\'](\d+)'): media_ids.extend(re.findall(media_id_re, webpage)) entries.extend([ - self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + self.url_result(f'cbcplayer:{media_id}', 'CBCPlayer', media_id) for media_id in orderedSet(media_ids)]) return self.playlist_result( entries, display_id, strip_or_none(title), @@ -146,7 +156,8 @@ def _real_extract(self, url): class CBCPlayerIE(InfoExtractor): IE_NAME = 'cbc.ca:player' - _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)' + _GEO_COUNTRIES = ['CA'] _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', 'md5': '64d25f841ddf4ddb28a235338af32e2c', @@ -159,23 +170,31 @@ class CBCPlayerIE(InfoExtractor): 'upload_date': '20160210', 'uploader': 'CBCC-NEW', }, - 'skip': 'Geo-restricted to Canada', + 'skip': 'Geo-restricted to Canada and no longer available', }, { - # Redirected from 
http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ - 'url': 'http://www.cbc.ca/player/play/2657631896', + 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2657631896', 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', 'info_dict': { 'id': '2657631896', 'ext': 'mp3', 'title': 'CBC Montreal is organizing its first ever community hackathon!', - 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', + 'description': 'md5:dd3b692f0a139b0369943150bd1c46a9', 'timestamp': 1425704400, 'upload_date': '20150307', - 'uploader': 'CBCC-NEW', + 'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg', + 'chapters': [], + 'duration': 494.811, + 'categories': ['All in a Weekend Montreal'], + 'tags': 'count:11', + 'location': 'Quebec', + 'series': 'All in a Weekend Montreal', + 'season': 'Season 2015', + 'season_number': 2015, + 'media_type': 'Excerpt', + 'genres': ['Other'], }, }, { - 'url': 'http://www.cbc.ca/player/play/2164402062', - 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', + 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062', 'info_dict': { 'id': '2164402062', 'ext': 'mp4', @@ -183,26 +202,325 @@ class CBCPlayerIE(InfoExtractor): 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', 'timestamp': 1320410746, 'upload_date': '20111104', - 'uploader': 'CBCC-NEW', + 'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg', + 'chapters': [], + 'duration': 186.867, + 'series': 'CBC News: Windsor at 6:00', + 'categories': ['Windsor'], + 'location': 'Windsor', + 'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'], + 'media_type': 'Excerpt', + 'genres': ['News'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ + 'url': 'https://www.cbc.ca/player/play/1.2985700', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '1.2985700', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. 
Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg', + 'chapters': [], + 'duration': 494.811, + 'categories': ['All in a Weekend Montreal'], + 'tags': 'count:11', + 'location': 'Quebec', + 'series': 'All in a Weekend Montreal', + 'season': 'Season 2015', + 'season_number': 2015, + 'media_type': 'Excerpt', + 'genres': ['Other'], + }, + }, { + 'url': 'https://www.cbc.ca/player/play/1.1711287', + 'info_dict': { + 'id': '1.1711287', + 'ext': 'mp4', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg', + 'chapters': [], + 'duration': 186.867, + 'series': 'CBC News: Windsor at 6:00', + 'categories': ['Windsor'], + 'location': 'Windsor', + 'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'], + 'media_type': 'Excerpt', + 'genres': ['News'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Has subtitles + # These broadcasts expire after ~1 month, can find new test URL here: + # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast + 'url': 'https://www.cbc.ca/player/play/video/9.6424403', + 'md5': '8025909eaffcf0adf59922904def9a5e', + 'info_dict': { + 'id': '9.6424403', + 'ext': 'mp4', + 'title': 'The National | N.W.T. wildfire emergency', + 'description': 'md5:ada33d36d1df69347ed575905bfd496c', + 'timestamp': 1718589600, + 'duration': 2692.833, + 'subtitles': { + 'en-US': [{ + 'name': 'English Captions', + 'url': 'https://cbchls.akamaized.net/delivery/news-shows/2024/06/17/NAT_JUN16-00-55-00/NAT_JUN16_cc.vtt', + }], + }, + 'thumbnail': 'https://i.cbc.ca/ais/6272b5c6-5e78-4c05-915d-0e36672e33d1,1714756287822/full/max/0/default.jpg', + 'chapters': 'count:5', + 'upload_date': '20240617', + 'categories': ['News', 'The National', 'The National Latest Broadcasts'], + 'series': 'The National - Full Show', + 'tags': ['The National'], + 'location': 'Canada', + 'media_type': 'Full Program', + 'genres': ['News'], + }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/1.7194274', + 'md5': '188b96cf6bdcb2540e178a6caa957128', + 'info_dict': { + 'id': '1.7194274', + 'ext': 'mp4', + 'title': '#TheMoment a rare white spirit moose was spotted in Alberta', + 'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3', + 'timestamp': 1714788791, + 'duration': 77.678, + 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, + 'thumbnail': 'https://i.cbc.ca/ais/1.7194274,1717224990425/full/max/0/default.jpg', + 'chapters': [], + 'categories': 'count:3', + 'series': 'The National', + 'tags': 'count:17', + 'location': 'Canada', + 'media_type': 'Excerpt', + 'upload_date': '20240504', + 'genres': ['News'], + }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/9.6427282', + 'info_dict': { + 'id': '9.6427282', + 'ext': 'mp4', + 'title': 'Men\'s Soccer - Argentina vs Morocco', + 'description': 'Argentina faces Morocco on the football pitch at Saint Etienne Stadium.', + 'series': 'CBC Sports', + 'media_type': 'Event Coverage', + 'thumbnail': 'https://i.cbc.ca/ais/a4c5c0c2-99fa-4bd3-8061-5a63879c1b33,1718828053500/full/max/0/default.jpg', + 'timestamp': 1721825400.0, + 'upload_date': '20240724', + 'duration': 10568.0, + 'chapters': [], + 'genres': [], + 'tags': ['2024 Paris 
Olympic Games'], + 'categories': ['Olympics Summer Soccer', 'Summer Olympics Replays', 'Summer Olympics Soccer Replays'], + 'location': 'Canada', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.cbc.ca/player/play/video/9.6459530', + 'md5': '6c1bb76693ab321a2e99c347a1d5ecbc', + 'info_dict': { + 'id': '9.6459530', + 'ext': 'mp4', + 'title': 'Parts of Jasper incinerated as wildfire rages', + 'description': 'md5:6f1caa8d128ad3f629257ef5fecf0962', + 'series': 'The National', + 'media_type': 'Excerpt', + 'thumbnail': 'https://i.cbc.ca/ais/507c0086-31a2-494d-96e4-bffb1048d045,1721953984375/full/max/0/default.jpg', + 'timestamp': 1721964091.012, + 'upload_date': '20240726', + 'duration': 952.285, + 'chapters': [], + 'genres': [], + 'tags': 'count:23', + 'categories': ['News (FAST)', 'News', 'The National', 'TV News Shows', 'The National '], + }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/9.6420651', + 'md5': '71a850c2c6ee5e912de169f5311bb533', + 'info_dict': { + 'id': '9.6420651', + 'ext': 'mp4', + 'title': 'Is it a breath of fresh air? Measuring air quality in Edmonton', + 'description': 'md5:3922b92cc8b69212d739bd9dd095b1c3', + 'series': 'CBC News Edmonton', + 'media_type': 'Excerpt', + 'thumbnail': 'https://i.cbc.ca/ais/73c4ab9c-7ad4-46ee-bb9b-020fdc01c745,1718214547576/full/max/0/default.jpg', + 'timestamp': 1718220065.768, + 'upload_date': '20240612', + 'duration': 286.086, + 'chapters': [], + 'genres': ['News'], + 'categories': ['News', 'Edmonton'], + 'tags': 'count:7', + 'location': 'Edmonton', + }, + }, { + 'url': 'cbcplayer:1.7159484', + 'only_matching': True, + }, { + 'url': 'cbcplayer:2164402062', + 'only_matching': True, + }, { + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'only_matching': True, + }] + + def _parse_param(self, asset_data, name): + return traverse_obj(asset_data, ('params', lambda _, v: v['name'] == name, 'value', {str}, any)) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) + data = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)['video']['currentClip'] + assets = traverse_obj( + data, ('media', 'assets', lambda _, v: url_or_none(v['key']) and v['type'])) + + if not assets and (media_id := traverse_obj(data, ('mediaId', {str}))): + # XXX: Deprecated; CBC is migrating off of ThePlatform + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{media_id}?mbr=true&formats=MPEG4,FLV,MP3', { + 'force_smil_url': True, + }), + 'id': media_id, + '_format_sort_fields': ('res', 'proto'), # Prioritize direct http formats over HLS + } + + is_live = traverse_obj(data, ('media', 'streamType', {str})) == 'Live' + formats, subtitles = [], {} + + for sub in traverse_obj(data, ('media', 'textTracks', lambda _, v: url_or_none(v['src']))): + subtitles.setdefault(sub.get('language') or 'und', []).append({ + 'url': sub['src'], + 'name': sub.get('label'), + }) + + for asset in assets: + asset_key = asset['key'] + asset_type = asset['type'] + if asset_type != 'medianet': + self.report_warning(f'Skipping unsupported asset type "{asset_type}": {asset_key}') + continue + asset_data = self._download_json(asset_key, video_id, f'Downloading {asset_type} JSON') + ext = mimetype2ext(self._parse_param(asset_data, 'contentType')) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + 
asset_data['url'], video_id, 'mp4', m3u8_id='hls', live=is_live) + formats.extend(fmts) + # Avoid slow/error-prone webvtt-over-m3u8 if direct https vtt is available + if not subtitles: + self._merge_subtitles(subs, target=subtitles) + if is_live or not fmts: + continue + # Check for direct https mp4 format + best_video_fmt = traverse_obj(fmts, ( + lambda _, v: v.get('vcodec') != 'none' and v['tbr'], all, + {functools.partial(sorted, key=lambda x: x['tbr'])}, -1, {dict})) or {} + base_url = self._search_regex( + r'(https?://[^?#]+?/)hdntl=', best_video_fmt.get('url'), 'base url', default=None) + if not base_url or '/live/' in base_url: + continue + mp4_url = base_url + replace_extension(url_basename(best_video_fmt['url']), 'mp4') + if self._request_webpage( + HEADRequest(mp4_url), video_id, 'Checking for https format', + errnote=False, fatal=False): + formats.append({ + **best_video_fmt, + 'url': mp4_url, + 'format_id': 'https-mp4', + 'protocol': 'https', + 'manifest_url': None, + 'acodec': None, + }) + else: + formats.append({ + 'url': asset_data['url'], + 'ext': ext, + 'vcodec': 'none' if self._parse_param(asset_data, 'mediaType') == 'audio' else None, + }) + + chapters = traverse_obj(data, ( + 'media', 'chapters', lambda _, v: float(v['startTime']) is not None, { + 'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}), + 'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}), + 'title': ('name', {str}), + })) + # Filter out pointless single chapters with start_time==0 and no end_time + if len(chapters) == 1 and not (chapters[0].get('start_time') or chapters[0].get('end_time')): + chapters = [] + + return { + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str.strip}), + 'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}), + 'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}), + 'media_type': ('media', 'clipType', {str}), + 'series': ('showName', {str}), + 'season_number': ('media', 'season', {int_or_none}), + 'duration': ('media', 'duration', {float_or_none}, {lambda x: None if is_live else x}), + 'location': ('media', 'region', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'genres': ('media', 'genre', all), + 'categories': ('categories', ..., 'name', {str}), + }), + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + 'is_live': is_live, + } + + +class CBCPlayerPlaylistIE(InfoExtractor): + IE_NAME = 'cbc.ca:player:playlist' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:player/)(?!play/)(?P<id>[^?#]+)' + _TESTS = [{ + 'url': 'https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'news/tv shows/the national/latest broadcast', + }, + }, { + 'url': 'https://www.cbc.ca/player/news/Canada/North', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'news/canada/north', }, }] def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, { - 'force_smil_url': True - }), - 'id': video_id, - } + playlist_id = urllib.parse.unquote(self._match_id(url)).lower() + webpage = self._download_webpage(url, playlist_id) + json_content = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', playlist_id) + + def entries(): + for 
video_id in traverse_obj(json_content, ( + 'video', 'clipsByCategory', lambda k, _: k.lower() == playlist_id, 'items', ..., 'id', + )): + yield self.url_result(f'https://www.cbc.ca/player/play/{video_id}', CBCPlayerIE) + + return self.playlist_result(entries(), playlist_id) class CBCGemIE(InfoExtractor): IE_NAME = 'gem.cbc.ca' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' _TESTS = [{ # This is a normal, public, TV show video 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', @@ -245,6 +563,9 @@ class CBCGemIE(InfoExtractor): }, 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', + }, { + 'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01', + 'only_matching': True, }] _GEO_COUNTRIES = ['CA'] @@ -275,12 +596,12 @@ def _new_claims_token(self, email, password): data = json.dumps({'jwt': sig}).encode() headers = {'content-type': 'application/json', 'ott-device-type': 'web'} resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', - None, data=data, headers=headers) + None, data=data, headers=headers, expected_status=426) cbc_access_token = resp['accessToken'] headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', - None, headers=headers) + None, headers=headers, expected_status=426) return resp['claimsToken'] def _get_claims_token_expiry(self): @@ -288,15 +609,13 @@ def _get_claims_token_expiry(self): # JWT is decoded here and 'exp' field is extracted # It is a Unix timestamp for when the token expires b64_data = self._claims_token.split('.')[1] - data = base64.urlsafe_b64decode(b64_data + "==") + data = base64.urlsafe_b64decode(b64_data + '==') return json.loads(data)['exp'] def claims_token_expired(self): exp = self._get_claims_token_expiry() - if exp - time.time() < 10: - # It will expire in less than 10 seconds, or has already expired - return True - return False + # It will expire in less than 10 seconds, or has already expired + return exp - time.time() < 10 def claims_token_valid(self): return self._claims_token is not None and not self.claims_token_expired() @@ -322,7 +641,7 @@ def _find_secret_formats(self, formats, video_id): url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) - if not secret_xml: + if not isinstance(secret_xml, xml.etree.ElementTree.Element): return for child in secret_xml: @@ -346,7 +665,9 @@ def _find_secret_formats(self, formats, video_id): def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + video_info = self._download_json( + f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}', + video_id, expected_status=426) email, password = self._get_login_info() if email and password: @@ -368,17 +689,17 @@ def _real_extract(self, url): self._remove_duplicate_formats(formats) formats.extend(self._find_secret_formats(formats, video_id)) - for format in formats: - if format.get('vcodec') == 'none': - if format.get('ext') is None: - format['ext'] = 'm4a' - if format.get('acodec') is None: - format['acodec'] = 'mp4a.40.2' + for fmt in formats: + if fmt.get('vcodec') == 'none': + if fmt.get('ext') is None: + 
fmt['ext'] = 'm4a' + if fmt.get('acodec') is None: + fmt['acodec'] = 'mp4a.40.2' # Put described audio at the beginning of the list, so that it # isn't chosen by default, as most people won't want it. - if 'descriptive' in format['format_id'].lower(): - format['preference'] = -2 + if 'descriptive' in fmt['format_id'].lower(): + fmt['preference'] = -2 return { 'id': video_id, @@ -401,7 +722,7 @@ def _real_extract(self, url): class CBCGemPlaylistIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:playlist' - _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' _TESTS = [{ # TV show playlist, all public videos 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', @@ -410,7 +731,14 @@ class CBCGemPlaylistIE(InfoExtractor): 'id': 'schitts-creek/s06', 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', + 'series': 'Schitt\'s Creek', + 'season_number': 6, + 'season': 'Season 6', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75', }, + }, { + 'url': 'https://gem.cbc.ca/schitts-creek/s06', + 'only_matching': True, }] _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' @@ -418,7 +746,7 @@ def _real_extract(self, url): match = self._match_valid_url(url) season_id = match.group('id') show = match.group('show') - show_info = self._download_json(self._API_BASE + show, season_id) + show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426) season = int(match.group('season')) season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) @@ -470,49 +798,125 @@ def _real_extract(self, url): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)' - _TEST = { - 'url': 'https://gem.cbc.ca/live/920604739687', - 'info_dict': { - 'title': 'Ottawa', - 'description': 'The live TV channel and local programming from Ottawa', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', - 'is_live': True, - 'id': 'AyqZwxRqh8EH', - 'ext': 'mp4', - 'timestamp': 1492106160, - 'upload_date': '20170413', - 'uploader': 'CBCC-NEW', + _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'live_status': 'is_live', + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'release_timestamp': 1492106160, + 'release_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', }, - 'skip': 'Live might have ended', - } - - # It's unclear where the chars at the end come from, but they appear to be - # constant. Might need updating in the future. - # There are two URLs, some livestreams are in one, and some - # in the other. The JSON schema is the same for both. 
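# A hedged, standalone sketch of the lookup the deleted code below performed:
# the same livestream may appear in either of two feeds sharing one JSON
# schema, so each feed is fetched in turn and the first entry whose guid
# matches wins. `fetch_json` is an illustrative callable supplied by the
# caller, not a yt-dlp helper.
def find_stream(api_urls, video_id, fetch_json):
    for api_url in api_urls:
        entries = fetch_json(api_url).get('entries') or []
        match = next((s for s in entries if s.get('guid') == video_id), None)
        if match is not None:
            return match
    raise LookupError(f'no feed entry with guid {video_id!r}')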
- _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] + { + 'url': 'https://gem.cbc.ca/live/44', + 'info_dict': { + 'id': '44', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^Ottawa [0-9\-: ]+', + 'description': 'The live TV channel and local programming from Ottawa', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + }, + { + 'url': 'https://gem.cbc.ca/live-event/10835', + 'info_dict': { + 'id': '10835', + 'ext': 'mp4', + 'is_live': True, + 'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+', + 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.', + 'live_status': 'is_live', + 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', + 'release_timestamp': 1679706000, + 'release_date': '20230325', + }, + 'params': {'skip_download': True}, + 'skip': 'Live might have ended', + }, + { # event replay (medianetlive) + 'url': 'https://gem.cbc.ca/live-event/42314', + 'md5': '297a9600f554f2258aed01514226a697', + 'info_dict': { + 'id': '42314', + 'ext': 'mp4', + 'live_status': 'was_live', + 'title': 'Women\'s Soccer - Canada vs New Zealand', + 'description': 'md5:36200e5f1a70982277b5a6ecea86155d', + 'thumbnail': r're:https://.+default\.jpg', + 'release_timestamp': 1721917200, + 'release_date': '20240725', + }, + 'params': {'skip_download': True}, + 'skip': 'Replay might no longer be available', + }, + { # event replay (medianetlive) + 'url': 'https://gem.cbc.ca/live-event/43273', + 'only_matching': True, + }, + ] + _GEO_COUNTRIES = ['CA'] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data'] - for api_url in self._API_URLS: - video_info = next(( - stream for stream in self._download_json(api_url, video_id)['entries'] - if stream.get('guid') == video_id), None) - if video_info: - break + # Three types of video_info JSON: info in root, freeTv stream/item, event replay + if not video_info.get('formattedIdMedia'): + if traverse_obj(video_info, ('event', 'key')) == video_id: + video_info = video_info['event'] + else: + video_info = traverse_obj(video_info, ( + ('freeTv', ('streams', ...)), 'items', + lambda _, v: v['key'].partition('-')[0] == video_id, any)) or {} + + video_stream_id = video_info.get('formattedIdMedia') + if not video_stream_id: + raise ExtractorError( + 'Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + + live_status = 'was_live' if video_info.get('isVodEnabled') else 'is_live' + release_timestamp = traverse_obj(video_info, ('airDate', {parse_iso8601})) + + if live_status == 'is_live' and release_timestamp and release_timestamp > time.time(): + formats = [] + live_status = 'is_upcoming' + self.raise_no_formats('This livestream has not yet started', expected=True) else: - raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + stream_data = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ + 'appCode': 'medianetlive', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'idMedia': video_stream_id, + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 
'manifestType': 'desktop', + }) + formats = self._extract_m3u8_formats( + stream_data['url'], video_id, 'mp4', live=live_status == 'is_live') return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': video_info['content'][0]['url'], 'id': video_id, - 'title': video_info.get('title'), - 'description': video_info.get('description'), - 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), - 'thumbnail': video_info.get('cbc$staticImage'), - 'is_live': True, + 'formats': formats, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + **traverse_obj(video_info, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('images', 'card', 'url'), + }), } diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 9aacd50c45..e825588972 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -1,12 +1,18 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor from .theplatform import ThePlatformFeedIE +from .youtube import YoutubeIE from ..utils import ( ExtractorError, - int_or_none, + extract_attributes, find_xpath_attr, - xpath_element, - xpath_text, + get_element_html_by_id, + int_or_none, + smuggle_url, update_url_query, url_or_none, + xpath_element, + xpath_text, ) @@ -25,7 +31,7 @@ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): return subtitles def _extract_common_video_info(self, content_id, asset_types, mpx_acc, extra_info): - tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) + tp_path = f'dJ5BDC/media/guid/{mpx_acc}/{content_id}' tp_release_url = f'https://link.theplatform.com/s/{tp_path}' info = self._extract_theplatform_metadata(tp_path, content_id) @@ -35,7 +41,7 @@ def _extract_common_video_info(self, content_id, asset_types, mpx_acc, extra_inf try: tp_formats, tp_subtitles = self._extract_theplatform_smil( update_url_query(tp_release_url, query), content_id, - 'Downloading %s SMIL data' % asset_type) + f'Downloading {asset_type} SMIL data') except ExtractorError as e: last_e = e if asset_type != 'fallback': @@ -44,7 +50,7 @@ def _extract_common_video_info(self, content_id, asset_types, mpx_acc, extra_inf try: tp_formats, tp_subtitles = self._extract_theplatform_smil( update_url_query(tp_release_url, query), content_id, - 'Downloading %s SMIL data, trying again with another format' % asset_type) + f'Downloading {asset_type} SMIL data, trying again with another format') except ExtractorError as e: last_e = e continue @@ -70,6 +76,7 @@ def _real_extract(self, url): class CBSIE(CBSBaseIE): + _WORKING = False _VALID_URL = r'''(?x) (?: cbs:| @@ -95,6 +102,7 @@ class CBSIE(CBSBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': 'Subscription required', }, { 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { @@ -111,6 +119,7 @@ class CBSIE(CBSBaseIE): }, 'expected_warnings': [ 'This content expired on', 'No video formats found', 'Requested format is not available'], + 'skip': '404 Not Found', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', 'only_matching': True, @@ -162,3 +171,110 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), 'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')), }) + + +class ParamountPressExpressIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?P<yt>yt-)?video/?\?watch=(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx', + 'md5': '56631dbcadaab980d1fc47cb7b76cba4', + 'info_dict': { + 'id': '6322981580112', + 'ext': 'mp4', + 'title': 'I’m Felicia', + 'description': 'md5:88fad93f8eede1c9c8f390239e4c6290', + 'uploader_id': '6055873637001', + 'upload_date': '20230320', + 'timestamp': 1679334960, + 'duration': 49.557, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc', + 'md5': 'edcb03e3210b88a3e56c05aa863e0e5b', + 'info_dict': { + 'id': '6323036027112', + 'ext': 'mp4', + 'title': '‘Y&R’ Set Visit: Jerry O’Connell Quizzes Cast on Pre-Love Scene Rituals and More', + 'description': 'md5:b929867a357aac5544b783d834c78383', + 'uploader_id': '6055873637001', + 'upload_date': '20230321', + 'timestamp': 1679430180, + 'duration': 132.032, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': [], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck', + 'info_dict': { + 'id': 'OX9wJWOcqck', + 'ext': 'mp4', + 'title': 'Rugrats | Season 2 Official Trailer | Paramount+', + 'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de', + 'uploader': 'Paramount Plus', + 'uploader_id': '@paramountplus', + 'uploader_url': 'http://www.youtube.com/@paramountplus', + 'channel': 'Paramount Plus', + 'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg', + 'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg', + 'upload_date': '20230316', + 'duration': 88, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': ['Rugrats'], + }, + }, { + 'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw', + 'info_dict': { + 'id': '_ljssSoDLkw', + 'ext': 'mp4', + 'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME', + 'description': 'md5:39581bcc3fd810209b642609f448af70', + 'uploader': 'SHOWTIME', + 'uploader_id': '@Showtime', + 'uploader_url': 'http://www.youtube.com/@Showtime', + 'channel': 'SHOWTIME', + 'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ', + 'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ', + 'upload_date': '20230209', + 'duration': 49, + 'age_limit': 0, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp', + 'categories': ['People & Blogs'], + 'tags': 'count:27', + }, + }] + + def _real_extract(self, url): + display_id, is_youtube = self._match_valid_url(url).group('id', 'yt') + if is_youtube: + return self.url_result(display_id, YoutubeIE) + + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID') + token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token') + + player = extract_attributes(get_element_html_by_id('vcbrightcoveplayer', webpage) or '') + account_id = player.get('data-account') or '6055873637001' + player_id = 
player.get('data-player') or 'OtLKgXlO9F' + embed = player.get('data-embed') or 'default' + + return self.url_result(smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}', + {'token': token}), BrightcoveNewIE) diff --git a/yt_dlp/extractor/cbsinteractive.py b/yt_dlp/extractor/cbsinteractive.py deleted file mode 100644 index b09e9823eb..0000000000 --- a/yt_dlp/extractor/cbsinteractive.py +++ /dev/null @@ -1,98 +0,0 @@ -from .cbs import CBSIE -from ..utils import int_or_none - - -class CBSInteractiveIE(CBSIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' - _TESTS = [{ - 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', - 'info_dict': { - 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', - 'display_id': 'hands-on-with-microsofts-windows-8-1-update', - 'ext': 'mp4', - 'title': 'Hands-on with Microsoft Windows 8.1 Update', - 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', - 'uploader': 'Sarah Mitroff', - 'duration': 70, - 'timestamp': 1396479627, - 'upload_date': '20140402', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', - 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', - 'info_dict': { - 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', - 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', - 'ext': 'mp4', - 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', - 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', - 'uploader': 'Ashley Esqueda', - 'duration': 1482, - 'timestamp': 1433289889, - 'upload_date': '20150603', - }, - }, { - 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', - 'info_dict': { - 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', - 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', - 'ext': 'mp4', - 'title': 'Video: Keeping Android smartphones and tablets secure', - 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', - 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', - 'uploader': 'Adrian Kingsley-Hughes', - 'duration': 731, - 'timestamp': 1449129925, - 'upload_date': '20151203', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', - 'only_matching': True, - }] - - MPX_ACCOUNTS = { - 'cnet': 2198311517, - 'zdnet': 2387448114, - } - - def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, display_id) - - data_json = self._html_search_regex( - r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", - webpage, 'data json') - data = self._parse_json(data_json, display_id) - vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] - - video_id = vdata['mpxRefId'] - - title = vdata['title'] - author = vdata.get('author') - if author: - uploader = '%s %s' % (author['firstName'], author['lastName']) - uploader_id = author.get('id') - else: - uploader = None - uploader_id = None - 
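# A small self-contained sketch of the optional-metadata pattern above:
# derive uploader fields from the page JSON when an author block is present,
# and fall back to None otherwise. `vdata` mimics the parsed page data and the
# helper name is hypothetical, not part of yt-dlp.
def uploader_info(vdata):
    author = vdata.get('author') or {}
    # filter(None, ...) drops missing name parts so a lone first or last
    # name still produces a usable uploader string.
    name = ' '.join(filter(None, (author.get('firstName'), author.get('lastName'))))
    return {'uploader': name or None, 'uploader_id': author.get('id')}

# e.g. uploader_info({'author': {'firstName': 'Sarah', 'lastName': 'Mitroff'}})
# -> {'uploader': 'Sarah Mitroff', 'uploader_id': None}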
- info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) - info.update({ - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'duration': int_or_none(vdata.get('duration')), - 'uploader': uploader, - 'uploader_id': uploader_id, - }) - return info diff --git a/yt_dlp/extractor/cbslocal.py b/yt_dlp/extractor/cbslocal.py deleted file mode 100644 index 3d50b0499f..0000000000 --- a/yt_dlp/extractor/cbslocal.py +++ /dev/null @@ -1,116 +0,0 @@ -from .anvato import AnvatoIE -from .sendtonews import SendtoNewsIE -from ..compat import compat_urlparse -from ..utils import ( - parse_iso8601, - unified_timestamp, -) - - -class CBSLocalIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' - _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mcp_id = self._match_id(url) - return self.url_result( - 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) - - -class CBSLocalArticleIE(AnvatoIE): # XXX: Do not subclass from concrete IE - _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)' - - _TESTS = [{ - # Anvato backend - 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis', - 'md5': 'f0ee3081e3843f575fccef901199b212', - 'info_dict': { - 'id': '3401037', - 'ext': 'mp4', - 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', - 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. 
Randy Paige reports.', - 'thumbnail': 're:^https?://.*', - 'timestamp': 1463440500, - 'upload_date': '20160516', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\KCBSTV', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\AOL', - 'Syndication\\Yahoo', - 'Syndication\\Tribune', - 'Syndication\\Curb.tv', - 'Content\\News' - ], - 'tags': ['CBS 2 News Evening'], - }, - }, { - # SendtoNews embed - 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', - 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - }, - 'playlist_count': 9, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - sendtonews_url = SendtoNewsIE._extract_url(webpage) - if sendtonews_url: - return self.url_result( - compat_urlparse.urljoin(url, sendtonews_url), - ie=SendtoNewsIE.ie_key()) - - info_dict = self._extract_anvato_videos(webpage, display_id) - - timestamp = unified_timestamp(self._html_search_regex( - r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage, - 'released date', default=None)) or parse_iso8601( - self._html_search_meta('uploadDate', webpage)) - - info_dict.update({ - 'display_id': display_id, - 'timestamp': timestamp, - }) - - return info_dict diff --git a/yt_dlp/extractor/cbsnews.py b/yt_dlp/extractor/cbsnews.py index 16edf3af86..972e111190 100644 --- a/yt_dlp/extractor/cbsnews.py +++ b/yt_dlp/extractor/cbsnews.py @@ -1,36 +1,152 @@ +import base64 import re +import urllib.parse import zlib +from .anvato import AnvatoIE from .common import InfoExtractor -from .cbs import CBSIE -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, -) +from .paramountplus import ParamountPlusIE +from ..networking import HEADRequest from ..utils import ( + ExtractorError, + UserNotLive, + determine_ext, + float_or_none, + format_field, + int_or_none, + make_archive_id, + mimetype2ext, parse_duration, + smuggle_url, + traverse_obj, + url_or_none, ) -class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsBaseIE(InfoExtractor): + _LOCALES = { + 'atlanta': None, + 'baltimore': 'BAL', + 'boston': 'BOS', + 'chicago': 'CHI', + 'colorado': 'DEN', + 'detroit': 'DET', + 'losangeles': 'LA', + 'miami': 'MIA', + 'minnesota': 'MIN', + 'newyork': 'NY', + 'philadelphia': 'PHI', + 'pittsburgh': 'PIT', + 'sacramento': 'SAC', + 'sanfrancisco': 'SF', + 'texas': 'DAL', + } + _LOCALE_RE = '|'.join(map(re.escape, _LOCALES)) + _ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl' + + def _get_item(self, webpage, display_id): + return traverse_obj(self._search_json( + r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id, + default={}), ('items', 0, {dict})) or {} + + def _get_video_url(self, item): + return traverse_obj(item, 'video', 'video2', expected_type=url_or_none) + + def _extract_playlist(self, webpage, playlist_id): + entries = [self.url_result(embed_url, CBSNewsEmbedIE) for embed_url in re.findall( + r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)] + if entries: + return self.playlist_result( + entries, playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage), + self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + + def _extract_video(self, item, video_url, video_id): + if mimetype2ext(item.get('format'), 
default=determine_ext(video_url)) == 'mp4': + formats = [{'url': video_url, 'ext': 'mp4'}] + + else: + manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information') + + anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None) + # Prefer Anvato if available; cbsnews.com m3u8 formats are re-encoded from Anvato source + if anvato_id: + return self.url_result( + smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'}), + AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + formats, _ = self._parse_m3u8_formats_and_subtitles( + manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id) + + def get_subtitles(subs_url): + return { + 'en': [{ + 'url': subs_url, + 'ext': 'dfxp', # TTAF1 + }], + } if url_or_none(subs_url) else None + + episode_meta = traverse_obj(item, { + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + }) if item.get('isFullEpisode') else {} + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(item, { + 'title': (None, ('fulltitle', 'title')), + 'description': 'dek', + 'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}), + 'duration': ('duration', {float_or_none}), + 'subtitles': ('captions', {get_subtitles}), + 'thumbnail': ('images', ('hd', 'sd'), {url_or_none}), + 'is_live': ('type', {lambda x: x == 'live'}), + }, get_all=False), + **episode_meta, + } + + +class CBSNewsEmbedIE(CBSNewsBaseIE): IE_NAME = 'cbsnews:embed' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)' _TESTS = [{ 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz
9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', - 'only_matching': True, + 'info_dict': { + 'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA', + 'ext': 'mp4', + 'title': 'Cops investigate gorilla incident at Cincinnati Zoo', + 'description': 'md5:fee7441ab8aaeb3c693482394738102b', + 'duration': 350, + 'timestamp': 1464719713, + 'upload_date': '20160531', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - item = self._parse_json(zlib.decompress(compat_b64decode( - compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode( + urllib.parse.unquote(self._match_id(url))), + -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {} + + video_id = item['mpxRefId'] + video_url = self._get_video_url(item) + if not video_url: + # Old embeds redirect user to ParamountPlus but most links are 404 + pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}' + try: + self._request_webpage(HEADRequest(pplus_url), video_id) + return self.url_result(pplus_url, ParamountPlusIE) + except ExtractorError: + self.raise_no_formats('This video is no longer available', True, video_id) + + return self._extract_video(item, video_url, video_id) -class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE +class CBSNewsIE(CBSNewsBaseIE): IE_NAME = 'cbsnews' IE_DESC = 'CBS News' - _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\w-]+)' _TESTS = [ { @@ -47,10 +163,7 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'timestamp': 1476046464, 'upload_date': '20161009', }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'skip': 'This video is no longer available', }, { 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -61,48 +174,234 @@ class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', 'upload_date': '20140404', 'timestamp': 1396650660, - 'uploader': 'CBSI-NEW', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { 'en': [{ - 'ext': 'ttml', + 'ext': 'dfxp', }], }, }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { # 48 hours 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', 'info_dict': { + 'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved', 'title': 'Cold as Ice', 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', }, 'playlist_mincount': 7, }, + { + 'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/', + 'info_dict': { + 'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE', + 'ext': 'mp4', + 'title': 'CBS Evening News, March 28, 2023', + 'description': 'md5:db20615aae54adc1d55a1fd69dc75d13', + 'duration': 1189, + 'timestamp': 1680042600, + 
'upload_date': '20230328', + 'season': 'Season 2023', + 'season_number': 2023, + 'episode': 'Episode 83', + 'episode_number': 83, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - entries = [] - for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): - entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) - if entries: - return self.playlist_result( - entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), - playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist - item = self._parse_json(self._html_search_regex( - r'CBSNEWS\.defaultPayload\s*=\s*({.+})', - webpage, 'video JSON info'), display_id)['items'][0] - return self._extract_video_info(item['mpxRefId'], 'cbsnews') + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + video_url = self._get_video_url(item) + if not video_url: + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + return self._extract_video(item, video_url, video_id) + + +class CBSLocalBaseIE(CBSNewsBaseIE): + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + item = self._get_item(webpage, display_id) + video_id = item.get('mpxRefId') or display_id + anvato_id = None + video_url = self._get_video_url(item) + + if not video_url: + anv_params = self._search_regex( + r'<iframe[^>]+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"', + webpage, 'Anvato URL', default=None) + + if not anv_params: + playlist = self._extract_playlist(webpage, display_id) + if playlist: + return playlist + self.raise_no_formats('No video content was found', expected=True, video_id=video_id) + + anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id) + anvato_id = anv_data['v'] + return self.url_result( + smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', { + 'token': anv_data.get('token') or 'default', + }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)]) + + return self._extract_video(item, video_url, video_id) + + +class CBSLocalIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P<id>[\w-]+)' + _TESTS = [{ + # Anvato video via defaultPayload JSON + 'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/', + 'info_dict': { + 'id': '6376747', + 'ext': 'mp4', + 'title': '1st cannabis dispensary opens in Queens', + 'description': 'The dispensary is women-owned and located in Jamaica.', + 'uploader': 'CBS', + 'duration': 20, + 'timestamp': 1680193657, + 'upload_date': '20230330', + 'categories': ['Stations\\Spoken Word\\WCBSTV', 'Content\\Google', 'Content\\News', 'Content\\News\\Local News'], + 'tags': 'count:11', + 'thumbnail': 're:^https?://.*', + '_old_archive_ids': ['cbslocal 6376747'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # cbsnews.com video via defaultPayload JSON + 'url': 
'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/', + 'info_dict': { + 'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3', + 'ext': 'mp4', + 'title': 'the city is sounding the alarm on dangerous social media challenges', + 'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6', + 'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg', + 'duration': 41.0, + 'timestamp': 1680196615, + 'upload_date': '20230330', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + +class CBSLocalArticleIE(CBSLocalBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P<id>[\w-]+)' + _TESTS = [{ + # Anvato video via iframe embed + 'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service', + 'title': 'MTA station agents begin leaving their booths to provide more direct customer service', + 'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.', + }, + }, { + 'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/', + 'md5': 'f0ee3081e3843f575fccef901199b212', + 'info_dict': { + 'id': '3401037', + 'ext': 'mp4', + 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'', + 'thumbnail': 're:^https?://.*', + 'timestamp': 1463440500, + 'upload_date': '20160516', + }, + 'skip': 'Video has been removed', + }] + + +class CBSNewsLiveBaseIE(CBSNewsBaseIE): + def _get_id(self, url): + raise NotImplementedError('This method must be implemented by subclasses') + + def _real_extract(self, url): + video_id = self._get_id(url) + if not video_id: + raise ExtractorError('Livestream is not available', expected=True) + + data = traverse_obj(self._download_json( + 'https://feeds-cbsn.cbsnews.com/2.0/rundown/', video_id, query={ + 'partner': 'cbsnsite', + 'edition': video_id, + 'type': 'live', + }), ('navigation', 'data', 0, {dict})) + + video_url = traverse_obj(data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False) + if not video_url: + raise UserNotLive(video_id=video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(data, { + 'title': 'headline', + 'description': 'rundown_slug', + 'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}), + }), + } + + +class CBSLocalLiveIE(CBSNewsLiveBaseIE): + _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P<id>{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/losangeles/live/', + 'info_dict': { + 'id': 'CBSN-LA', + 'ext': 'mp4', + 'title': str, + 'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return format_field(self._LOCALES, self._match_id(url), 'CBSN-%s') + + +class CBSNewsLiveIE(CBSNewsLiveBaseIE): + IE_NAME = 'cbsnews:live' + IE_DESC = 'CBS News Livestream' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/live/', + 'info_dict': { + 'id': 
'CBSN-US', + 'ext': 'mp4', + 'title': str, + 'description': r're:\w+ \w+ CRISPIN RUNDOWN', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _get_id(self, url): + return 'CBSN-US' class CBSNewsLiveVideoIE(InfoExtractor): @@ -111,7 +410,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)' # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples - _TEST = { + _TESTS = [{ 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', @@ -120,7 +419,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'duration': 334, }, 'skip': 'Video gone', - } + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -131,13 +430,13 @@ def _real_extract(self, url): 'dvr_slug': display_id, }) - formats = self._extract_akamai_formats(video_info['url'], display_id) - return { 'id': display_id, 'display_id': display_id, - 'title': video_info['headline'], - 'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'), - 'duration': parse_duration(video_info.get('segmentDur')), - 'formats': formats, + 'formats': self._extract_akamai_formats(video_info['url'], display_id), + **traverse_obj(video_info, { + 'title': 'headline', + 'thumbnail': ('thumbnail_url_hd', {url_or_none}), + 'duration': ('segmentDur', {parse_duration}), + }), } diff --git a/yt_dlp/extractor/cbssports.py b/yt_dlp/extractor/cbssports.py index b5d85af12b..b9c82dab66 100644 --- a/yt_dlp/extractor/cbssports.py +++ b/yt_dlp/extractor/cbssports.py @@ -8,6 +8,7 @@ # class CBSSportsEmbedIE(CBSBaseIE): class CBSSportsEmbedIE(InfoExtractor): + _WORKING = False IE_NAME = 'cbssports:embed' _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? 
(?: @@ -75,6 +76,7 @@ def _real_extract(self, url): class CBSSportsIE(CBSSportsBaseIE): + _WORKING = False IE_NAME = 'cbssports' _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -92,6 +94,7 @@ class CBSSportsIE(CBSSportsBaseIE): class TwentyFourSevenSportsIE(CBSSportsBaseIE): + _WORKING = False IE_NAME = '247sports' _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P<id>\d+)' _TESTS = [{ diff --git a/yt_dlp/extractor/ccc.py b/yt_dlp/extractor/ccc.py index 22e3a22ece..1d781cc477 100644 --- a/yt_dlp/extractor/ccc.py +++ b/yt_dlp/extractor/ccc.py @@ -25,7 +25,7 @@ class CCCIE(InfoExtractor): 'timestamp': 1388188800, 'duration': 3710, 'tags': list, - } + }, }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', 'only_matching': True, @@ -35,7 +35,7 @@ def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id') - event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) + event_data = self._download_json(f'https://media.ccc.de/public/events/{event_id}', event_id) formats = [] for recording in event_data.get('recordings', []): @@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor): 'id': '30c3', }, 'playlist_count': 135, + }, { + 'url': 'https://media.ccc.de/c/DS2023', + 'info_dict': { + 'title': 'Datenspuren 2023', + 'id': 'DS2023', + }, + 'playlist_count': 37, }] def _real_extract(self, url): - playlist_id = self._match_id(url).lower() + playlist_id = self._match_id(url) conf = self._download_json( 'https://media.ccc.de/public/conferences/' + playlist_id, diff --git a/yt_dlp/extractor/ccma.py b/yt_dlp/extractor/ccma.py index 88ff82f6e6..ffe4b49c15 100644 --- a/yt_dlp/extractor/ccma.py +++ b/yt_dlp/extractor/ccma.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..utils import ( clean_html, + determine_ext, int_or_none, parse_duration, parse_resolution, @@ -23,7 +24,7 @@ class CCMAIE(InfoExtractor): 'timestamp': 1478608140, 'upload_date': '20161108', 'age_limit': 0, - } + }, }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', 'md5': 'fa3e38f269329a278271276330261425', @@ -36,7 +37,7 @@ class CCMAIE(InfoExtractor): 'timestamp': 1494622500, 'vcodec': 'none', 'categories': ['Esports'], - } + }, }, { 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', @@ -50,7 +51,7 @@ class CCMAIE(InfoExtractor): 'subtitles': 'mincount:4', 'age_limit': 16, 'series': 'Crims', - } + }, }] def _real_extract(self, url): @@ -60,6 +61,7 @@ def _real_extract(self, url): 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, + 'format': 'dm', }) formats = [] @@ -69,6 +71,10 @@ def _real_extract(self, url): format_url = url_or_none(format_.get('file')) if not format_url: continue + if determine_ext(format_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, media_id, mpd_id='dash', fatal=False)) + continue label = format_.get('label') f = parse_resolution(label) f.update({ diff --git a/yt_dlp/extractor/cctv.py b/yt_dlp/extractor/cctv.py index 466bdfb7cc..18c080df1b 100644 --- a/yt_dlp/extractor/cctv.py +++ b/yt_dlp/extractor/cctv.py @@ -1,7 +1,6 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( 
float_or_none, try_get, @@ -88,6 +87,20 @@ class CCTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # videoCenterId: "id" + 'url': 'http://news.cctv.com/2024/02/21/ARTIcU5tKIOIF2myEGCATkLo240221.shtml', + 'info_dict': { + 'id': '5c846c0518444308ba32c4159df3b3e0', + 'ext': 'mp4', + 'title': '《平“语”近人——习近平喜欢的典故》第三季 第5集:风物长宜放眼量', + 'uploader': 'yangjuan', + 'timestamp': 1708554940, + 'upload_date': '20240221', + }, + 'params': { + 'skip_download': True, + }, }, { # var ids = ["id"] 'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml', @@ -128,7 +141,7 @@ def _real_extract(self, url): video_id = self._search_regex( [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)', - r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)', + r'videoCenterId(?:["\']\s*,|:)\s*["\']([\da-fA-F]+)', r'changePlayer\s*\(\s*["\']([\da-fA-F]+)', r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)', r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)', @@ -153,17 +166,17 @@ def _real_extract(self, url): if isinstance(video, dict): for quality, chapters_key in enumerate(('lowChapters', 'chapters')): video_url = try_get( - video, lambda x: x[chapters_key][0]['url'], compat_str) + video, lambda x: x[chapters_key][0]['url'], str) if video_url: formats.append({ 'url': video_url, 'format_id': 'http', 'quality': quality, # Sample clip - 'preference': -10 + 'preference': -10, }) - hls_url = try_get(data, lambda x: x['hls_url'], compat_str) + hls_url = try_get(data, lambda x: x['hls_url'], str) if hls_url: hls_url = re.sub(r'maxbr=\d+&?', '', hls_url) formats.extend(self._extract_m3u8_formats( diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 1157114b2a..62ee8b17f1 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -1,14 +1,15 @@ import base64 import codecs -import datetime +import datetime as dt import hashlib import hmac import json import random import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_ord, compat_urllib_parse_unquote +from ..compat import compat_ord from ..utils import ( ExtractorError, float_or_none, @@ -16,7 +17,6 @@ merge_dicts, multipart_encode, parse_duration, - random_birthday, traverse_obj, try_call, try_get, @@ -52,7 +52,7 @@ class CDAIE(InfoExtractor): 'age_limit': 0, 'upload_date': '20160221', 'timestamp': 1456078244, - } + }, }, { 'url': 'http://www.cda.pl/video/57413289', 'md5': 'a88828770a8310fc00be6c95faf7f4d5', @@ -63,26 +63,47 @@ class CDAIE(InfoExtractor): 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'crash404', - 'view_count': int, 'average_rating': float, 'duration': 137, 'age_limit': 0, - } + 'upload_date': '20160220', + 'timestamp': 1455968218, + }, }, { - # Age-restricted - 'url': 'http://www.cda.pl/video/1273454c4', + # Age-restricted with vfilm redirection + 'url': 'https://www.cda.pl/video/8753244c4', + 'md5': 'd8eeb83d63611289507010d3df3bb8b3', 'info_dict': { - 'id': '1273454c4', + 'id': '8753244c4', 'ext': 'mp4', - 'title': 'Bronson (2008) napisy HD 1080p', - 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... 
najwulgarniejsza polska gra?', + 'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e', 'height': 1080, - 'uploader': 'boniek61', + 'uploader': 'arhn eu', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 5554, + 'duration': 991, 'age_limit': 18, - 'view_count': int, 'average_rating': float, + 'timestamp': 1633888264, + 'upload_date': '20211010', + }, + }, { + # Age-restricted without vfilm redirection + 'url': 'https://www.cda.pl/video/17028157b8', + 'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992', + 'info_dict': { + 'id': '17028157b8', + 'ext': 'mp4', + 'title': 'STENDUPY MICHAŁ OGIŃSKI', + 'description': 'md5:5851f3272bfc31f762d616040a1d609a', + 'height': 480, + 'uploader': 'oginski', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 18855, + 'age_limit': 18, + 'average_rating': float, + 'timestamp': 1699705901, + 'upload_date': '20231111', }, }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', @@ -90,11 +111,9 @@ class CDAIE(InfoExtractor): }] def _download_age_confirm_page(self, url, video_id, *args, **kwargs): - form_data = random_birthday('rok', 'miesiac', 'dzien') - form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) - data, content_type = multipart_encode(form_data) + data, content_type = multipart_encode({'age_confirm': ''}) return self._download_webpage( - urljoin(url, '/a/validatebirth'), video_id, *args, + url, video_id, *args, data=data, headers={ 'Referer': url, 'Content-Type': content_type, @@ -134,7 +153,7 @@ def _perform_login(self, username, password): self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})' cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {} - if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5: + if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5: self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}' return @@ -154,7 +173,7 @@ def _perform_login(self, username, password): }) self.cache.store(self._BEARER_CACHE, username, { 'token': token_res['access_token'], - 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(), + 'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(), }) self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}' @@ -164,7 +183,7 @@ def _real_extract(self, url): if 'Authorization' in self._API_HEADERS: return self._api_extract(video_id) else: - return self._web_extract(video_id, url) + return self._web_extract(video_id) def _api_extract(self, video_id): meta = self._download_json( @@ -197,9 +216,9 @@ def _api_extract(self, video_id): 'view_count': meta.get('views'), } - def _web_extract(self, video_id, url): + def _web_extract(self, video_id): self._set_cookie('cda.pl', 'cda.player', 'html5') - webpage = self._download_webpage( + webpage, urlh = self._download_webpage_handle( f'{self._BASE_URL}/video/{video_id}/vfilm', video_id) if 'Ten film jest dostępny dla użytkowników premium' in webpage: @@ -209,10 +228,10 @@ def _web_extract(self, video_id, url): self.raise_geo_restricted() need_confirm_age = False - if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', + if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( - url, video_id, note='Confirming age') + urlh.url, video_id, note='Confirming age') need_confirm_age = True formats = [] @@ -222,9 +241,6 @@ def _web_extract(self, 
video_id, url): (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> ''', webpage, 'uploader', default=None, group='uploader') - view_count = self._search_regex( - r'Odsłony:(?:\s| )*([0-9]+)', webpage, - 'view_count', default=None) average_rating = self._search_regex( (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, @@ -235,7 +251,6 @@ def _web_extract(self, video_id, url): 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'uploader': uploader, - 'view_count': int_or_none(view_count), 'average_rating': float_or_none(average_rating), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, @@ -249,7 +264,7 @@ def _web_extract(self, video_id, url): def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): a = a.replace(p, '') - a = compat_urllib_parse_unquote(a) + a = urllib.parse.unquote(a) b = [] for c in a: f = compat_ord(c) @@ -266,16 +281,16 @@ def decrypt_file(a): def extract_format(page, version): json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, - '%s player_json' % version, fatal=False, group='player_data') + f'{version} player_json', fatal=False, group='player_data') if not json_str: return player_data = self._parse_json( - json_str, '%s player_data' % version, fatal=False) + json_str, f'{version} player_data', fatal=False) if not player_data: return video = player_data.get('video') if not video or 'file' not in video: - self.report_warning('Unable to extract %s version information' % version) + self.report_warning(f'Unable to extract {version} version information') return if video['file'].startswith('uggc'): video['file'] = codecs.decode(video['file'], 'rot_13') @@ -296,11 +311,11 @@ def extract_format(page, version): continue data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2, 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]} - data = json.dumps(data).encode('utf-8') + data = json.dumps(data).encode() video_url = self._download_json( f'https://www.cda.pl/video/{video_id}', video_id, headers={ 'Content-Type': 'application/json', - 'X-Requested-With': 'XMLHttpRequest' + 'X-Requested-With': 'XMLHttpRequest', }, data=data, note=f'Fetching {quality} url', errnote=f'Failed to fetch {quality} url', fatal=False) if try_get(video_url, lambda x: x['result']['status']) == 'ok': @@ -308,7 +323,7 @@ def extract_format(page, version): info_dict['formats'].append({ 'url': video_url, 'format_id': quality, - 'height': int_or_none(quality[:-1]) + 'height': int_or_none(quality[:-1]), }) if not info_dict['duration']: @@ -326,11 +341,11 @@ def extract_format(page, version): webpage = handler( urljoin(self._BASE_URL, href), video_id, - 'Downloading %s version information' % resolution, fatal=False) + f'Downloading {resolution} version information', fatal=False) if not webpage: # Manually report warning because empty page is returned when # invalid version is requested. 
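# Aside (illustrative sketch, not part of the patch): extract_format in the CDA
# code above recognizes ROT13-obfuscated file URLs by their prefix -- 'http'
# becomes 'uggc' under ROT13 -- and decodes them with the stdlib codec, no key
# required. A self-contained round trip:
import codecs

obfuscated = codecs.encode('https://example.com/video.mp4', 'rot_13')
assert obfuscated.startswith('uggc')  # ROT13 of 'http'
print(codecs.decode(obfuscated, 'rot_13'))  # decodes back to the original URL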
- self.report_warning('Unable to download %s version information' % resolution) + self.report_warning(f'Unable to download {resolution} version information') continue extract_format(webpage, resolution) diff --git a/yt_dlp/extractor/cellebrite.py b/yt_dlp/extractor/cellebrite.py index 9896a31afe..54367c4d52 100644 --- a/yt_dlp/extractor/cellebrite.py +++ b/yt_dlp/extractor/cellebrite.py @@ -1,63 +1,50 @@ -from .common import InfoExtractor -from ..utils import traverse_obj +from .vidyard import VidyardBaseIE, VidyardIE +from ..utils import ExtractorError, make_archive_id, url_basename -class CellebriteIE(InfoExtractor): +class CellebriteIE(VidyardBaseIE): _VALID_URL = r'https?://cellebrite\.com/(?:\w+)?/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://cellebrite.com/en/collect-data-from-android-devices-with-cellebrite-ufed/', 'info_dict': { - 'id': '16025876', + 'id': 'ZqmUss3dQfEMGpauambPuH', + 'display_id': '16025876', 'ext': 'mp4', - 'description': 'md5:174571cb97083fd1d457d75c684f4e2b', - 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', 'title': 'Ask the Expert: Chat Capture - Collect Data from Android Devices in Cellebrite UFED', - 'duration': 455, - 'tags': [], - } + 'description': 'md5:dee48fe12bbae5c01fe6a053f7676da4', + 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', + 'duration': 455.979, + '_old_archive_ids': ['cellebrite 16025876'], + }, }, { 'url': 'https://cellebrite.com/en/how-to-lawfully-collect-the-maximum-amount-of-data-from-android-devices/', 'info_dict': { - 'id': '29018255', + 'id': 'QV1U8a2yzcxigw7VFnqKyg', + 'display_id': '29018255', 'ext': 'mp4', - 'duration': 134, - 'tags': [], - 'description': 'md5:e9a3d124c7287b0b07bad2547061cacf', + 'title': 'How to Lawfully Collect the Maximum Amount of Data From Android Devices', + 'description': 'md5:0e943a9ac14c374d5d74faed634d773c', 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2022/07/How-to-Lawfully-Collect-the-Maximum-Amount-of-Data-From-Android-Devices.png', - 'title': 'Android Extractions Explained', - } + 'duration': 134.315, + '_old_archive_ids': ['cellebrite 29018255'], + }, }] - def _get_formats_and_subtitles(self, json_data, display_id): - formats = [{'url': url} for url in traverse_obj(json_data, ('mp4', ..., 'url')) or []] - subtitles = {} - - for url in traverse_obj(json_data, ('hls', ..., 'url')) or []: - fmt, sub = self._extract_m3u8_formats_and_subtitles( - url, display_id, ext='mp4', headers={'Referer': 'https://play.vidyard.com/'}) - formats.extend(fmt) - self._merge_subtitles(sub, target=subtitles) - - return formats, subtitles - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + slug = self._match_id(url) + webpage = self._download_webpage(url, slug) + vidyard_url = next(VidyardIE._extract_embed_urls(url, webpage), None) + if not vidyard_url: + raise ExtractorError('No Vidyard video embeds found on page') - player_uuid = self._search_regex( - r'<img\s[^>]*\bdata-uuid\s*=\s*"([^"\?]+)', webpage, 'player UUID') - json_data = self._download_json( - f'https://play.vidyard.com/player/{player_uuid}.json', display_id)['payload']['chapters'][0] + video_id = url_basename(vidyard_url) + info = self._process_video_json(self._fetch_video_json(video_id)['chapters'][0], video_id) + if info.get('display_id'): + info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])] + if thumbnail := self._og_search_thumbnail(webpage, default=None): + 
info.setdefault('thumbnails', []).append({'url': thumbnail}) - formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], display_id) return { - 'id': str(json_data['videoId']), - 'title': json_data.get('name') or self._og_search_title(webpage), - 'formats': formats, - 'subtitles': subtitles, - 'description': json_data.get('description') or self._og_search_description(webpage), - 'duration': json_data.get('seconds'), - 'tags': json_data.get('tags'), - 'thumbnail': self._og_search_thumbnail(webpage), - 'http_headers': {'Referer': 'https://play.vidyard.com/'}, + 'description': self._og_search_description(webpage, default=None), + **info, } diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index be2b0bb433..c323985caf 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -1,20 +1,20 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) +from ..networking import Request from ..utils import ( ExtractorError, float_or_none, - sanitized_Request, str_or_none, traverse_obj, urlencode_postdata, - USER_AGENTS, ) +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + class CeskaTelevizeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' @@ -51,7 +51,7 @@ class CeskaTelevizeIE(InfoExtractor): 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'only_matching': True, 'info_dict': { - 'id': 402, + 'id': '402', 'ext': 'mp4', 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, @@ -97,11 +97,11 @@ class CeskaTelevizeIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage, urlh = self._download_webpage_handle(url, playlist_id) - parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + parsed_url = urllib.parse.urlparse(urlh.url) site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') playlist_title = self._og_search_title(webpage, default=None) if site_name and playlist_title: - playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0] + playlist_title = re.split(rf'\s*[—|]\s*{site_name}', playlist_title, maxsplit=1)[0] playlist_description = self._og_search_description(webpage, default=None) if playlist_description: playlist_description = playlist_description.replace('\xa0', ' ') @@ -122,15 +122,15 @@ def _real_extract(self, url): iframe_hash = self._download_webpage( 'https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id, note='Getting IFRAME hash') - query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, } + query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec} webpage = self._download_webpage( 'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, note='Downloading player', query=query) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' 
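# Aside (illustrative sketch, not part of the patch): several extractors in this
# diff (CBSNews, CBSLocal, Cellebrite) move to new native video IDs and record
# _old_archive_ids via make_archive_id so existing --download-archive entries
# keep matching. The helper simply lowercases the IE key and prepends it:
from yt_dlp.utils import make_archive_id

# 'Cellebrite' is the IE key; '16025876' is the old numeric display ID from the test above
print(make_archive_id('Cellebrite', '16025876'))  # 'cellebrite 16025876'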
- if '%s</p>' % NOT_AVAILABLE_STRING in webpage: + if f'{NOT_AVAILABLE_STRING}</p>' in webpage: self.raise_geo_restricted(NOT_AVAILABLE_STRING) - if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): + if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen')): raise ExtractorError('no video with IDEC available', video_id=idec, expected=True) type_ = None @@ -163,16 +163,16 @@ def _real_extract(self, url): entries = [] for user_agent in (None, USER_AGENTS['Safari']): - req = sanitized_Request( + req = Request( 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - req.add_header('x-addr', '127.0.0.1') - req.add_header('X-Requested-With', 'XMLHttpRequest') + req.headers['Content-type'] = 'application/x-www-form-urlencoded' + req.headers['x-addr'] = '127.0.0.1' + req.headers['X-Requested-With'] = 'XMLHttpRequest' if user_agent: - req.add_header('User-Agent', user_agent) - req.add_header('Referer', url) + req.headers['User-Agent'] = user_agent + req.headers['Referer'] = url playlistpage = self._download_json(req, playlist_id, fatal=False) @@ -183,8 +183,8 @@ def _real_extract(self, url): if playlist_url == 'error_region': raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) - req.add_header('Referer', url) + req = Request(urllib.parse.unquote(playlist_url)) + req.headers['Referer'] = url playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: @@ -203,11 +203,11 @@ def _real_extract(self, url): if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', - m3u8_id='hls-%s' % format_id, fatal=False) + m3u8_id=f'hls-{format_id}', fatal=False) else: stream_formats = self._extract_mpd_formats( stream_url, playlist_id, - mpd_id='dash-%s' % format_id, fatal=False) + mpd_id=f'dash-{format_id}', fatal=False) if 'drmOnly=true' in stream_url: for f in stream_formats: f['has_drm'] = True @@ -236,7 +236,7 @@ def _real_extract(self, url): if playlist_len == 1: final_title = playlist_title or title else: - final_title = '%s (%s)' % (playlist_title, title) + final_title = f'{playlist_title} ({title})' entries.append({ 'id': item_id, @@ -261,7 +261,7 @@ def _get_subtitles(self, episode_id, subs): 'cs': [{ 'ext': 'srt', 'data': srt_subs, - }] + }], } @staticmethod @@ -282,7 +282,7 @@ def _fix_subtitle(subtitle): if m: yield m.group(1) start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) - yield '{0} --> {1}'.format(start, stop) + yield f'{start} --> {stop}' else: yield line diff --git a/yt_dlp/extractor/cgtn.py b/yt_dlp/extractor/cgtn.py index aaafa02d1b..b9757e0639 100644 --- a/yt_dlp/extractor/cgtn.py +++ b/yt_dlp/extractor/cgtn.py @@ -17,10 +17,11 @@ class CGTNIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1615295940, 'upload_date': '20210309', + 'categories': ['Video'], }, 'params': { - 'skip_download': True - } + 'skip_download': True, + }, }, { 'url': 'https://news.cgtn.com/news/2021-06-06/China-Indonesia-vow-to-further-deepen-maritime-cooperation-10REvJCewCY/index.html', 'info_dict': { @@ -29,15 +30,15 @@ class CGTNIE(InfoExtractor): 'title': 'China, Indonesia vow to further deepen maritime cooperation', 'thumbnail': r're:^https?://.*\.png$', 'description': 'China and Indonesia vowed to 
upgrade their cooperation into the maritime sector and also for political security, economy, and cultural and people-to-people exchanges.', - 'author': 'CGTN', - 'category': 'China', + 'creators': ['CGTN'], + 'categories': ['China'], 'timestamp': 1622950200, 'upload_date': '20210606', }, 'params': { - 'skip_download': False - } - } + 'skip_download': False, + }, + }, ] def _real_extract(self, url): @@ -45,7 +46,12 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) download_url = self._html_search_regex(r'data-video ="(?P<url>.+m3u8)"', webpage, 'download_url') - datetime_str = self._html_search_regex(r'<span class="date">\s*(.+?)\s*</span>', webpage, 'datetime_str', fatal=False) + datetime_str = self._html_search_regex( + r'<span class="date">\s*(.+?)\s*</span>', webpage, 'datetime_str', fatal=False) + category = self._html_search_regex( + r'<span class="section">\s*(.+?)\s*</span>', webpage, 'category', fatal=False) + author = self._search_regex( + r'<div class="news-author-name">\s*(.+?)\s*</div>', webpage, 'author', default=None) return { 'id': video_id, @@ -53,9 +59,7 @@ def _real_extract(self, url): 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': self._extract_m3u8_formats(download_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'), - 'category': self._html_search_regex(r'<span class="section">\s*(.+?)\s*</span>', - webpage, 'category', fatal=False), - 'author': self._html_search_regex(r'<div class="news-author-name">\s*(.+?)\s*</div>', - webpage, 'author', default=None, fatal=False), + 'categories': [category] if category else None, + 'creators': [author] if author else None, 'timestamp': try_get(unified_timestamp(datetime_str), lambda x: x - 8 * 3600), } diff --git a/yt_dlp/extractor/channel9.py b/yt_dlp/extractor/channel9.py deleted file mode 100644 index a88474060a..0000000000 --- a/yt_dlp/extractor/channel9.py +++ /dev/null @@ -1,252 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - qualities, - unescapeHTML, -) - - -class Channel9IE(InfoExtractor): - IE_DESC = 'Channel 9' - IE_NAME = 'channel9' - _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' - _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b'] - - _TESTS = [{ - 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - 'md5': '32083d4eaf1946db6d454313f44510ca', - 'info_dict': { - 'id': '6c413323-383a-49dc-88f9-a22800cab024', - 'ext': 'wmv', - 'title': 'Developer Kick-Off Session: Stuff We Love', - 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', - 'duration': 4576, - 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1377717420, - 'upload_date': '20130828', - 'session_code': 'KOS002', - 'session_room': 'Arena 1A', - 'session_speakers': 'count:5', - }, - }, { - 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', - 'info_dict': { - 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', - 'ext': 'wmv', - 'title': 'Self-service BI with Power BI - nuclear testing', - 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', - 'duration': 1540, - 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 1386381991, - 'upload_date': '20131207', - 'authors': ['Mike Wilmot'], - }, - }, { - # low quality mp4 is best - 'url': 
'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', - 'info_dict': { - 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', - 'ext': 'mp4', - 'title': 'Ranges for the Standard Library', - 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', - 'duration': 5646, - 'thumbnail': r're:https?://.*\.jpg', - 'upload_date': '20150930', - 'timestamp': 1443640735, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', - 'info_dict': { - 'id': 'Events/DEVintersection/DEVintersection-2016', - 'title': 'DEVintersection 2016 Orlando Sessions', - }, - 'playlist_mincount': 14, - }, { - 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS', - 'only_matching': True, - }, { - 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman', - 'only_matching': True, - }] - - _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - - def _extract_list(self, video_id, rss_url=None): - if not rss_url: - rss_url = self._RSS_URL % video_id - rss = self._download_xml(rss_url, video_id, 'Downloading RSS') - entries = [self.url_result(session_url.text, 'Channel9') - for session_url in rss.findall('./channel/item/link')] - title_text = rss.find('./channel/title').text - return self.playlist_result(entries, video_id, title_text) - - def _real_extract(self, url): - content_path, rss = self._match_valid_url(url).groups() - - if rss: - return self._extract_list(content_path, url) - - webpage = self._download_webpage( - url, content_path, 'Downloading web page') - - episode_data = self._search_regex( - r"data-episode='([^']+)'", webpage, 'episode data', default=None) - if episode_data: - episode_data = self._parse_json(unescapeHTML( - episode_data), content_path) - content_id = episode_data['contentId'] - is_session = '/Sessions(' in episode_data['api'] - content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,' - if is_session: - content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers' - else: - content_url += 'Authors,Body&$expand=Authors' - content_data = self._download_json(content_url, content_id) - title = content_data['Title'] - - QUALITIES = ( - 'mp3', - 'wmv', 'mp4', - 'wmv-low', 'mp4-low', - 'wmv-mid', 'mp4-mid', - 'wmv-high', 'mp4-high', - ) - - quality_key = qualities(QUALITIES) - - def quality(quality_id, format_url): - return (len(QUALITIES) if '_Source.' 
in format_url - else quality_key(quality_id)) - - formats = [] - urls = set() - - SITE_QUALITIES = { - 'MP3': 'mp3', - 'MP4': 'mp4', - 'Low Quality WMV': 'wmv-low', - 'Low Quality MP4': 'mp4-low', - 'Mid Quality WMV': 'wmv-mid', - 'Mid Quality MP4': 'mp4-mid', - 'High Quality WMV': 'wmv-high', - 'High Quality MP4': 'mp4-high', - } - - formats_select = self._search_regex( - r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, - 'formats select', default=None) - if formats_select: - for mobj in re.finditer( - r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', - formats_select): - format_url = mobj.group('url') - if format_url in urls: - continue - urls.add(format_url) - format_id = mobj.group('format') - quality_id = SITE_QUALITIES.get(format_id, format_id) - formats.append({ - 'url': format_url, - 'format_id': quality_id, - 'quality': quality(quality_id, format_url), - 'vcodec': 'none' if quality_id == 'mp3' else None, - }) - - API_QUALITIES = { - 'VideoMP4Low': 'mp4-low', - 'VideoWMV': 'wmv-mid', - 'VideoMP4Medium': 'mp4-mid', - 'VideoMP4High': 'mp4-high', - 'VideoWMVHQ': 'wmv-hq', - } - - for format_id, q in API_QUALITIES.items(): - q_url = content_data.get(format_id) - if not q_url or q_url in urls: - continue - urls.add(q_url) - formats.append({ - 'url': q_url, - 'format_id': q, - 'quality': quality(q, q_url), - }) - - slides = content_data.get('Slides') - zip_file = content_data.get('ZipFile') - - if not formats and not slides and not zip_file: - self.raise_no_formats( - 'None of recording, slides or zip are available for %s' % content_path) - - subtitles = {} - for caption in content_data.get('Captions', []): - caption_url = caption.get('Url') - if not caption_url: - continue - subtitles.setdefault(caption.get('Language', 'en'), []).append({ - 'url': caption_url, - 'ext': 'vtt', - }) - - common = { - 'id': content_id, - 'title': title, - 'description': clean_html(content_data.get('Description') or content_data.get('Body')), - 'thumbnail': content_data.get('VideoPlayerPreviewImage'), - 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), - 'timestamp': parse_iso8601(content_data.get('PublishedDate')), - 'avg_rating': int_or_none(content_data.get('Rating')), - 'rating_count': int_or_none(content_data.get('RatingCount')), - 'view_count': int_or_none(content_data.get('Views')), - 'comment_count': int_or_none(content_data.get('CommentCount')), - 'subtitles': subtitles, - } - if is_session: - speakers = [] - for s in content_data.get('Speakers', []): - speaker_name = s.get('FullName') - if not speaker_name: - continue - speakers.append(speaker_name) - - common.update({ - 'session_code': content_data.get('Code'), - 'session_room': content_data.get('Room'), - 'session_speakers': speakers, - }) - else: - authors = [] - for a in content_data.get('Authors', []): - author_name = a.get('DisplayName') - if not author_name: - continue - authors.append(author_name) - common['authors'] = authors - - contents = [] - - if slides: - d = common.copy() - d.update({'title': title + '-Slides', 'url': slides}) - contents.append(d) - - if zip_file: - d = common.copy() - d.update({'title': title + '-Zip', 'url': zip_file}) - contents.append(d) - - if formats: - d = common.copy() - d.update({'title': title, 'formats': formats}) - contents.append(d) - return self.playlist_result(contents) - else: - return self._extract_list(content_path) diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index 99dfcfdebb..b49f741efa 100644 
--- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -37,7 +37,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://chaturbate.com/%s/' % video_id, video_id, + f'https://chaturbate.com/{video_id}/', video_id, headers=self.geo_verification_headers()) found_m3u8_urls = [] @@ -85,7 +85,7 @@ def _real_extract(self, url): formats = [] for m3u8_url in m3u8_urls: for known_id in ('fast', 'slow'): - if '_%s' % known_id in m3u8_url: + if f'_{known_id}' in m3u8_url: m3u8_id = known_id break else: @@ -99,7 +99,7 @@ def _real_extract(self, url): return { 'id': video_id, 'title': video_id, - 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, + 'thumbnail': f'https://roomimg.stream.highwebmedia.com/ri/{video_id}.jpg', 'age_limit': self._rta_search(webpage), 'is_live': True, 'formats': formats, diff --git a/yt_dlp/extractor/chilloutzone.py b/yt_dlp/extractor/chilloutzone.py index 1a2f77c4eb..ac4252f1b4 100644 --- a/yt_dlp/extractor/chilloutzone.py +++ b/yt_dlp/extractor/chilloutzone.py @@ -1,93 +1,123 @@ -import json +import base64 from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_b64decode from ..utils import ( clean_html, - ExtractorError + int_or_none, + traverse_obj, ) class ChilloutzoneIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w-]+)\.html' _TESTS = [{ - 'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', + 'url': 'https://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html', 'md5': 'a76f3457e813ea0037e5244f509e66d1', 'info_dict': { 'id': 'enemene-meck-alle-katzen-weg', 'ext': 'mp4', 'title': 'Enemene Meck - Alle Katzen weg', 'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?', + 'duration': 24, }, }, { 'note': 'Video hosted at YouTube', - 'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html', + 'url': 'https://www.chilloutzone.net/video/eine-sekunde-bevor.html', 'info_dict': { 'id': '1YVQaAgHyRU', 'ext': 'mp4', 'title': '16 Photos Taken 1 Second Before Disaster', 'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814', 'uploader': 'BuzzFeedVideo', - 'uploader_id': 'BuzzFeedVideo', + 'uploader_id': '@BuzzFeedVideo', 'upload_date': '20131105', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi/1YVQaAgHyRU/maxresdefault.jpg', + 'tags': 'count:41', + 'like_count': int, + 'playable_in_embed': True, + 'channel_url': 'https://www.youtube.com/channel/UCpko_-a4wgz2u_DgDgd9fqA', + 'chapters': 'count:6', + 'live_status': 'not_live', + 'view_count': int, + 'categories': ['Entertainment'], + 'age_limit': 0, + 'channel_id': 'UCpko_-a4wgz2u_DgDgd9fqA', + 'duration': 100, + 'uploader_url': 'http://www.youtube.com/@BuzzFeedVideo', + 'channel_follower_count': int, + 'channel': 'BuzzFeedVideo', }, }, { - 'note': 'Video hosted at Vimeo', - 'url': 'http://www.chilloutzone.net/video/icon-blending.html', - 'md5': '2645c678b8dc4fefcc0e1b60db18dac1', + 'url': 'https://www.chilloutzone.net/video/icon-blending.html', + 'md5': '2f9d6850ec567b24f0f4fa143b9aa2f9', 'info_dict': { - 'id': '85523671', + 'id': 'LLNkHpSjBfc', 'ext': 'mp4', - 'title': 'The Sunday Times - Icons', - 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}', - 'uploader': 'Us', - 'uploader_id': 'usfilms', - 'upload_date': '20140131' + 'title': 'The Sunday Times Making of Icons', + 
'description': 'md5:b9259fcf63a1669e42001e5db677f02a', + 'uploader': 'MadFoxUA', + 'uploader_id': '@MadFoxUA', + 'upload_date': '20140204', + 'channel_id': 'UCSZa9Y6-Vl7c11kWMcbAfCw', + 'channel_url': 'https://www.youtube.com/channel/UCSZa9Y6-Vl7c11kWMcbAfCw', + 'comment_count': int, + 'uploader_url': 'http://www.youtube.com/@MadFoxUA', + 'duration': 66, + 'live_status': 'not_live', + 'channel_follower_count': int, + 'playable_in_embed': True, + 'view_count': int, + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/LLNkHpSjBfc/maxresdefault.jpg', + 'categories': ['Comedy'], + 'availability': 'public', + 'tags': [], + 'channel': 'MadFoxUA', + 'age_limit': 0, + }, + }, { + 'url': 'https://www.chilloutzone.net/video/ordentlich-abgeschuettelt.html', + 'info_dict': { + 'id': 'ordentlich-abgeschuettelt', + 'ext': 'mp4', + 'title': 'Ordentlich abgeschüttelt', + 'description': 'md5:d41541966b75d3d1e8ea77a94ea0d329', + 'duration': 18, }, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + b64_data = self._html_search_regex( + r'var cozVidData\s*=\s*"([^"]+)"', webpage, 'video data') + info = self._parse_json(base64.b64decode(b64_data).decode(), video_id) - base64_video_info = self._html_search_regex( - r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8') - video_info_dict = json.loads(decoded_video_info) + video_url = info.get('mediaUrl') + native_platform = info.get('nativePlatform') - # get video information from dict - video_url = video_info_dict['mediaUrl'] - description = clean_html(video_info_dict.get('description')) - title = video_info_dict['title'] - native_platform = video_info_dict['nativePlatform'] - native_video_id = video_info_dict['nativeVideoId'] - source_priority = video_info_dict['sourcePriority'] - - # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) - if native_platform is None: - youtube_url = YoutubeIE._extract_url(webpage) - if youtube_url: - return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) - - # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or - # the own CDN - if source_priority == 'native': + if native_platform and info.get('sourcePriority') == 'native': + native_video_id = info['nativeVideoId'] if native_platform == 'youtube': - return self.url_result(native_video_id, ie='Youtube') - if native_platform == 'vimeo': - return self.url_result( - 'http://vimeo.com/' + native_video_id, ie='Vimeo') + return self.url_result(native_video_id, 'Youtube') + elif native_platform == 'vimeo': + return self.url_result(f'https://vimeo.com/{native_video_id}', 'Vimeo') - if not video_url: - raise ExtractorError('No video found') + elif not video_url: + # Possibly a standard youtube embed? 
+ # TODO: Investigate if site still does this (there are no tests for it) + return self.url_result(url, 'Generic') return { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': description, + **traverse_obj(info, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'duration': ('videoLength', {int_or_none}), + 'width': ('videoWidth', {int_or_none}), + 'height': ('videoHeight', {int_or_none}), + }), } diff --git a/yt_dlp/extractor/chingari.py b/yt_dlp/extractor/chingari.py deleted file mode 100644 index 48091dd654..0000000000 --- a/yt_dlp/extractor/chingari.py +++ /dev/null @@ -1,207 +0,0 @@ -import itertools -import json -import urllib.parse - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - int_or_none, - str_to_int, - url_or_none, -) - - -class ChingariBaseIE(InfoExtractor): - def _get_post(self, id, post_data): - media_data = post_data['mediaLocation'] - base_url = media_data['base'] - author_data = post_data.get('authorData', {}) - song_data = post_data.get('song', {}) # revist this in future for differentiating b/w 'art' and 'author' - - formats = [{ - 'format_id': frmt, - 'width': str_to_int(frmt[1:]), - 'url': base_url + frmt_path, - } for frmt, frmt_path in media_data.get('transcoded', {}).items()] - - if media_data.get('path'): - formats.append({ - 'format_id': 'original', - 'format_note': 'Direct video.', - 'url': base_url + '/apipublic' + media_data['path'], - 'quality': 10, - }) - timestamp = str_to_int(post_data.get('created_at')) - if timestamp: - timestamp = int_or_none(timestamp, 1000) - - thumbnail, uploader_url = None, None - if media_data.get('thumbnail'): - thumbnail = base_url + media_data.get('thumbnail') - if author_data.get('username'): - uploader_url = 'https://chingari.io/' + author_data.get('username') - - return { - 'id': id, - 'extractor_key': ChingariIE.ie_key(), - 'extractor': 'Chingari', - 'title': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), - 'description': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), - 'duration': media_data.get('duration'), - 'thumbnail': url_or_none(thumbnail), - 'like_count': post_data.get('likeCount'), - 'view_count': post_data.get('viewsCount'), - 'comment_count': post_data.get('commentCount'), - 'repost_count': post_data.get('shareCount'), - 'timestamp': timestamp, - 'uploader_id': post_data.get('userId') or author_data.get('_id'), - 'uploader': author_data.get('name'), - 'uploader_url': url_or_none(uploader_url), - 'track': song_data.get('title'), - 'artist': song_data.get('author'), - 'formats': formats, - } - - -class ChingariIE(ChingariBaseIE): - _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)' - _TESTS = [{ - 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb', - 'info_dict': { - 'id': '612f8f4ce1dc57090e8a7beb', - 'ext': 'mp4', - 'title': 'Happy birthday Srila Prabhupada', - 'description': 'md5:c7080ebfdfeb06016e638c286d6bc3fa', - 'duration': 0, - 'thumbnail': 'https://media.chingari.io/uploads/c41d30e2-06b6-4e3b-9b4b-edbb929cec06-1630506826911/thumbnail/198f993f-ce87-4623-82c6-cd071bd6d4f4-1630506828016.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1630506828, - 'upload_date': '20210901', - 'uploader_id': '5f0403982c8bd344f4813f8c', - 'uploader': 'ISKCON,Inc.', - 'uploader_url': 'https://chingari.io/iskcon,inc', - 'track': None, - 'artist': None, - }, - 'params': {'skip_download': True} 
- }] - - def _real_extract(self, url): - id = self._match_id(url) - post_json = self._download_json(f'https://api.chingari.io/post/post_details/{id}', id) - if post_json['code'] != 200: - raise ExtractorError(post_json['message'], expected=True) - post_data = post_json['data'] - return self._get_post(id, post_data) - - -class ChingariUserIE(ChingariBaseIE): - _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' - _TESTS = [{ - 'url': 'https://chingari.io/dada1023', - 'info_dict': { - 'id': 'dada1023', - }, - 'params': {'playlistend': 3}, - 'playlist': [{ - 'url': 'https://chingari.io/share/post?id=614781f3ade60b3a0bfff42a', - 'info_dict': { - 'id': '614781f3ade60b3a0bfff42a', - 'ext': 'mp4', - 'title': '#chingaribappa ', - 'description': 'md5:d1df21d84088770468fa63afe3b17857', - 'duration': 7, - 'thumbnail': 'https://media.chingari.io/uploads/346d86d4-abb2-474e-a164-ffccf2bbcb72-1632076273717/thumbnail/b0b3aac2-2b86-4dd1-909d-9ed6e57cf77c-1632076275552.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1632076275, - 'upload_date': '20210919', - 'uploader_id': '5efc4b12cca35c3d1794c2d3', - 'uploader': 'dada (girish) dhawale', - 'uploader_url': 'https://chingari.io/dada1023', - 'track': None, - 'artist': None - }, - 'params': {'skip_download': True} - }, { - 'url': 'https://chingari.io/share/post?id=6146b132bcbf860959e12cba', - 'info_dict': { - 'id': '6146b132bcbf860959e12cba', - 'ext': 'mp4', - 'title': 'Tactor harvesting', - 'description': 'md5:8403f12dce68828b77ecee7eb7e887b7', - 'duration': 59.3, - 'thumbnail': 'https://media.chingari.io/uploads/b353ca70-7a87-400d-93a6-fa561afaec86-1632022814584/thumbnail/c09302e3-2043-41b1-a2fe-77d97e5bd676-1632022834260.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1632022834, - 'upload_date': '20210919', - 'uploader_id': '5efc4b12cca35c3d1794c2d3', - 'uploader': 'dada (girish) dhawale', - 'uploader_url': 'https://chingari.io/dada1023', - 'track': None, - 'artist': None - }, - 'params': {'skip_download': True} - }, { - 'url': 'https://chingari.io/share/post?id=6145651b74cb030a64c40b82', - 'info_dict': { - 'id': '6145651b74cb030a64c40b82', - 'ext': 'mp4', - 'title': '#odiabhajan ', - 'description': 'md5:687ea36835b9276cf2af90f25e7654cb', - 'duration': 56.67, - 'thumbnail': 'https://media.chingari.io/uploads/6cbf216b-babc-4cce-87fe-ceaac8d706ac-1631937782708/thumbnail/8855754f-6669-48ce-b269-8cc0699ed6da-1631937819522.jpg', - 'like_count': int, - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - 'timestamp': 1631937819, - 'upload_date': '20210918', - 'uploader_id': '5efc4b12cca35c3d1794c2d3', - 'uploader': 'dada (girish) dhawale', - 'uploader_url': 'https://chingari.io/dada1023', - 'track': None, - 'artist': None - }, - 'params': {'skip_download': True} - }], - }, { - 'url': 'https://chingari.io/iskcon%2Cinc', - 'playlist_mincount': 1025, - 'info_dict': { - 'id': 'iskcon%2Cinc', - }, - }] - - def _entries(self, id): - skip = 0 - has_more = True - for page in itertools.count(): - posts = self._download_json('https://api.chingari.io/users/getPosts', id, - data=json.dumps({'userId': id, 'ownerId': id, 'skip': skip, 'limit': 20}).encode(), - headers={'content-type': 'application/json;charset=UTF-8'}, - note='Downloading page %s' % page) - for post in posts.get('data', []): - post_data = post['post'] - yield self._get_post(post_data['_id'], post_data) - skip += 20 - has_more = posts['hasMoreData'] 
- if not has_more: - break - - def _real_extract(self, url): - alt_id = self._match_id(url) - post_json = self._download_json(f'https://api.chingari.io/user/{alt_id}', alt_id) - if post_json['code'] != 200: - raise ExtractorError(post_json['message'], expected=True) - id = post_json['data']['_id'] - return self.playlist_result(self._entries(id), playlist_id=alt_id) diff --git a/yt_dlp/extractor/chirbit.py b/yt_dlp/extractor/chirbit.py deleted file mode 100644 index 452711d973..0000000000 --- a/yt_dlp/extractor/chirbit.py +++ /dev/null @@ -1,88 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_b64decode -from ..utils import parse_duration - - -class ChirbitIE(InfoExtractor): - IE_NAME = 'chirbit' - _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' - _TESTS = [{ - 'url': 'http://chirb.it/be2abG', - 'info_dict': { - 'id': 'be2abG', - 'ext': 'mp3', - 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', - 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', - 'duration': 306, - 'uploader': 'Gerryaudio', - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', - 'only_matching': True, - }, { - 'url': 'https://chirb.it/wp/MN58c2', - 'only_matching': True, - }] - - def _real_extract(self, url): - audio_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://chirb.it/%s' % audio_id, audio_id) - - data_fd = self._search_regex( - r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'data fd', group='url') - - # Reverse engineered from https://chirb.it/js/chirbit.player.js (look - # for soundURL) - audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8') - - title = self._search_regex( - r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') - description = self._search_regex( - r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>', - webpage, 'description', default=None) - duration = parse_duration(self._search_regex( - r'class=["\']c-length["\'][^>]*>([^<]+)', - webpage, 'duration', fatal=False)) - uploader = self._search_regex( - r'id=["\']chirbit-username["\'][^>]*>([^<]+)', - webpage, 'uploader', fatal=False) - - return { - 'id': audio_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - } - - -class ChirbitProfileIE(InfoExtractor): - IE_NAME = 'chirbit:profile' - _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)' - _TEST = { - 'url': 'http://chirbit.com/ScarletBeauty', - 'info_dict': { - 'id': 'ScarletBeauty', - }, - 'playlist_mincount': 3, - } - - def _real_extract(self, url): - profile_id = self._match_id(url) - - webpage = self._download_webpage(url, profile_id) - - entries = [ - self.url_result(self._proto_relative_url('//chirb.it/' + video_id)) - for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)] - - return self.playlist_result(entries, profile_id) diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py new file mode 100644 index 0000000000..e0b9980afd --- /dev/null +++ b/yt_dlp/extractor/chzzk.py @@ -0,0 +1,172 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + UserNotLive, + float_or_none, + int_or_none, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class CHZZKLiveIE(InfoExtractor): + IE_NAME = 'chzzk:live' + _VALID_URL = r'https?://chzzk\.naver\.com/live/(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 
'https://chzzk.naver.com/live/c68b8ef525fb3d2fa146344d84991753', + 'info_dict': { + 'id': 'c68b8ef525fb3d2fa146344d84991753', + 'ext': 'mp4', + 'title': str, + 'channel': '진짜도현', + 'channel_id': 'c68b8ef525fb3d2fa146344d84991753', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1705510344, + 'upload_date': '20240117', + 'live_status': 'is_live', + 'view_count': int, + 'concurrent_view_count': int, + }, + 'skip': 'The channel is not currently live', + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + live_detail = self._download_json( + f'https://api.chzzk.naver.com/service/v3/channels/{channel_id}/live-detail', channel_id, + note='Downloading channel info', errnote='Unable to download channel info')['content'] + + if live_detail.get('status') == 'CLOSE': + raise UserNotLive(video_id=channel_id) + + live_playback = self._parse_json(live_detail['livePlaybackJson'], channel_id) + + thumbnails = [] + thumbnail_template = traverse_obj( + live_playback, ('thumbnail', 'snapshotThumbnailTemplate', {url_or_none})) + if thumbnail_template and '{type}' in thumbnail_template: + for width in traverse_obj(live_playback, ('thumbnail', 'types', ..., {str})): + thumbnails.append({ + 'id': width, + 'url': thumbnail_template.replace('{type}', width), + 'width': int_or_none(width), + }) + + formats, subtitles = [], {} + for media in traverse_obj(live_playback, ('media', lambda _, v: url_or_none(v['path']))): + is_low_latency = media.get('mediaId') == 'LLHLS' + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media['path'], channel_id, 'mp4', fatal=False, live=True, + m3u8_id='hls-ll' if is_low_latency else 'hls') + for f in fmts: + if is_low_latency: + f['source_preference'] = -2 + if '-afragalow.stream-audio.stream' in f['format_id']: + f['quality'] = -2 + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': channel_id, + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + **traverse_obj(live_detail, { + 'title': ('liveTitle', {str}), + 'timestamp': ('openDate', {functools.partial(parse_iso8601, delimiter=' ')}), + 'concurrent_view_count': ('concurrentUserCount', {int_or_none}), + 'view_count': ('accumulateCount', {int_or_none}), + 'channel': ('channel', 'channelName', {str}), + 'channel_id': ('channel', 'channelId', {str}), + 'channel_is_verified': ('channel', 'verifiedMark', {bool}), + }), + } + + +class CHZZKVideoIE(InfoExtractor): + IE_NAME = 'chzzk:video' + _VALID_URL = r'https?://chzzk\.naver\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://chzzk.naver.com/video/1754', + 'md5': 'b0c0c1bb888d913b93d702b1512c7f06', + 'info_dict': { + 'id': '1754', + 'ext': 'mp4', + 'title': '치지직 테스트 방송', + 'channel': '침착맨', + 'channel_id': 'bb382c2c0cc9fa7c86ab3b037fb5799c', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 15577, + 'timestamp': 1702970505.417, + 'upload_date': '20231219', + 'view_count': int, + }, + 'skip': 'Replay video is expired', + }, { + # Manually uploaded video + 'url': 'https://chzzk.naver.com/video/1980', + 'info_dict': { + 'id': '1980', + 'ext': 'mp4', + 'title': '※시청주의※한번보면 잊기 힘든 영상', + 'channel': '라디유radiyu', + 'channel_id': '68f895c59a1043bc5019b5e08c83a5c5', + 'channel_is_verified': False, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 95, + 'timestamp': 1703102631.722, + 'upload_date': '20231220', + 'view_count': int, + }, + }, { + # Partner channel replay video + 'url': 
'https://chzzk.naver.com/video/2458', + 'info_dict': { + 'id': '2458', + 'ext': 'mp4', + 'title': '첫 방송', + 'channel': '강지', + 'channel_id': 'b5ed5db484d04faf4d150aedd362f34b', + 'channel_is_verified': True, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4433, + 'timestamp': 1703307460.214, + 'upload_date': '20231223', + 'view_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_meta = self._download_json( + f'https://api.chzzk.naver.com/service/v3/videos/{video_id}', video_id, + note='Downloading video info', errnote='Unable to download video info')['content'] + formats, subtitles = self._extract_mpd_formats_and_subtitles( + f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id, + query={ + 'key': video_meta['inKey'], + 'env': 'real', + 'lc': 'en_US', + 'cpl': 'en_US', + }, note='Downloading video playback', errnote='Unable to download video playback') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(video_meta, { + 'title': ('videoTitle', {str}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'timestamp': ('publishDateAt', {functools.partial(float_or_none, scale=1000)}), + 'view_count': ('readCount', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'channel': ('channel', 'channelName', {str}), + 'channel_id': ('channel', 'channelId', {str}), + 'channel_is_verified': ('channel', 'verifiedMark', {bool}), + }), + } diff --git a/yt_dlp/extractor/cinchcast.py b/yt_dlp/extractor/cinchcast.py deleted file mode 100644 index 7a7ea8b228..0000000000 --- a/yt_dlp/extractor/cinchcast.py +++ /dev/null @@ -1,56 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - xpath_text, -) - - -class CinchcastIE(InfoExtractor): - _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' - _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1'] - - _TESTS = [{ - 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', - 'info_dict': { - 'id': '5258197', - 'ext': 'mp3', - 'title': 'Train Your Brain to Up Your Game with Coach Mandy', - 'upload_date': '20130816', - }, - }, { - # Actual test is run in generic, look for undergroundwellness - 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - doc = self._download_xml( - 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, - video_id) - - item = doc.find('.//item') - title = xpath_text(item, './title', fatal=True) - date_str = xpath_text( - item, './{http://developer.longtailvideo.com/trac/}date') - upload_date = unified_strdate(date_str, day_first=False) - # duration is present but wrong - formats = [{ - 'format_id': 'main', - 'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'], - }] - backup_url = xpath_text( - item, './{http://developer.longtailvideo.com/trac/}backupContent') - if backup_url: - formats.append({ - 'preference': 2, # seems to be more reliable - 'format_id': 'backup', - 'url': backup_url, - }) - - return { - 'id': video_id, - 'title': title, - 'upload_date': upload_date, - 'formats': formats, - } diff --git a/yt_dlp/extractor/cinemax.py b/yt_dlp/extractor/cinemax.py index 54cab2285e..66831ef62d 100644 --- a/yt_dlp/extractor/cinemax.py +++ b/yt_dlp/extractor/cinemax.py @@ -2,6 +2,7 @@ class 
CinemaxIE(HBOBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P<path>[^/]+/video/[0-9a-z-]+-(?P<id>\d+))' _TESTS = [{ 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', @@ -19,6 +20,6 @@ class CinemaxIE(HBOBaseIE): def _real_extract(self, url): path, video_id = self._match_valid_url(url).groups() - info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id) + info = self._extract_info(f'https://www.cinemax.com/{path}.xml', video_id) info['id'] = video_id return info diff --git a/yt_dlp/extractor/cinetecamilano.py b/yt_dlp/extractor/cinetecamilano.py index 5e770ebac2..834890d56f 100644 --- a/yt_dlp/extractor/cinetecamilano.py +++ b/yt_dlp/extractor/cinetecamilano.py @@ -1,6 +1,7 @@ import json -import urllib.error + from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -26,8 +27,8 @@ class CinetecaMilanoIE(InfoExtractor): 'modified_date': '20200520', 'duration': 3139, 'release_timestamp': 1643446208, - 'modified_timestamp': int - } + 'modified_timestamp': int, + }, }] def _real_extract(self, url): @@ -37,10 +38,10 @@ def _real_extract(self, url): f'https://www.cinetecamilano.it/api/catalogo/{video_id}/?', video_id, headers={ 'Referer': url, - 'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or '' + 'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or '', }) except ExtractorError as e: - if ((isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 500) + if ((isinstance(e.cause, HTTPError) and e.cause.status == 500) or isinstance(e.cause, json.JSONDecodeError)): self.raise_login_required(method='cookies') raise @@ -57,5 +58,5 @@ def _real_extract(self, url): 'modified_timestamp': parse_iso8601(archive.get('created_at'), delimiter=' '), 'thumbnail': urljoin(url, try_get(archive, lambda x: x['thumb']['src'].replace('/public/', '/storage/'))), 'formats': self._extract_m3u8_formats( - urljoin(url, traverse_obj(archive, ('drm', 'hls'))), video_id, 'mp4') + urljoin(url, traverse_obj(archive, ('drm', 'hls'))), video_id, 'mp4'), } diff --git a/yt_dlp/extractor/cineverse.py b/yt_dlp/extractor/cineverse.py new file mode 100644 index 0000000000..c8c6c48c27 --- /dev/null +++ b/yt_dlp/extractor/cineverse.py @@ -0,0 +1,139 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + filter_dict, + int_or_none, + parse_age_limit, + smuggle_url, + traverse_obj, + unsmuggle_url, + url_or_none, +) + + +class CineverseBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://www\.(?P<host>{})'.format('|'.join(map(re.escape, ( + 'cineverse.com', + 'asiancrush.com', + 'dovechannel.com', + 'screambox.com', + 'midnightpulp.com', + 'fandor.com', + 'retrocrush.tv', + )))) + + +class CineverseIE(CineverseBaseIE): + _VALID_URL = rf'{CineverseBaseIE._VALID_URL_BASE}/watch/(?P<id>[A-Z0-9]+)' + _TESTS = [{ + 'url': 'https://www.asiancrush.com/watch/DMR00018919/Women-Who-Flirt', + 'skip': 'geo-blocked', + 'info_dict': { + 'title': 'Women Who Flirt', + 'ext': 'mp4', + 'id': 'DMR00018919', + 'modified_timestamp': 1678744575289, + 'cast': ['Xun Zhou', 'Xiaoming Huang', 'Yi-Lin Sie', 'Sonia Sui', 'Quniciren'], + 'duration': 5811.597, + 'description': 'md5:892fd62a05611d394141e8394ace0bc6', + 'age_limit': 13, + }, + }, { + 'url': 'https://www.retrocrush.tv/watch/1000000023016/Archenemy! 
Crystal Bowie', + 'skip': 'geo-blocked', + 'info_dict': { + 'title': 'Archenemy! Crystal Bowie', + 'ext': 'mp4', + 'id': '1000000023016', + 'episode_number': 3, + 'season_number': 1, + 'cast': ['Nachi Nozawa', 'Yoshiko Sakakibara', 'Toshiko Fujita'], + 'age_limit': 0, + 'episode': 'Episode 3', + 'season': 'Season 1', + 'duration': 1485.067, + 'description': 'Cobra meets a beautiful bounty hunter by the name of Jane Royal.', + 'series': 'Space Adventure COBRA (Original Japanese)', + }, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, default={}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) + video_id = self._match_id(url) + html = self._download_webpage(url, video_id) + idetails = self._search_nextjs_data(html, video_id)['props']['pageProps']['idetails'] + + err_code = idetails.get('err_code') + if err_code == 1002: + self.raise_login_required() + elif err_code == 1200: + self.raise_geo_restricted( + 'This video is not available from your location due to geo restriction. ' + 'You may be able to bypass it by using the /details/ page instead of the /watch/ page', + countries=smuggled_data.get('geo_countries')) + + return { + 'subtitles': filter_dict({ + 'en': traverse_obj(idetails, (('cc_url_vtt', 'subtitle_url'), {'url': {url_or_none}})) or None, + }), + 'formats': self._extract_m3u8_formats(idetails['url'], video_id), + **traverse_obj(idetails, { + 'title': 'title', + 'id': ('details', 'item_id'), + 'description': ('details', 'description'), + 'duration': ('duration', {lambda x: x / 1000}), + 'cast': ('details', 'cast', {lambda x: x.split(', ')}), + 'modified_timestamp': ('details', 'updated_by', 0, 'update_time', 'time', {int_or_none}), + 'season_number': ('details', 'season', {int_or_none}), + 'episode_number': ('details', 'episode', {int_or_none}), + 'age_limit': ('details', 'rating_code', {parse_age_limit}), + 'series': ('details', 'series_details', 'title'), + }), + } + + +class CineverseDetailsIE(CineverseBaseIE): + _VALID_URL = rf'{CineverseBaseIE._VALID_URL_BASE}/details/(?P<id>[A-Z0-9]+)' + _TESTS = [{ + 'url': 'https://www.retrocrush.tv/details/1000000023012/Space-Adventure-COBRA-(Original-Japanese)', + 'playlist_mincount': 30, + 'info_dict': { + 'title': 'Space Adventure COBRA (Original Japanese)', + 'id': '1000000023012', + }, + }, { + 'url': 'https://www.asiancrush.com/details/NNVG4938/Hansel-and-Gretel', + 'info_dict': { + 'id': 'NNVG4938', + 'ext': 'mp4', + 'title': 'Hansel and Gretel', + 'description': 'md5:e3e4c35309c2e82aee044f972c2fb05d', + 'cast': ['Jeong-myeong Cheon', 'Eun Won-jae', 'Shim Eun-gyeong', 'Ji-hee Jin', 'Hee-soon Park', 'Lydia Park', 'Kyeong-ik Kim'], + 'duration': 7030.732, + }, + }] + + def _real_extract(self, url): + host, series_id = self._match_valid_url(url).group('host', 'id') + html = self._download_webpage(url, series_id) + pageprops = self._search_nextjs_data(html, series_id)['props']['pageProps'] + + geo_countries = traverse_obj(pageprops, ('itemDetailsData', 'geo_country', {lambda x: x.split(', ')})) + geoblocked = traverse_obj(pageprops, ( + 'itemDetailsData', 'playback_err_msg')) == 'This title is not available in your location.' 
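# Sketch of how the geo hint travels from CineverseDetailsIE to CineverseIE:
# smuggle_url() piggybacks a JSON dict on the URL fragment and unsmuggle_url()
# recovers it on the other side (URL and country list are invented examples).
from yt_dlp.utils import smuggle_url, unsmuggle_url

smuggled = smuggle_url('https://www.retrocrush.tv/watch/1000000023016/Title', {'geo_countries': ['US']})
plain_url, data = unsmuggle_url(smuggled, default={})
assert data == {'geo_countries': ['US']}  # read back at the top of _real_extract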
+ + def item_result(item): + item_url = f'https://www.{host}/watch/{item["item_id"]}/{item["title"]}' + if geoblocked: + item_url = smuggle_url(item_url, {'geo_countries': geo_countries}) + return self.url_result(item_url, CineverseIE) + + season = traverse_obj(pageprops, ('seasonEpisodes', ..., 'episodes', lambda _, v: v['item_id'] and v['title'])) + if season: + return self.playlist_result([item_result(ep) for ep in season], playlist_id=series_id, + playlist_title=traverse_obj(pageprops, ('itemDetailsData', 'title'))) + return item_result(pageprops['itemDetailsData']) diff --git a/yt_dlp/extractor/ciscolive.py b/yt_dlp/extractor/ciscolive.py index 0668578170..1584ca6657 100644 --- a/yt_dlp/extractor/ciscolive.py +++ b/yt_dlp/extractor/ciscolive.py @@ -105,7 +105,7 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE): @classmethod def suitable(cls, url): - return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + return False if CiscoLiveSessionIE.suitable(url) else super().suitable(url) @staticmethod def _check_bc_id_exists(rf_item): @@ -117,7 +117,7 @@ def _entries(self, query, url): for page_num in itertools.count(1): results = self._call_api( 'search', None, query, url, - 'Downloading search JSON page %d' % page_num) + f'Downloading search JSON page {page_num}') sl = try_get(results, lambda x: x['sectionList'][0], dict) if sl: results = sl diff --git a/yt_dlp/extractor/ciscowebex.py b/yt_dlp/extractor/ciscowebex.py index 0fcf022820..d39347c82c 100644 --- a/yt_dlp/extractor/ciscowebex.py +++ b/yt_dlp/extractor/ciscowebex.py @@ -33,7 +33,7 @@ def _real_extract(self, url): if rcid: webpage = self._download_webpage(url, None, note='Getting video ID') url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url') - url = self._request_webpage(url, None, note='Resolving final URL').geturl() + url = self._request_webpage(url, None, note='Resolving final URL').url mobj = self._match_valid_url(url) subdomain = mobj.group('subdomain') siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') @@ -46,7 +46,7 @@ def _real_extract(self, url): headers['accessPwd'] = password stream, urlh = self._download_json_handle( - 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), + f'https://{subdomain}.webex.com/webappng/api/v1/recordings/{video_id}/stream', video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429)) if urlh.status == 403: @@ -101,6 +101,6 @@ def _real_extract(self, url): 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), 'timestamp': unified_timestamp(stream.get('createTime')), 'duration': int_or_none(stream.get('duration'), 1000), - 'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id), + 'webpage_url': f'https://{subdomain}.webex.com/recordingservice/sites/{siteurl}/recording/playback/{video_id}', 'formats': formats, } diff --git a/yt_dlp/extractor/cjsw.py b/yt_dlp/extractor/cjsw.py index c37a3b8482..b80236a7ee 100644 --- a/yt_dlp/extractor/cjsw.py +++ b/yt_dlp/extractor/cjsw.py @@ -27,7 +27,7 @@ class CJSWIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) program, episode_id = mobj.group('program', 'id') - audio_id = '%s/%s' % (program, episode_id) + audio_id = f'{program}/{episode_id}' webpage = self._download_webpage(url, episode_id) diff --git a/yt_dlp/extractor/clipchamp.py b/yt_dlp/extractor/clipchamp.py new file mode 100644 index 0000000000..a8bdf7e509 --- 
/dev/null +++ b/yt_dlp/extractor/clipchamp.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class ClipchampIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU', + 'info_dict': { + 'id': 'gRXZ4ZhdDaU', + 'ext': 'mp4', + 'title': 'Untitled video', + 'uploader': 'Alexander Schwartz', + 'timestamp': 1680805580, + 'upload_date': '20230406', + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' + _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] + + storage_location = data.get('storage_location') + if storage_location != 'cf_stream': + raise ExtractorError(f'Unsupported clip storage location "{storage_location}"') + + path = data['download_url'] + iframe = self._download_webpage( + f'https://iframe.cloudflarestream.com/{path}', video_id, 'Downloading player iframe') + subdomain = self._search_regex( + r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', iframe, + 'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe' + + formats = self._extract_mpd_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id, + query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash') + formats.extend(self._extract_m3u8_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4', + query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls')) + + return { + 'id': video_id, + 'formats': formats, + 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), {str}))) or None, + **traverse_obj(data, { + 'title': ('project', 'project_name', {str}), + 'timestamp': ('created_at', {unified_timestamp}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/cliphunter.py b/yt_dlp/extractor/cliphunter.py deleted file mode 100644 index 2b907dc804..0000000000 --- a/yt_dlp/extractor/cliphunter.py +++ /dev/null @@ -1,76 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - url_or_none, -) - - -class CliphunterIE(InfoExtractor): - IE_NAME = 'cliphunter' - - _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ - (?P<id>[0-9]+)/ - (?P<seo>.+?)(?:$|[#\?]) - ''' - _TESTS = [{ - 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', - 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', - 'info_dict': { - 'id': '1012420', - 'ext': 'flv', - 'title': 'Fun Jynx Maze solo', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', - 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', - 'info_dict': { - 'id': '2019449', - 'ext': 'mp4', - 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_title = self._search_regex( - r'mediaTitle = "([^"]+)"', webpage, 'title') - - gexo_files = self._parse_json( - self._search_regex( - 
r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), - video_id) - - formats = [] - for format_id, f in gexo_files.items(): - video_url = url_or_none(f.get('url')) - if not video_url: - continue - fmt = f.get('fmt') - height = f.get('h') - format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'width': int_or_none(f.get('w')), - 'height': int_or_none(height), - 'tbr': int_or_none(f.get('br')), - }) - - thumbnail = self._search_regex( - r"var\s+mov_thumb\s*=\s*'([^']+)';", - webpage, 'thumbnail', fatal=False) - - return { - 'id': video_id, - 'title': video_title, - 'formats': formats, - 'age_limit': self._rta_search(webpage), - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/clippit.py b/yt_dlp/extractor/clippit.py index 006a713b2a..393f217308 100644 --- a/yt_dlp/extractor/clippit.py +++ b/yt_dlp/extractor/clippit.py @@ -1,11 +1,11 @@ +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, qualities, ) -import re - class ClippitIE(InfoExtractor): @@ -23,7 +23,7 @@ class ClippitIE(InfoExtractor): 'upload_date': '20160826', 'description': 'BattleBots | ABC', 'thumbnail': r're:^https?://.*\.jpg$', - } + }, } def _real_extract(self, url): @@ -36,7 +36,7 @@ def _real_extract(self, url): quality = qualities(FORMATS) formats = [] for format_id in FORMATS: - url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + url = self._html_search_regex(rf'data-{format_id}-file="(.+?)"', webpage, 'url', fatal=False) if not url: continue diff --git a/yt_dlp/extractor/cliprs.py b/yt_dlp/extractor/cliprs.py index 567f77b94e..42f78cac65 100644 --- a/yt_dlp/extractor/cliprs.py +++ b/yt_dlp/extractor/cliprs.py @@ -2,6 +2,7 @@ class ClipRsIE(OnetBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' _TEST = { 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', @@ -14,7 +15,7 @@ class ClipRsIE(OnetBaseIE): 'duration': 229, 'timestamp': 1459850243, 'upload_date': '20160405', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/clipsyndicate.py b/yt_dlp/extractor/clipsyndicate.py deleted file mode 100644 index 6064443210..0000000000 --- a/yt_dlp/extractor/clipsyndicate.py +++ /dev/null @@ -1,52 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - find_xpath_attr, - fix_xml_ampersands -) - - -class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' - - _TESTS = [{ - 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', - 'md5': '4d7d549451bad625e0ff3d7bd56d776c', - 'info_dict': { - 'id': '4629301', - 'ext': 'mp4', - 'title': 'Brick Briscoe', - 'duration': 612, - 'thumbnail': r're:^https?://.+\.jpg', - }, - }, { - 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - js_player = self._download_webpage( - 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, - video_id, 'Downlaoding player') - # it includes a required token - flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') - - pdoc = self._download_xml( - 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, 'Downloading video info', - transform_source=fix_xml_ampersands) - - track_doc = pdoc.find('trackList/track') - - def find_param(name): - node = 
find_xpath_attr(track_doc, './/param', 'name', name) - if node is not None: - return node.attrib['value'] - - return { - 'id': video_id, - 'title': find_param('title'), - 'url': track_doc.find('location').text, - 'thumbnail': find_param('thumbnail'), - 'duration': int(find_param('duration')), - } diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py index e78e26a113..77469eda99 100644 --- a/yt_dlp/extractor/closertotruth.py +++ b/yt_dlp/extractor/closertotruth.py @@ -4,6 +4,7 @@ class CloserToTruthIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', @@ -14,7 +15,7 @@ class CloserToTruthIE(InfoExtractor): 'title': 'Solutions to the Mind-Body Problem?', 'upload_date': '20140221', 'timestamp': 1392956007, - 'uploader_id': 'CTTXML' + 'uploader_id': 'CTTXML', }, 'params': { 'skip_download': True, @@ -28,7 +29,7 @@ class CloserToTruthIE(InfoExtractor): 'title': 'How do Brains Work?', 'upload_date': '20140221', 'timestamp': 1392956024, - 'uploader_id': 'CTTXML' + 'uploader_id': 'CTTXML', }, 'params': { 'skip_download': True, @@ -68,7 +69,7 @@ def _real_extract(self, url): entry_ids.add(entry_id) entries.append({ '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'url': f'kaltura:{partner_id}:{entry_id}', 'ie_key': 'Kaltura', 'title': mobj.group('title'), }) @@ -82,7 +83,7 @@ def _real_extract(self, url): return { '_type': 'url_transparent', 'display_id': display_id, - 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'url': f'kaltura:{partner_id}:{entry_id}', 'ie_key': 'Kaltura', - 'title': title + 'title': title, } diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index 748e8e9087..8a409461a8 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -4,27 +4,33 @@ class CloudflareStreamIE(InfoExtractor): + _SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?' 
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)' - _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE - _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:watch\.)?%s/| - %s - ) - (?P<id>%s) - ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) - _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1'] + _EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video=' + _ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+' + _VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})' + _EMBED_REGEX = [ + rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1', + rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})', + ] _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { 'id': '31c9291ab41fac05471db4e73aa11717', 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', + 'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg', }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://watch.cloudflarestream.com/embed/sdk-iframe-integration.fla9.latest.js?video=0e8e040aec776862e1d632a699edf59e', + 'info_dict': { + 'id': '0e8e040aec776862e1d632a699edf59e', + 'ext': 'mp4', + 'title': '0e8e040aec776862e1d632a699edf59e', + 'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg', }, }, { 'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1', @@ -35,26 +41,47 @@ class CloudflareStreamIE(InfoExtractor): }, { 'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e', 'only_matching': True, + }, { + 'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe', + 'only_matching': True, + }, { + 'url': 'https://watch.cloudflarestream.com/eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJraWQiOiJmYTA0YjViMzQ2NDkwYTM5NWJiNzQ1NWFhZTA2YzYwZSIsInN1YiI6Ijg4ZDQxMDhhMzY0MjA3M2VhYmFhZjg3ZGExODJkMjYzIiwiZXhwIjoxNjAwNjA5MzE5fQ.xkRJwLGkt0nZ%5F0BlPiwU7iW4pqb4lKkznbKfAhGg0tGcxSS6ZBA3lcTUwu7W%2DyCFbnAl%2Dhqk3Fn%5FqeQS%5FQydP27qTHpB9iIFFsMtk1tqzGZV5v4yrYDnwLSKzEKvVd6QwJnfABtxH2JdpSNuWlMUiVXFxGWgjOw6QeTNDDklTQYXV%5FNLV7sErSn5CeOPeRRkdXb%2D8ip%5FVOcfk1nDsFoOo4fctFtGP0wYMyY5ae8nhhatydHwevuvJCcEvEfh%2D4qjq9mCZOodevmtSQ4YWmggf4BxtWnDWYrGW8Otp6oqezrR8oY4%2DbKdV6PaqBj49aJdcls6xK7PmM8%5Fvjy3xfm0Mg', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://upride.cc/incident/shoulder-pass-at-light/', + 'info_dict': { + 'id': 'eaef9dea5159cf968be84241b5cedfe7', + 'ext': 'mp4', + 'title': 'eaef9dea5159cf968be84241b5cedfe7', + 'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + }, }] def _real_extract(self, url): video_id = self._match_id(url) domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' - base_url = 'https://%s/%s/' % (domain, video_id) + base_url = f'https://{domain}/{video_id}/' if '.' in video_id: video_id = self._parse_json(base64.urlsafe_b64decode( video_id.split('.')[1] + '==='), video_id)['sub'] manifest_base_url = base_url + 'manifest/video.' 
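# Sketch of the signed-id handling just above: an 'eyJ...' video id is a JWT
# whose base64url payload carries the real 32-hex id in its 'sub' claim. The
# token built here is fabricated purely for illustration.
import base64
import json

payload = base64.urlsafe_b64encode(
    json.dumps({'sub': '31c9291ab41fac05471db4e73aa11717'}).encode()).decode().rstrip('=')
token = f'eyJhbGciOiJSUzI1NiJ9.{payload}.signature'
# '===' over-pads the segment; Python's decoder tolerates the excess padding
video_id = json.loads(base64.urlsafe_b64decode(token.split('.')[1] + '==='))['sub']
assert video_id == '31c9291ab41fac05471db4e73aa11717'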
- formats = self._extract_m3u8_formats( + formats, subtitles = self._extract_m3u8_formats_and_subtitles( manifest_base_url + 'm3u8', video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(self._extract_mpd_formats( - manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'id': video_id, 'title': video_id, 'thumbnail': base_url + 'thumbnails/thumbnail.jpg', 'formats': formats, + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/cloudy.py b/yt_dlp/extractor/cloudy.py deleted file mode 100644 index 848643e262..0000000000 --- a/yt_dlp/extractor/cloudy.py +++ /dev/null @@ -1,57 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - str_to_int, - unified_strdate, -) - - -class CloudyIE(InfoExtractor): - _IE_DESC = 'cloudy.ec' - _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' - _TESTS = [{ - 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '29832b05028ead1b58be86bf319397ca', - 'info_dict': { - 'id': 'af511e2527aac', - 'ext': 'mp4', - 'title': 'Funny Cats and Animals Compilation june 2013', - 'upload_date': '20130913', - 'view_count': int, - } - }, { - 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.cloudy.ec/embed.php', video_id, query={ - 'id': video_id, - 'playerPage': 1, - 'autoplay': 1, - }) - - info = self._parse_html5_media_entries(url, webpage, video_id)[0] - - webpage = self._download_webpage( - 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) - - if webpage: - info.update({ - 'title': self._search_regex( - r'<h\d[^>]*>([^<]+)<', webpage, 'title'), - 'upload_date': unified_strdate(self._search_regex( - r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, - 'upload date', fatal=False)), - 'view_count': str_to_int(self._search_regex( - r'([\d,.]+) views<', webpage, 'view count', fatal=False)), - }) - - if not info.get('title'): - info['title'] = video_id - - info['id'] = video_id - - return info diff --git a/yt_dlp/extractor/cloudycdn.py b/yt_dlp/extractor/cloudycdn.py new file mode 100644 index 0000000000..6e757d79ee --- /dev/null +++ b/yt_dlp/extractor/cloudycdn.py @@ -0,0 +1,98 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class CloudyCDNIE(InfoExtractor): + _VALID_URL = r'(?:https?:)?//embed\.cloudycdn\.services/(?P<site_id>[^/?#]+)/media/(?P<id>[\w-]+)' + _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})'] + _TESTS = [{ + 'url': 'https://embed.cloudycdn.services/ltv/media/46k_d23-6000-105?', + 'md5': '64f72a360ca530d5ed89c77646c9eee5', + 'info_dict': { + 'id': '46k_d23-6000-105', + 'ext': 'mp4', + 'timestamp': 1700589151, + 'duration': 1442, + 'upload_date': '20231121', + 'title': 'D23-6000-105_cetstud', + 'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg', + }, + }, { + 'url': 'https://embed.cloudycdn.services/izm/media/26e_lv-8-5-1', + 'md5': '798828a479151e2444d8dcfbec76e482', + 'info_dict': { + 'id': '26e_lv-8-5-1', + 'ext': 'mp4', + 'title': 'LV-8-5-1', + 'timestamp': 1669767167, + 
'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/488306/placeholder1679423604.jpg', + 'duration': 1205, + 'upload_date': '20221130', + }, + }, { + # Video-only m3u8 formats need manual fixup + 'url': 'https://embed.cloudycdn.services/ltv/media/08j_d24-6000-074', + 'md5': 'fc472e40f6e6238446509be411c920e2', + 'info_dict': { + 'id': '08j_d24-6000-074', + 'ext': 'mp4', + 'upload_date': '20240620', + 'duration': 1673, + 'title': 'D24-6000-074-cetstud', + 'timestamp': 1718902233, + 'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/788392/placeholder1718903938.jpg', + }, + 'params': {'format': 'bv'}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.tavaklase.lv/video/es-esmu-mina-um-2/', + 'md5': '63074e8e6c84ac2a01f2fb8bf03b8f43', + 'info_dict': { + 'id': 'cqd_lib-2', + 'ext': 'mp4', + 'upload_date': '20230223', + 'duration': 629, + 'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/518407/placeholder1678748124.jpg', + 'timestamp': 1677181513, + 'title': 'LIB-2', + }, + }] + + def _real_extract(self, url): + site_id, video_id = self._match_valid_url(url).group('site_id', 'id') + + data = self._download_json( + f'https://player.cloudycdn.services/player/{site_id}/media/{video_id}/', + video_id, data=urlencode_postdata({ + 'version': '6.4.0', + 'referer': url, + })) + + formats, subtitles = [], {} + for m3u8_url in traverse_obj(data, ('source', 'sources', ..., 'src', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False) + for fmt in fmts: + if re.search(r'chunklist_b\d+_vo_', fmt['url']): + fmt['acodec'] = 'none' + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('name', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'thumbnail': ('source', 'poster', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/clubic.py b/yt_dlp/extractor/clubic.py index 403e44aafd..c908e61a1e 100644 --- a/yt_dlp/extractor/clubic.py +++ b/yt_dlp/extractor/clubic.py @@ -6,6 +6,7 @@ class ClubicIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' _TESTS = [{ @@ -17,7 +18,7 @@ class ClubicIE(InfoExtractor): 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', 'description': 're:Gueule de bois chez Nokia. 
Le constructeur a indiqué cette.*', 'thumbnail': r're:^http://img\.clubic\.com/.*\.jpg$', - } + }, }, { 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html', 'only_matching': True, @@ -26,7 +27,7 @@ class ClubicIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id + player_url = f'http://player.m6web.fr/v1/player/clubic/{video_id}.html' player_page = self._download_webpage(player_url, video_id) config = self._parse_json(self._search_regex( diff --git a/yt_dlp/extractor/clyp.py b/yt_dlp/extractor/clyp.py index 273d0025f0..2702427c86 100644 --- a/yt_dlp/extractor/clyp.py +++ b/yt_dlp/extractor/clyp.py @@ -58,13 +58,13 @@ def _real_extract(self, url): query['token'] = token metadata = self._download_json( - 'https://api.clyp.it/%s' % audio_id, audio_id, query=query) + f'https://api.clyp.it/{audio_id}', audio_id, query=query) formats = [] for secure in ('', 'Secure'): for ext in ('Ogg', 'Mp3'): - format_id = '%s%s' % (secure, ext) - format_url = metadata.get('%sUrl' % format_id) + format_id = f'{secure}{ext}' + format_url = metadata.get(f'{format_id}Url') if format_url: formats.append({ 'url': format_url, diff --git a/yt_dlp/extractor/cmt.py b/yt_dlp/extractor/cmt.py index 8aed7708b1..8e53b7fbf8 100644 --- a/yt_dlp/extractor/cmt.py +++ b/yt_dlp/extractor/cmt.py @@ -1,9 +1,10 @@ from .mtv import MTVIE -# TODO Remove - Reason: Outdated Site +# TODO: Remove - Reason: Outdated Site class CMTIE(MTVIE): # XXX: Do not subclass from concrete IE + _WORKING = False IE_NAME = 'cmt.com' _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' @@ -51,4 +52,4 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) mgid = self._extract_mgid(webpage, url) - return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) + return self.url_result(f'http://media.mtvnservices.com/embed/{mgid}') diff --git a/yt_dlp/extractor/cnbc.py b/yt_dlp/extractor/cnbc.py index 68fd025b7c..cedfd3ef97 100644 --- a/yt_dlp/extractor/cnbc.py +++ b/yt_dlp/extractor/cnbc.py @@ -1,66 +1,97 @@ from .common import InfoExtractor -from ..utils import smuggle_url - - -class CNBCIE(InfoExtractor): - _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://video.cnbc.com/gallery/?video=3000503714', - 'info_dict': { - 'id': '3000503714', - 'ext': 'mp4', - 'title': 'Fighting zombies is big business', - 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', - 'timestamp': 1459332000, - 'upload_date': '20160330', - 'uploader': 'NBCU-CNBC', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id, - {'force_smil_url': True}), - 'id': video_id, - } +from ..utils import int_or_none, parse_iso8601, str_or_none, url_or_none +from ..utils.traversal import traverse_obj class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' - _TEST = { - 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + _VALID_URL = 
r'https?://(?:www\.)?cnbc\.com/video/(?:[^/?#]+/)+(?P<id>[^./?#&]+)\.html' + + _TESTS = [{ + 'url': 'https://www.cnbc.com/video/2023/12/07/mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand.html', 'info_dict': { - 'id': '7000031301', 'ext': 'mp4', - 'title': "Trump: I don't necessarily agree with raising rates", - 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', - 'timestamp': 1531958400, - 'upload_date': '20180719', - 'uploader': 'NBCU-CNBC', + 'id': '107344774', + 'display_id': 'mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand', + 'modified_timestamp': 1702053483, + 'timestamp': 1701977810, + 'channel': 'News Videos', + 'upload_date': '20231207', + 'description': 'md5:882c001d85cb43d7579b514307b3e78b', + 'release_timestamp': 1701977375, + 'modified_date': '20231208', + 'release_date': '20231207', + 'duration': 65, + 'creators': ['Sean Conlon'], + 'title': 'Here\'s a first look at McDonald\'s new spinoff brand, CosMc\'s', + 'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107344192-1701894812493-CosMcsskyHero_2336x1040_hero-desktop.jpg?v=1701894855', }, - 'params': { - 'skip_download': True, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://www.cnbc.com/video/2023/12/08/jim-cramer-shares-his-take-on-seattles-tech-scene.html', + 'info_dict': { + 'creators': ['Jim Cramer'], + 'channel': 'Mad Money with Jim Cramer', + 'description': 'md5:72925be21b952e95eba51178dddf4e3e', + 'duration': 299.0, + 'ext': 'mp4', + 'id': '107345451', + 'display_id': 'jim-cramer-shares-his-take-on-seattles-tech-scene', + 'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345481-1702079431MM-B-120823.jpg?v=1702079430', + 'timestamp': 1702080139, + 'title': 'Jim Cramer shares his take on Seattle\'s tech scene', + 'release_date': '20231208', + 'upload_date': '20231209', + 'modified_timestamp': 1702080139, + 'modified_date': '20231209', + 'release_timestamp': 1702073551, }, - } + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://www.cnbc.com/video/2023/12/08/the-epicenter-of-ai-is-in-seattle-says-jim-cramer.html', + 'info_dict': { + 'creators': ['Jim Cramer'], + 'channel': 'Mad Money with Jim Cramer', + 'description': 'md5:72925be21b952e95eba51178dddf4e3e', + 'duration': 113.0, + 'ext': 'mp4', + 'id': '107345474', + 'display_id': 'the-epicenter-of-ai-is-in-seattle-says-jim-cramer', + 'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345486-Screenshot_2023-12-08_at_70339_PM.png?v=1702080248', + 'timestamp': 1702080535, + 'title': 'The epicenter of AI is in Seattle, says Jim Cramer', + 'release_timestamp': 1702077347, + 'modified_timestamp': 1702080535, + 'release_date': '20231208', + 'upload_date': '20231209', + 'modified_date': '20231209', + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }] def _real_extract(self, url): - path, display_id = self._match_valid_url(url).groups() - video_id = self._download_json( - 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ - 'query': '''{ - page(path: "%s") { - vcpsId - } -}''' % path, - })['data']['page']['vcpsId'] - return self.url_result( - 'http://video.cnbc.com/gallery/?video=%d' % video_id, - CNBCIE.ie_key()) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._search_json(r'window\.__s_data=', webpage, 'video data', display_id) + + player_data = traverse_obj(data, ( + 'page', 'page', 'layout', ..., 'columns', ..., 'modules', + lambda _, v: v['name'] == 'clipPlayer', 'data', {dict}), get_all=False) + + 
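# Toy reproduction of the lookup above, showing how the traverse_obj path
# drills through layout -> columns -> modules and keeps the first module named
# 'clipPlayer' (window.__s_data structure heavily abbreviated here):
from yt_dlp.utils import traverse_obj

data = {'page': {'page': {'layout': [
    {'columns': [{'modules': [{'name': 'banner', 'data': {}}]}]},
    {'columns': [{'modules': [{'name': 'clipPlayer', 'data': {'title': 'Example clip'}}]}]},
]}}}
player_data = traverse_obj(data, (
    'page', 'page', 'layout', ..., 'columns', ..., 'modules',
    lambda _, v: v['name'] == 'clipPlayer', 'data', {dict}), get_all=False)
assert player_data == {'title': 'Example clip'}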
return { + 'id': display_id, + 'display_id': display_id, + 'formats': self._extract_akamai_formats(player_data['playbackURL'], display_id), + **self._search_json_ld(webpage, display_id, fatal=False), + **traverse_obj(player_data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'creators': ('author', ..., 'name', {str}), + 'timestamp': ('datePublished', {parse_iso8601}), + 'release_timestamp': ('uploadDate', {parse_iso8601}), + 'modified_timestamp': ('dateLastPublished', {parse_iso8601}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'channel': ('section', 'title', {str}), + }), + } diff --git a/yt_dlp/extractor/cnn.py b/yt_dlp/extractor/cnn.py index 61b62fae9f..fe7615a891 100644 --- a/yt_dlp/extractor/cnn.py +++ b/yt_dlp/extractor/cnn.py @@ -26,7 +26,7 @@ class CNNIE(TurnerBaseIE): 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', 'ext': 'mp4', 'title': "Student's epic speech stuns new freshmen", - 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", + 'description': 'A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from "2001: A Space Odyssey."', 'upload_date': '20130821', }, 'expected_warnings': ['Failed to download m3u8 information'], @@ -161,7 +161,7 @@ class CNNIndonesiaIE(InfoExtractor): 'release_timestamp': 1662859088, 'release_date': '20220911', 'uploader': 'Asfahan Yahsyi', - } + }, }, { 'url': 'https://www.cnnindonesia.com/internasional/20220911104341-139-846189/video-momen-charles-disambut-meriah-usai-dilantik-jadi-raja-inggris', 'info_dict': { @@ -178,7 +178,7 @@ class CNNIndonesiaIE(InfoExtractor): 'release_date': '20220911', 'uploader': 'REUTERS', 'release_timestamp': 1662869995, - } + }, }] def _real_extract(self, url): @@ -194,5 +194,5 @@ def _real_extract(self, url): '_type': 'url_transparent', 'url': embed_url, 'upload_date': upload_date, - 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')) + 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')), }) diff --git a/yt_dlp/extractor/comedycentral.py b/yt_dlp/extractor/comedycentral.py index 05fc9f2b50..27d295bb38 100644 --- a/yt_dlp/extractor/comedycentral.py +++ b/yt_dlp/extractor/comedycentral.py @@ -2,7 +2,7 @@ class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<id>[0-9a-z]{6})' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ @@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, { 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', 'only_matching': True, + }, { + 'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas', + 'only_matching': True, }] diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ebacc87bc0..187f73e7b9 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1,5 +1,6 @@ import base64 import collections +import functools import getpass import hashlib import http.client @@ -13,6 +14,7 @@ import os import random import re +import subprocess import sys import time import types @@ -20,10 +22,22 @@ import urllib.request import xml.etree.ElementTree -from ..compat 
import functools # isort: split -from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_os_name, + urllib_req_to_req, +) from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media +from ..downloader.hls import HlsFD +from ..networking import HEADRequest, Request +from ..networking.exceptions import ( + HTTPError, + IncompleteRead, + network_exceptions, +) +from ..networking.impersonate import ImpersonateTarget from ..utils import ( IDENTITY, JSON_LD_RE, @@ -32,8 +46,8 @@ FormatSorter, GeoRestrictedError, GeoUtils, - HEADRequest, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -46,7 +60,6 @@ determine_ext, dict_get, encode_data_uri, - error_to_compat_str, extract_attributes, filter_dict, fix_xml_ampersands, @@ -56,7 +69,7 @@ join_nonempty, js_to_json, mimetype2ext, - network_exceptions, + netrc_from_content, orderedSet, parse_bitrate, parse_codecs, @@ -66,7 +79,6 @@ parse_resolution, sanitize_filename, sanitize_url, - sanitized_Request, smuggle_url, str_or_none, str_to_int, @@ -78,8 +90,6 @@ unescapeHTML, unified_strdate, unified_timestamp, - update_Request, - update_url_query, url_basename, url_or_none, urlhandle_detect_ext, @@ -132,6 +142,7 @@ class InfoExtractor: is parsed from a string (in case of fragmented media) for MSS - URL of the ISM manifest. + * request_data Data to send in POST request to the URL * manifest_url The URL of the manifest file in case of fragmented media: @@ -159,12 +170,12 @@ class InfoExtractor: Automatically calculated from width and height * dynamic_range The dynamic range of the video. One of: "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV" - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s + * tbr Average bitrate of audio and video in kbps (1000 bits/sec) + * abr Average audio bitrate in kbps (1000 bits/sec) * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * audio_channels Number of audio channels - * vbr Average video bitrate in KBit/s + * vbr Average video bitrate in kbps (1000 bits/sec) * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format @@ -219,10 +230,18 @@ class InfoExtractor: width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. - * has_drm The format has DRM and cannot be downloaded. Boolean + * has_drm True if the format has DRM and cannot be downloaded. + 'maybe' if the format may have DRM and has to be tested before download. * extra_param_to_segment_url A query string to append to each fragment's URL, or to update each existing query string - with. Only applied by the native HLS/DASH downloaders. + with. If it is an HLS stream with an AES-128 decryption key, + the query paramaters will be passed to the key URI as well, + unless there is an `extra_param_to_key_url` given, + or unless an external key URI is provided via `hls_aes`. + Only applied by the native HLS/DASH downloaders. + * extra_param_to_key_url A query string to append to the URL + of the format's HLS AES-128 decryption key. + Only applied by the native HLS downloader. 
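# A minimal illustrative sketch of how an extractor might populate the two
# fields documented above. The URL, token values and format_id here are
# hypothetical; only the two field names come from the documentation:
formats = []
formats.append({
    'format_id': 'hls-1080',
    'url': 'https://example.com/video/master.m3u8',
    'ext': 'mp4',
    'protocol': 'm3u8_native',
    # appended to every fragment URL by the native HLS/DASH downloaders
    'extra_param_to_segment_url': 'token=abc123',
    # appended only to the AES-128 decryption-key URL by the native HLS downloader
    'extra_param_to_key_url': 'key_token=xyz789',
})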
* hls_aes A dictionary of HLS AES-128 decryption information used by the native HLS downloader to override the values in the media playlist when an '#EXT-X-KEY' tag @@ -234,7 +253,10 @@ class InfoExtractor: * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads - * ffmpeg_args Extra arguments for ffmpeg downloader + * ffmpeg_args Extra arguments for ffmpeg downloader (input) + * ffmpeg_args_out Extra arguments for ffmpeg downloader (output) + * is_dash_periods Whether the format is a result of merging + multiple DASH periods. RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -248,7 +270,7 @@ class InfoExtractor: direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. - display_id An alternative identifier for the video, not necessarily + display_id: An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" @@ -266,7 +288,7 @@ class InfoExtractor: description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. - creator: The creator of the video. + creators: List of creators of the video. timestamp: UNIX timestamp of the moment the video was uploaded upload_date: Video upload date in UTC (YYYYMMDD). If not explicitly set, calculated from timestamp @@ -274,6 +296,9 @@ class InfoExtractor: If it is not clear whether to use timestamp or this, use the former release_date: The date (YYYYMMDD) when the video was released in UTC. If not explicitly set, calculated from release_timestamp + release_year: Year (YYYY) as integer when the video or album was released. + To be used if no exact release date is known. + If not explicitly set, calculated from release_date. modified_timestamp: UNIX timestamp of the moment the video was last modified. modified_date: The date (YYYYMMDD) when the video was last modified in UTC. If not explicitly set, calculated from modified_timestamp @@ -285,6 +310,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. 
"tag" is usually a language code, and @@ -313,6 +339,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -324,8 +355,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to yt-dlp it should allow to get the same result again. (It will be set @@ -349,6 +380,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string @@ -357,6 +392,7 @@ class InfoExtractor: 'private', 'premium_only', 'subscriber_only', 'needs_auth', 'unlisted' or 'public'. Use 'InfoExtractor._availability' to set it + media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer" _old_archive_ids: A list of old archive ids needed for backward compatibility _format_sort_fields: A list of fields to use for sorting formats __post_extractor: A function to be called just before the metadata is @@ -396,17 +432,16 @@ class InfoExtractor: track_number: Number of the track within an album or a disc, as an integer. track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), as a unicode string. - artist: Artist(s) of the track. - genre: Genre(s) of the track. + artists: List of artists of the track. + composers: List of composers of the piece. + genres: List of genres of the track. album: Title of the album the track belongs to. album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). - album_artist: List of all artists appeared on the album (e.g. - "Ash Borer / Fell Voices" or "Various Artists", useful for splits - and compilations). + album_artists: List of all artists appeared on the album. + E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"]. + Useful for splits and compilations. disc_number: Number of the disc or other physical medium the track belongs to, as an integer. - release_year: Year (YYYY) when the album was released. 
- composer: Composer of the piece The following fields should only be set for clips that should be cut from the original video: @@ -417,6 +452,18 @@ class InfoExtractor: rows: Number of rows in each storyboard fragment, as an integer columns: Number of columns in each storyboard fragment, as an integer + The following fields are deprecated and should not be set by new code: + composer: Use "composers" instead. + Composer(s) of the piece, comma-separated. + artist: Use "artists" instead. + Artist(s) of the track, comma-separated. + genre: Use "genres" instead. + Genre(s) of the track, comma-separated. + album_artist: Use "album_artists" instead. + All artists appeared on the album, comma-separated. + creator: Use "creators" instead. + The creator of the video. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -460,8 +507,8 @@ class InfoExtractor: Subclasses of this should also be added to the list of extractors and - should define a _VALID_URL regexp and, re-define the _real_extract() and - (optionally) _real_initialize() methods. + should define _VALID_URL as a regexp or a Sequence of regexps, and + re-define the _real_extract() and (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs @@ -524,7 +571,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -551,8 +598,8 @@ def _match_valid_url(cls, url): # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None) @classmethod def suitable(cls, url): @@ -707,11 +754,11 @@ def extract(self, url): except UnsupportedError: raise except ExtractorError as e: - e.video_id = e.video_id or self.get_temp_id(url), - e.ie = e.ie or self.IE_NAME, + e.video_id = e.video_id or self.get_temp_id(url) + e.ie = e.ie or self.IE_NAME e.traceback = e.traceback or sys.exc_info()[2] raise - except http.client.IncompleteRead as e: + except IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -726,8 +773,8 @@ def __maybe_fake_ip_and_retry(self, countries): self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._x_forwarded_for_ip: self.report_warning( - 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) + 'Video is geo restricted. 
Retrying extraction with fake IP ' + f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.') return True return False @@ -770,22 +817,28 @@ def IE_NAME(cls): @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, urllib.error.HTTPError) + assert isinstance(err, HTTPError) if expected_status is None: return False elif callable(expected_status): - return expected_status(err.code) is True + return expected_status(err.status) is True else: - return err.code in variadic(expected_status) + return err.status in variadic(expected_status) - def _create_request(self, url_or_request, data=None, headers=None, query=None): + def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None): if isinstance(url_or_request, urllib.request.Request): - return update_Request(url_or_request, data=data, headers=headers, query=query) - if query: - url_or_request = update_url_query(url_or_request, query) - return sanitized_Request(url_or_request, data, headers or {}) + self._downloader.deprecation_warning( + 'Passing a urllib.request.Request to _create_request() is deprecated. ' + 'Use yt_dlp.networking.common.Request instead.') + url_or_request = urllib_req_to_req(url_or_request) + elif not isinstance(url_or_request, Request): + url_or_request = Request(url_or_request) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): + url_or_request.update(data=data, headers=headers, query=query, extensions=extensions) + return url_or_request + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, + headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False): """ Return the response handle. @@ -794,7 +847,7 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa if not self._downloader._first_webpage_request: sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: - self.to_screen('Sleeping %s seconds ...' 
% sleep_interval) + self.to_screen(f'Sleeping {sleep_interval} seconds ...') time.sleep(sleep_interval) else: self._downloader._first_webpage_request = False @@ -816,24 +869,42 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa headers = (headers or {}).copy() headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip) + extensions = {} + + if impersonate in (True, ''): + impersonate = ImpersonateTarget() + requested_targets = [ + t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t) + for t in variadic(impersonate) + ] if impersonate else [] + + available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None) + if available_target: + extensions['impersonate'] = available_target + elif requested_targets: + message = 'The extractor is attempting impersonation, but ' + message += ( + 'no impersonate target is available' if not str(impersonate) + else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"') + info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation ' + 'for information on installing the required dependencies') + if require_impersonation: + raise ExtractorError(f'{message}; {info_msg}', expected=True) + self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True) + try: - return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) + return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions)) except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError): + if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of <https://bugs.python.org/issue15002> - # introduced in Python 3.4.1. - err.fp._error = err - return err.fp + return err.response if errnote is False: return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = f'{errnote}: {error_to_compat_str(err)}' + errmsg = f'{errnote}: {err}' if fatal: raise ExtractorError(errmsg, cause=err) else: @@ -841,13 +912,14 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa return False def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, - encoding=None, data=None, headers={}, query={}, expected_status=None): + encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): """ Return a tuple (page content as string, URL handle). Arguments: url_or_request -- plain text URL as a string or - a urllib.request.Request object + a yt_dlp.networking.Request object video_id -- Video/playlist/item identifier (string) Keyword arguments: @@ -872,17 +944,27 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote= returning True if it should be accepted Note that this argument does not affect success status codes (2xx) which are always accepted. + impersonate -- the impersonate target. 
Can be any of the following entities: + - an instance of yt_dlp.networking.impersonate.ImpersonateTarget + - a string in the format of CLIENT[:OS] + - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances + - a boolean value; True means any impersonate target is sufficient + require_impersonation -- flag to toggle whether the request should raise an error + if impersonation is not possible (bool, default: False) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, str): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, + headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data) return (content, urlh) @staticmethod @@ -911,7 +993,7 @@ def __check_blocked(self, content): r'<iframe src="([^"]+)"', content, 'Websense information URL', default=None) if blocked_iframe: - msg += ' Visit %s for more details' % blocked_iframe + msg += f' Visit {blocked_iframe} for more details' raise ExtractorError(msg, expected=True) if '<title>The URL you requested has been blocked' in first_block: msg = ( @@ -921,7 +1003,7 @@ def __check_blocked(self, content): r'
</h1><p>(.*?)</p>
', content, 'block message', default=None) if block_msg: - msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' ')) raise ExtractorError(msg, expected=True) if ('TTK :: Доступ к ресурсу ограничен' in content and 'blocklist.rkn.gov.ru' in content): @@ -930,11 +1012,13 @@ def __check_blocked(self, content): 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) - def _request_dump_filename(self, url, video_id): - basen = f'{video_id}_{url}' + def _request_dump_filename(self, url, video_id, data=None): + if data is not None: + data = hashlib.md5(data).hexdigest() + basen = join_nonempty(video_id, data, url, delim='_') trim_length = self.get_param('trim_file_name') or 240 if len(basen) > trim_length: - h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + h = '___' + hashlib.md5(basen.encode()).hexdigest() basen = basen[:trim_length - len(h)] + h filename = sanitize_filename(f'{basen}.dump', restricted=True) # Working around MAX_PATH limitation on Windows (see @@ -953,16 +1037,19 @@ def __decode_webpage(self, webpage_bytes, encoding, headers): except LookupError: return webpage_bytes.decode('utf-8', 'replace') - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, + prefix=None, encoding=None, data=None): webpage_bytes = urlh.read() if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) + self.to_screen('Dumping request to ' + urlh.url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.geturl(), video_id) + if isinstance(url_or_request, Request): + data = self._create_request(url_or_request, data).data + filename = self._request_dump_filename(urlh.url, video_id, data) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -982,7 +1069,7 @@ def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, er if transform_source: xml_string = transform_source(xml_string) try: - return compat_etree_fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode()) except xml.etree.ElementTree.ParseError as ve: self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve) @@ -1007,20 +1094,23 @@ def parse(ie, content, *args, errnote=errnote, **kwargs): return getattr(ie, parser)(content, *args, **kwargs) def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): res = self._download_webpage_handle( url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, expected_status=expected_status) + data=data, headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if res is False: return res content, urlh = res return parse(self, content, video_id, 
transform_source=transform_source, fatal=fatal, errnote=errnote), urlh def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.full_url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: @@ -1040,6 +1130,8 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote, 'headers': headers, 'query': query, 'expected_status': expected_status, + 'impersonate': impersonate, + 'require_impersonation': require_impersonation, } if parser is None: kwargs.pop('transform_source') @@ -1094,7 +1186,7 @@ def _download_webpage( while True: try: return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) - except http.client.IncompleteRead as e: + except IncompleteRead as e: try_count += 1 if try_count >= tries: raise e @@ -1128,11 +1220,11 @@ def report_drm(self, video_id, partial=NO_DEFAULT): def report_extraction(self, id_or_name): """Report information extraction.""" - self.to_screen('%s: Extracting information' % id_or_name) + self.to_screen(f'{id_or_name}: Extracting information') def report_download_webpage(self, video_id): """Report webpage download.""" - self.to_screen('%s: Downloading webpage' % video_id) + self.to_screen(f'{video_id}: Downloading webpage') def report_age_confirmation(self): """Report attempt to confirm age.""" @@ -1238,9 +1330,9 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f elif default is not NO_DEFAULT: return default elif fatal: - raise RegexNotFoundError('Unable to extract %s' % _name) + raise RegexNotFoundError(f'Unable to extract {_name}') else: - self.report_warning('unable to extract %s' % _name + bug_reports_message()) + self.report_warning(f'unable to extract {_name}' + bug_reports_message()) return None def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', @@ -1280,45 +1372,51 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None netrc_machine = netrc_machine or self._NETRC_MACHINE - if self.get_param('usenetrc', False): - try: - netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') - if os.path.isdir(netrc_file): - netrc_file = os.path.join(netrc_file, '.netrc') - info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (OSError, netrc.NetrcParseError) as err: - self.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) + cmd = self.get_param('netrc_cmd') + if cmd: + cmd = cmd.replace('{}', netrc_machine) + self.to_screen(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + raise OSError(f'Command returned error code {ret}') + info = 
netrc_from_content(stdout).authenticators(netrc_machine) - return username, password + elif self.get_param('usenetrc', False): + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(netrc_file).authenticators(netrc_machine) + + else: + return None, None + if not info: + self.to_screen(f'No authenticators for {netrc_machine}') + return None, None + + self.write_debug(f'Using netrc for {netrc_machine} authentication') + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) First look for the manually specified credentials using username_option and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. + are available try the netrc_cmd if it is defined or look in the + netrc file using the netrc_machine or _NETRC_MACHINE value. If there's no info available, return (None, None) """ - # Attempt to use provided username and password or .netrc data username = self.get_param(username_option) if username is not None: password = self.get_param(password_option) else: - username, password = self._get_netrc_login_info(netrc_machine) - + try: + username, password = self._get_netrc_login_info(netrc_machine) + except (OSError, netrc.NetrcParseError) as err: + self.report_warning(f'Failed to parse .netrc: {err}') + return None, None return username, password def _get_tfa_info(self, note='two-factor verification code'): @@ -1333,14 +1431,14 @@ def _get_tfa_info(self, note='two-factor verification code'): if tfa is not None: return tfa - return getpass.getpass('Type %s and press [Return]: ' % note) + return getpass.getpass(f'Type {note} and press [Return]: ') # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' - property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' - % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) + property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format( + prop=re.escape(prop), sep='(?::|[:-])') template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -1349,14 +1447,14 @@ def _og_regexes(prop): @staticmethod def _meta_regex(prop): - return r'''(?isx)]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) + return rf'''(?isx)]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1) + [^>]+?content=(["\'])(?P.*?)\2''' def _og_search_property(self, prop, html, name=None, **kargs): prop = variadic(prop) if name is None: - name = 'OpenGraph %s' % prop[0] + name = f'OpenGraph {prop[0]}' og_regexes = [] for p in prop: og_regexes.extend(self._og_regexes(p)) @@ -1479,7 +1577,7 @@ def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, def elif fatal: raise RegexNotFoundError('Unable to extract JSON-LD') else: - self.report_warning('unable to extract JSON-LD %s' % bug_reports_message()) + self.report_warning(f'unable to extract JSON-LD {bug_reports_message()}') return {} def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): @@ -1501,8 +1599,8 @@ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): } def 
is_type(e, *expected_types): - type = variadic(traverse_obj(e, '@type')) - return any(x in type for x in expected_types) + type_ = variadic(traverse_obj(e, '@type')) + return any(x in type_ for x in expected_types) def extract_interaction_type(e): interaction_type = e.get('interactionType') @@ -1531,7 +1629,7 @@ def extract_interaction_statistic(e): count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) if not count_kind: continue - count_key = '%s_count' % count_kind + count_key = f'{count_kind}_count' if info.get(count_key) is not None: continue info[count_key] = interaction_count @@ -1543,7 +1641,7 @@ def extract_chapter_information(e): 'end_time': part.get('endOffset'), } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip'] for idx, (last_c, current_c, next_c) in enumerate(zip( - [{'end_time': 0}] + chapters, chapters, chapters[1:])): + [{'end_time': 0}, *chapters], chapters, chapters[1:])): current_c['end_time'] = current_c['end_time'] or next_c['start_time'] current_c['start_time'] = current_c['start_time'] or last_c['end_time'] if None in current_c.values(): @@ -1652,17 +1750,21 @@ def traverse_json_ld(json_ld, at_top_level=True): traverse_json_ld(json_ld) return filter_dict(info) - def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): - return self._parse_json( - self._search_regex( - r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', - webpage, 'next.js data', fatal=fatal, **kw), - video_id, transform_source=transform_source, fatal=fatal) + def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw): + if default == '{}': + self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead') + default = {} + if default is not NO_DEFAULT: + fatal = False + + return self._search_json( + r']+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', + video_id, end_pattern='', fatal=fatal, default=default, **kw) def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) - FUNCTION_RE = r'\(function\((?P.*?)\){return\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' + FUNCTION_RE = r'\(function\((?P.*?)\){.*?\breturn\s+(?P{.*?})\s*;?\s*}\((?P.*?)\)' js, arg_keys, arg_vals = self._search_regex( (rf'', rf'{rectx}\(.*?{FUNCTION_RE}'), webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), @@ -1680,9 +1782,9 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal def _hidden_inputs(html): html = re.sub(r'', '', html) hidden_inputs = {} - for input in re.findall(r'(?i)(]+>)', html): - attrs = extract_attributes(input) - if not input: + for input_el in re.findall(r'(?i)(]+>)', html): + attrs = extract_attributes(input_el) + if not input_el: continue if attrs.get('type') not in ('hidden', 'submit'): continue @@ -1694,8 +1796,8 @@ def _hidden_inputs(html): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
<form>.+?)</form>' % form_id,
-            html, '%s form' % form_id, group='form')
+            rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>
', + html, f'{form_id} form', group='form') return self._hidden_inputs(form) @classproperty(cache=True) @@ -1725,7 +1827,7 @@ def _check_formats(self, formats, video_id): formats[:] = filter( lambda f: self._is_valid_url( f['url'], video_id, - item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), + item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'), formats) @staticmethod @@ -1741,15 +1843,14 @@ def _remove_duplicate_formats(formats): def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid - if not (url.startswith('http://') or url.startswith('https://')): + if not url.startswith(('http://', 'https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) + self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers) return True except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping: %s' - % (video_id, item, error_to_compat_str(e.cause))) + f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}') return False def http_scheme(self): @@ -1788,7 +1889,7 @@ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality= return [] manifest, urlh = res - manifest_url = urlh.geturl() + manifest_url = urlh.url return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, @@ -1803,8 +1904,8 @@ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') if akamai_pv is not None and ';' in akamai_pv.text: - playerVerificationChallenge = akamai_pv.text.split(';')[0] - if playerVerificationChallenge.strip() != '': + player_verification_challenge = akamai_pv.text.split(';')[0] + if player_verification_challenge.strip() != '': return [] formats = [] @@ -1850,7 +1951,7 @@ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, if not media_url: continue manifest_url = ( - media_url if media_url.startswith('http://') or media_url.startswith('https://') + media_url if media_url.startswith(('http://', 'https://')) else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest @@ -1911,7 +2012,7 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m def _report_ignoring_subs(self, name): self.report_warning(bug_reports_message( f'Ignoring subtitle tracks found in the {name} manifest; ' - 'if any subtitle tracks are missing,' + 'if any subtitle tracks are missing,', ), only_once=True) def _extract_m3u8_formats(self, *args, **kwargs): @@ -1947,7 +2048,7 @@ def _extract_m3u8_formats_and_subtitles( return [], {} m3u8_doc, urlh = res - m3u8_url = urlh.geturl() + m3u8_url = urlh.url return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, @@ -1961,11 +2062,7 @@ def _parse_m3u8_formats_and_subtitles( errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), 
m3u8_doc) + has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) @@ -2006,7 +2103,7 @@ def _extract_m3u8_playlist_indices(*args, **kwargs): formats = [{ 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, - 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'), + 'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'), 'ext': ext, 'protocol': entry_protocol, 'preference': preference, @@ -2063,6 +2160,7 @@ def extract_media(x_media_line): 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, 'vcodec': 'none' if media_type == 'AUDIO' else None, } for idx in _extract_m3u8_playlist_indices(manifest_url)) @@ -2122,7 +2220,13 @@ def build_stream_name(): 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, } + + # YouTube-specific + if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'): + f['language'] = yt_audio_content_id.split('.')[0] + resolution = last_stream_inf.get('RESOLUTION') if resolution: mobj = re.search(r'(?P\d+)[xX](?P\d+)', resolution) @@ -2202,7 +2306,9 @@ def _extract_mpd_vod_duration( mpd_url, video_id, note='Downloading MPD VOD manifest' if note is None else note, errnote='Failed to download VOD manifest' if errnote is None else errnote, - fatal=False, data=data, headers=headers, query=query) or {} + fatal=False, data=data, headers=headers, query=query) + if not isinstance(mpd_doc, xml.etree.ElementTree.Element): + return None return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration'))) @staticmethod @@ -2214,7 +2320,7 @@ def _xpath_ns(path, namespace=None): if not c or c == '.': out.append(c) else: - out.append('{%s}%s' % (namespace, c)) + out.append(f'{{{namespace}}}{c}') return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): @@ -2225,18 +2331,10 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4 if res is False: assert not fatal return [], {} - smil, urlh = res - smil_url = urlh.geturl() - namespace = self._parse_smil_namespace(smil) - - fmts = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subs = self._parse_smil_subtitles( - smil, namespace=namespace) - - return fmts, subs + return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params, + namespace=self._parse_smil_namespace(smil)) def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) @@ -2250,7 +2348,7 @@ def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): return {} smil, urlh = res - smil_url = urlh.geturl() + smil_url = urlh.url return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) @@ -2262,9 +2360,8 @@ def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) - formats = self._parse_smil_formats( + formats, subtitles = self._parse_smil_formats_and_subtitles( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) video_id = os.path.splitext(url_basename(smil_url))[0] title = None @@ -2303,7 +2400,14 @@ def 
_parse_smil_namespace(self, smil): return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats(self, *args, **kwargs): + fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -2311,14 +2415,16 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para base = b break - formats = [] + formats, subtitles = [], {} rtmp_count = 0 http_count = 0 m3u8_count = 0 imgs_count = 0 srcs = set() - media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) + media = itertools.chain.from_iterable( + smil.findall(self._xpath_ns(arg, namespace)) + for arg in ['.//video', './/audio', './/media']) for medium in media: src = medium.get('src') if not src or src in srcs: @@ -2355,12 +2461,13 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para }) continue - src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src) + src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src) src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + self._merge_subtitles(m3u8_subs, target=subtitles) if len(m3u8_formats) == 1: m3u8_count += 1 m3u8_formats[0].update({ @@ -2381,11 +2488,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src_url, video_id, mpd_id='dash', fatal=False)) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subtitles) elif re.search(r'\.ism/[Mm]anifest', src_url): - formats.extend(self._extract_ism_formats( - src_url, video_id, ism_id='mss', fatal=False)) + ism_formats, ism_subs = self._extract_ism_formats_and_subtitles( + src_url, video_id, ism_id='mss', fatal=False) + formats.extend(ism_formats) + self._merge_subtitles(ism_subs, target=subtitles) elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ @@ -2406,7 +2517,7 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para imgs_count += 1 formats.append({ - 'format_id': 'imagestream-%d' % (imgs_count), + 'format_id': f'imagestream-{imgs_count}', 'url': src, 'ext': mimetype2ext(medium.get('type')), 'acodec': 'none', @@ -2416,12 +2527,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para 'format_note': 'SMIL storyboards', }) - return formats + smil_subs = self._parse_smil_subtitles(smil, namespace=namespace) + self._merge_subtitles(smil_subs, target=subtitles) + + return formats, subtitles def _parse_smil_subtitles(self, smil, 
namespace=None, subtitles_lang='en'): urls = [] subtitles = {} - for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): + for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)): src = textstream.get('src') if not src or src in urls: continue @@ -2442,7 +2556,7 @@ def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): return [] xspf, urlh = res - xspf_url = urlh.geturl() + xspf_url = urlh.url return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, @@ -2494,7 +2608,11 @@ def _extract_mpd_formats(self, *args, **kwargs): self._report_ignoring_subs('DASH') return fmts - def _extract_mpd_formats_and_subtitles( + def _extract_mpd_formats_and_subtitles(self, *args, **kwargs): + periods = self._extract_mpd_periods(*args, **kwargs) + return self._merge_mpd_periods(periods) + + def _extract_mpd_periods( self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): @@ -2507,17 +2625,16 @@ def _extract_mpd_formats_and_subtitles( errnote='Failed to download MPD manifest' if errnote is None else errnote, fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [], {} + return [] mpd_doc, urlh = res if mpd_doc is None: - return [], {} + return [] # We could have been redirected to a new url when we retrieved our mpd file. - mpd_url = urlh.geturl() + mpd_url = urlh.url mpd_base_url = base_url(mpd_url) - return self._parse_mpd_formats_and_subtitles( - mpd_doc, mpd_id, mpd_base_url, mpd_url) + return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url) def _parse_mpd_formats(self, *args, **kwargs): fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) @@ -2525,8 +2642,39 @@ def _parse_mpd_formats(self, *args, **kwargs): self._report_ignoring_subs('DASH') return fmts - def _parse_mpd_formats_and_subtitles( - self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): + def _parse_mpd_formats_and_subtitles(self, *args, **kwargs): + periods = self._parse_mpd_periods(*args, **kwargs) + return self._merge_mpd_periods(periods) + + def _merge_mpd_periods(self, periods): + """ + Combine all formats and subtitles from an MPD manifest into a single list, + by concatenate streams with similar formats. + """ + formats, subtitles = {}, {} + for period in periods: + for f in period['formats']: + assert 'is_dash_periods' not in f, 'format already processed' + f['is_dash_periods'] = True + format_key = tuple(v for k, v in f.items() if k not in ( + ('format_id', 'fragments', 'manifest_stream_number'))) + if format_key not in formats: + formats[format_key] = f + elif 'fragments' in f: + formats[format_key].setdefault('fragments', []).extend(f['fragments']) + + if subtitles and period['subtitles']: + self.report_warning(bug_reports_message( + 'Found subtitles in multiple periods in the DASH manifest; ' + 'if part of the subtitles are missing,', + ), only_once=True) + + for sub_lang, sub_info in period['subtitles'].items(): + subtitles.setdefault(sub_lang, []).extend(sub_info) + + return list(formats.values()), subtitles + + def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. 
References: @@ -2605,9 +2753,13 @@ def extract_Initialization(source): return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) - formats, subtitles = [], {} stream_numbers = collections.defaultdict(int) - for period in mpd_doc.findall(_add_ns('Period')): + for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))): + period_entry = { + 'id': period.get('id', f'period-{period_idx}'), + 'formats': [], + 'subtitles': collections.defaultdict(list), + } period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { 'start_number': 1, @@ -2640,7 +2792,7 @@ def extract_Initialization(source): elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' else: - self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) + self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest') continue base_url = '' @@ -2678,10 +2830,10 @@ def extract_Initialization(source): 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, - 'format_note': 'DASH %s' % content_type, + 'format_note': f'DASH {content_type}', 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - **codecs + **codecs, } elif content_type == 'text': f = { @@ -2722,8 +2874,8 @@ def prepare_template(template_name, identifiers): t += c # Next, $...$ templates are translated to their # %(...) counterparts to be used with % operator - t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) - t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t) + t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t) t.replace('$$', '$') return t @@ -2786,12 +2938,12 @@ def add_segment_url(): 'duration': float_or_none(segment_d, representation_ms_info['timescale']), }) - for num, s in enumerate(representation_ms_info['s']): + for s in representation_ms_info['s']: segment_time = s.get('t') or segment_time segment_d = s['d'] add_segment_url() segment_number += 1 - for r in range(s.get('r', 0)): + for _ in range(s.get('r', 0)): segment_time += segment_d add_segment_url() segment_number += 1 @@ -2805,7 +2957,7 @@ def add_segment_url(): timescale = representation_ms_info['timescale'] for s in representation_ms_info['s']: duration = float_or_none(s['d'], timescale) - for r in range(s.get('r', 0) + 1): + for _ in range(s.get('r', 0) + 1): segment_uri = representation_ms_info['segment_urls'][segment_index] fragments.append({ location_key(segment_uri): segment_uri, @@ -2857,11 +3009,10 @@ def add_segment_url(): if content_type in ('video', 'audio', 'image/jpeg'): f['manifest_stream_number'] = stream_numbers[f['url']] stream_numbers[f['url']] += 1 - formats.append(f) + period_entry['formats'].append(f) elif content_type == 'text': - subtitles.setdefault(lang or 'und', []).append(f) - - return formats, subtitles + period_entry['subtitles'][lang or 'und'].append(f) + yield period_entry def _extract_ism_formats(self, *args, **kwargs): fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) @@ -2884,7 +3035,7 @@ def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, not if ism_doc is None: return [], {} - return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) + return 
self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id) def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ @@ -2914,7 +3065,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'): - self.report_warning('%s is not a supported codec' % fourcc) + self.report_warning(f'{fourcc} is not a supported codec') continue tbr = int(track.attrib['Bitrate']) // 1000 # [1] does not mention Width and Height attributes. However, @@ -2963,7 +3114,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): 'fourcc': fourcc, 'language': stream_language, 'codec_private_data': track.get('CodecPrivateData'), - } + }, }) elif stream_type in ('video', 'audio'): formats.append({ @@ -2980,6 +3131,8 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): 'protocol': 'ism', 'fragments': fragments, 'has_drm': ism_doc.find('Protection') is not None, + 'language': stream_language, + 'audio_channels': int_or_none(track.get('Channels')), '_download_params': { 'stream_type': stream_type, 'duration': duration, @@ -2997,7 +3150,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): }) return formats, subtitles - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None, _headers=None): def absolute_url(item_url): return urljoin(base_url, item_url) @@ -3021,11 +3174,11 @@ def _media_formats(src, cur_media_type, type_info=None): formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference, quality=quality, fatal=False) + preference=preference, quality=quality, fatal=False, headers=_headers) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id, fatal=False) + full_url, video_id, mpd_id=mpd_id, fatal=False, headers=_headers) else: is_plain_url = True formats = [{ @@ -3043,13 +3196,13 @@ def _media_formats(src, cur_media_type, type_info=None): _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' media_tags = [(media_tag, media_tag_name, media_type, '') for media_tag, media_tag_name, media_type - in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] + in re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)] media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see # https://github.com/ytdl-org/youtube-dl/issues/11979, # e.g. http://www.porntrex.com/maps/videositemap.xml). 
-            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
+            rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
         for media_tag, _, media_type, media_content in media_tags:
             media_info = {
                 'formats': [],
@@ -3119,6 +3272,8 @@ def _media_formats(src, cur_media_type, type_info=None):
                 })
             for f in media_info['formats']:
                 f.setdefault('http_headers', {})['Referer'] = base_url
+                if _headers:
+                    f['http_headers'].update(_headers)
             if media_info['formats'] or media_info['subtitles']:
                 entries.append(media_info)
         return entries
@@ -3193,13 +3348,13 @@ def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native
         mobj = re.search(
             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
         url_base = mobj.group('url')
-        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
+        http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
         formats = []

         def manifest_url(manifest):
             m_url = f'{http_base_url}/{manifest}'
             if query:
-                m_url += '?%s' % query
+                m_url += f'?{query}'
             return m_url

         if 'm3u8' not in skip_protocols:
@@ -3221,7 +3376,7 @@ def manifest_url(manifest):
                 video_id, fatal=False)
             for rtmp_format in rtmp_formats:
                 rtsp_format = rtmp_format.copy()
-                rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+                rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
                 del rtsp_format['play_path']
                 del rtsp_format['ext']
                 rtsp_format.update({
@@ -3241,23 +3396,16 @@ def manifest_url(manifest):
         return formats

     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
-        mobj = re.search(
-            r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
-            webpage)
-        if mobj:
-            try:
-                jwplayer_data = self._parse_json(mobj.group('options'),
-                                                 video_id=video_id,
-                                                 transform_source=transform_source)
-            except ExtractorError:
-                pass
-            else:
-                if isinstance(jwplayer_data, dict):
-                    return jwplayer_data
+        return self._search_json(
+            r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
+            webpage, 'JWPlayer data', video_id,
+            # must be a {...} or sequence, ending
+            contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
+            transform_source=transform_source, default=None)

-    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+    def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
         jwplayer_data = self._find_jwplayer_data(
-            webpage, video_id, transform_source=js_to_json)
+            webpage, video_id, transform_source=transform_source)
         return self._parse_jwplayer_data(
             jwplayer_data, video_id, *args, **kwargs)
@@ -3289,22 +3437,14 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
             mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

         subtitles = {}
-        tracks = video_data.get('tracks')
-        if tracks and isinstance(tracks, list):
-            for track in tracks:
-                if not isinstance(track, dict):
-                    continue
-                track_kind = track.get('kind')
-                if not track_kind or not isinstance(track_kind, str):
-                    continue
-                if track_kind.lower() not in ('captions', 'subtitles'):
-                    continue
-                track_url = urljoin(base_url, track.get('file'))
-                if not track_url:
-                    continue
-                subtitles.setdefault(track.get('label') or 'en', []).append({
-                    'url': self._proto_relative_url(track_url)
-                })
+        for track in traverse_obj(video_data, (
+                'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
+            track_url = 
@@ -3289,22 +3437,14 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

             subtitles = {}
-            tracks = video_data.get('tracks')
-            if tracks and isinstance(tracks, list):
-                for track in tracks:
-                    if not isinstance(track, dict):
-                        continue
-                    track_kind = track.get('kind')
-                    if not track_kind or not isinstance(track_kind, str):
-                        continue
-                    if track_kind.lower() not in ('captions', 'subtitles'):
-                        continue
-                    track_url = urljoin(base_url, track.get('file'))
-                    if not track_url:
-                        continue
-                    subtitles.setdefault(track.get('label') or 'en', []).append({
-                        'url': self._proto_relative_url(track_url)
-                    })
+            for track in traverse_obj(video_data, (
+                    'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
+                track_url = urljoin(base_url, track.get('file'))
+                if not track_url:
+                    continue
+                subtitles.setdefault(track.get('label') or 'en', []).append({
+                    'url': self._proto_relative_url(track_url),
+                })

             entry = {
                 'id': this_video_id,
@@ -3382,14 +3522,14 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                 'tbr': int_or_none(source.get('bitrate'), scale=1000),
                 'filesize': int_or_none(source.get('filesize')),
                 'ext': ext,
-                'format_id': format_id
+                'format_id': format_id,
             }
             if source_url.startswith('rtmp'):
                 a_format['ext'] = 'flv'
                 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                 # of jwplayer.flash.swf
                 rtmp_url_parts = re.split(
-                    r'((?:mp4|mp3|flv):)', source_url, 1)
+                    r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                 if len(rtmp_url_parts) == 3:
                     rtmp_url, prefix, play_path = rtmp_url_parts
                     a_format.update({
@@ -3435,7 +3575,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,

     def _get_cookies(self, url):
         """ Return a http.cookies.SimpleCookie with the cookies for the url """
-        return LenientSimpleCookie(self._downloader._calc_cookies(url))
+        return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))

     def _apply_first_set_cookie_header(self, url_handle, cookie):
         """
@@ -3456,7 +3596,7 @@ def _apply_first_set_cookie_header(self, url_handle, cookie):
                 continue
             cookies = cookies.encode('iso-8859-1').decode('utf-8')
             cookie_value = re.search(
-                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
+                rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
             if cookie_value:
                 value, domain = cookie_value.groups()
                 self._set_cookie(domain, cookie, value)
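Note: the `_get_cookies` change above builds the `SimpleCookie` from the cookie header that the jar itself would send, instead of the removed `_calc_cookies` helper. `get_cookie_header` is yt-dlp's own jar method; the following stdlib-only sketch merely approximates what such a lookup does with `http.cookiejar`:

    import http.cookiejar
    import urllib.request

    jar = http.cookiejar.CookieJar()
    jar.set_cookie(http.cookiejar.Cookie(
        version=0, name='session', value='abc123', port=None, port_specified=False,
        domain='example.com', domain_specified=False, domain_initial_dot=False,
        path='/', path_specified=True, secure=False, expires=None, discard=True,
        comment=None, comment_url=None, rest={}, rfc2109=False))

    # let the jar compute the Cookie header it would attach to this request
    req = urllib.request.Request('http://example.com/')
    jar.add_cookie_header(req)
    print(req.get_header('Cookie'))  # session=abc123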
@@ -3510,8 +3650,8 @@ def _RETURN_TYPE(cls):
     @classmethod
     def is_single_video(cls, url):
         """Returns whether the URL is of a single video, None if unknown"""
-        assert cls.suitable(url), 'The URL must be suitable for the extractor'
-        return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
+        if cls.suitable(url):
+            return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)

     @classmethod
     def is_suitable(cls, age_limit):
@@ -3524,7 +3664,7 @@ def description(cls, *, markdown=True, search_examples=None):
         desc = ''
         if cls._NETRC_MACHINE:
             if markdown:
-                desc += f' [{cls._NETRC_MACHINE}]'
+                desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
             else:
                 desc += f' [{cls._NETRC_MACHINE}]'
         if cls.IE_DESC is False:
@@ -3540,7 +3680,7 @@ def description(cls, *, markdown=True, search_examples=None):
             desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'

         # Escape emojis. Ref: https://github.com/github/markup/issues/1153
-        name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
+        name = (' - **{}**'.format(re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME))) if markdown else cls.IE_NAME
         return f'{name}:{desc}' if desc else name

     def extract_subtitles(self, *args, **kwargs):
@@ -3580,7 +3720,7 @@ def extractor():
                 self.to_screen(f'Extracted {comment_count} comments')
                 return {
                     'comments': comments,
-                    'comment_count': None if interrupted else comment_count
+                    'comment_count': None if interrupted else comment_count,
                 }
             return extractor
@@ -3646,11 +3786,47 @@ def _generic_title(self, url='', webpage='', *, default=None):
             or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
             or default)

+    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
+        if not duration:
+            return
+        chapter_list = [{
+            'start_time': start_function(chapter),
+            'title': title_function(chapter),
+        } for chapter in chapter_list or []]
+        if strict:
+            warn = self.report_warning
+        else:
+            warn = self.write_debug
+            chapter_list.sort(key=lambda c: c['start_time'] or 0)
+
+        chapters = [{'start_time': 0}]
+        for idx, chapter in enumerate(chapter_list):
+            if chapter['start_time'] is None:
+                warn(f'Incomplete chapter {idx}')
+            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+                chapters.append(chapter)
+            elif chapter not in chapters:
+                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
+        return chapters[1:]
+
+    def _extract_chapters_from_description(self, description, duration):
+        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
+        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
+        return self._extract_chapters_helper(
+            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
+            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+            duration=duration, strict=False) or self._extract_chapters_helper(
+            re.findall(sep_re % (r'.+?', duration_re), description or ''),
+            start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+            duration=duration, strict=False)
+
     @staticmethod
     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
-        all_known = all(map(
-            lambda x: x is not None,
-            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
+        all_known = all(
+            x is not None for x in
+            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
         return (
             'private' if is_private
             else 'premium_only' if needs_premium
@@ -3770,7 +3946,7 @@ class SearchInfoExtractor(InfoExtractor):

     @classproperty
     def _VALID_URL(cls):
-        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
+        return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'

     def _real_extract(self, query):
         prefix, query = self._match_valid_url(query).group('prefix', 'query')
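Note: the new `_extract_chapters_from_description` above tries timestamp-first lines and then falls back to title-first lines (the two helper calls with swapped arguments). A standalone sketch of the timestamp-first case — a simplified re-implementation for illustration, not the helper itself:

    import re

    TIMESTAMP = r'(?:\d+:)?\d{1,2}:\d{2}'
    description = '0:00 Intro\n1:23 First topic\n1:02:45 Outro'

    def to_seconds(ts):
        # '1:02:45' -> 3765; works for m:ss and h:mm:ss alike
        parts = [int(p) for p in ts.split(':')]
        return sum(p * 60 ** i for i, p in enumerate(reversed(parts)))

    chapters = [{'start_time': to_seconds(ts), 'title': title}
                for ts, title in re.findall(
                    rf'(?m)^\s*({TIMESTAMP})\b\W*\s(.+?)\s*$', description)]
    print(chapters)  # start times 0, 83 and 3765 with their titles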
diff --git a/yt_dlp/extractor/commonmistakes.py b/yt_dlp/extractor/commonmistakes.py
index 1d3b61c732..8ddb164b97 100644
--- a/yt_dlp/extractor/commonmistakes.py
+++ b/yt_dlp/extractor/commonmistakes.py
@@ -16,10 +16,10 @@ class CommonMistakesIE(InfoExtractor):

     def _real_extract(self, url):
         msg = (
-            'You\'ve asked yt-dlp to download the URL "%s". '
+            f'You\'ve asked yt-dlp to download the URL "{url}". '
             'That doesn\'t make any sense. '
             'Simply remove the parameter in your command or configuration.'
-        ) % url
+        )
         if not self.get_param('verbose'):
             msg += ' Add -v to the command line to see what arguments and configuration yt-dlp has'
         raise ExtractorError(msg, expected=True)
@@ -38,5 +38,21 @@ def _real_extract(self, url):
         real_url = self._match_id(url)
         self.report_warning(
             'Your URL starts with a Byte Order Mark (BOM). '
-            'Removing the BOM and looking for "%s" ...' % real_url)
+            f'Removing the BOM and looking for "{real_url}" ...')
         return self.url_result(real_url)
+
+
+class BlobIE(InfoExtractor):
+    IE_DESC = False
+    _VALID_URL = r'blob:'
+
+    _TESTS = [{
+        'url': 'blob:https://www.youtube.com/4eb3d090-a761-46e6-8083-c32016a36e3b',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        raise ExtractorError(
+            'You\'ve asked yt-dlp to download a blob URL. '
+            'A blob URL exists only locally in your browser. '
+            'It is not possible for yt-dlp to access it.', expected=True)
diff --git a/yt_dlp/extractor/commonprotocols.py b/yt_dlp/extractor/commonprotocols.py
index 2f93e8ea5a..7b3a5b6546 100644
--- a/yt_dlp/extractor/commonprotocols.py
+++ b/yt_dlp/extractor/commonprotocols.py
@@ -63,7 +63,7 @@ class ViewSourceIE(InfoExtractor):

     _TEST = {
         'url': 'view-source:https://www.youtube.com/watch?v=BaW_jenozKc',
-        'only_matching': True
+        'only_matching': True,
     }

     def _real_extract(self, url):
diff --git a/yt_dlp/extractor/condenast.py b/yt_dlp/extractor/condenast.py
index 3170c2990e..9c02cd3429 100644
--- a/yt_dlp/extractor/condenast.py
+++ b/yt_dlp/extractor/condenast.py
@@ -1,10 +1,7 @@
 import re
+import urllib.parse

 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_urlparse,
-    compat_urlparse,
-)
 from ..utils import (
     determine_ext,
     extract_attributes,
@@ -48,20 +45,20 @@ class CondeNastIE(InfoExtractor):
         'wmagazine': 'W Magazine',
     }

-    _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/
+    _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:{})\.com/
         (?:
             (?:
                 embed(?:js)?|
                 (?:script|inline)/video
-            )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?|
-            (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
-        )''' % '|'.join(_SITES.keys())
-    IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
+            )/(?P<id>[0-9a-f]{{24}})(?:/(?P<player_id>[0-9a-f]{{24}}))?(?:.+?\btarget=(?P<target>[^&]+))?|
+            (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
+        )'''.format('|'.join(_SITES.keys()))
+    IE_DESC = 'Condé Nast media group: {}'.format(', '.join(sorted(_SITES.values())))
     _EMBED_REGEX = [r'''(?x)
         <(?:iframe|script)[^>]+?src=(["\'])(?P<url>
-            (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
-        )\1''' % '|'.join(_SITES.keys())]
+            (?:https?:)?//player(?:-backend)?\.(?:{})\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
+        )\1'''.format('|'.join(_SITES.keys()))]

     _TESTS = [{
         'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
@@ -74,7 +71,7 @@ class CondeNastIE(InfoExtractor):
             'uploader': 'wired',
             'upload_date': '20130314',
             'timestamp': 1363219200,
-        }
+        },
     }, {
         'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
         'info_dict': {
@@ -97,7 +94,7 @@ class CondeNastIE(InfoExtractor):
             'uploader': 'arstechnica',
             'upload_date': '20150916',
             'timestamp': 1442434920,
-        }
+        },
     }, {
         'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
         'only_matching': True,
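Note: CondeNastIE's move from `%` to `str.format` above is why the `{24}` quantifiers become `{{24}}` — `.format()` treats single braces as placeholders. A toy sketch of both the pitfall and the site-table alternation pattern, using an illustrative mini `_SITES` rather than the real table:

    import re

    SITES = {'wired': 'Wired', 'gq': 'GQ', 'vogue': 'Vogue'}

    # braces meant literally for the regex must be doubled under .format()
    VALID_URL = r'https?://video\.(?:{})\.com/embed/(?P<id>[0-9a-f]{{24}})'.format(
        '|'.join(map(re.escape, SITES)))

    m = re.match(VALID_URL, 'http://video.wired.com/embed/5171b343c2b4c00dd0c1ccb3')
    print(m.group('id'))  # 5171b343c2b4c00dd0c1ccb3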
<div class="cne-series-info">.*?<h1>(.+?)</h1>',
             webpage, 'series title')
-        url_object = compat_urllib_parse_urlparse(url)
-        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
+        url_object = urllib.parse.urlparse(url)
+        base_url = f'{url_object.scheme}://{url_object.netloc}'
         m_paths = re.finditer(
             r'(?s)<p class="cne-thumb-title">
.*? 1: - api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + api_url = update_url_query(api_url, {'page': page}) content = self._download_json( api_url, video_id, - note='Downloading continuation - %d' % (page, ), + note=f'Downloading continuation - {page}', fatal=False) for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: - episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + episode_url = urljoin(url, try_get(item, lambda x: x[f'url_{url_lang}_s'])) if episode_url: entries.append(episode_url) return self.playlist_result( (self.url_result(entry) for entry in entries), playlist_id=video_id, - playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], - playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + playlist_title=try_get(content, lambda x: x['page']['program'][f'title_{url_lang}_t']) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program'][f'description_{url_lang}_t']), ) diff --git a/yt_dlp/extractor/cracked.py b/yt_dlp/extractor/cracked.py index c6aabccc68..f2276547ef 100644 --- a/yt_dlp/extractor/cracked.py +++ b/yt_dlp/extractor/cracked.py @@ -19,7 +19,7 @@ class CrackedIE(InfoExtractor): 'title': 'If Animal Actors Got E! True Hollywood Stories', 'timestamp': 1404954000, 'upload_date': '20140710', - } + }, }, { # youtube embed 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html', @@ -32,7 +32,7 @@ class CrackedIE(InfoExtractor): 'upload_date': '20140725', 'uploader_id': 'Cracked', 'uploader': 'Cracked', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/crackle.py b/yt_dlp/extractor/crackle.py index 46100151a9..c4ceba9408 100644 --- a/yt_dlp/extractor/crackle.py +++ b/yt_dlp/extractor/crackle.py @@ -4,8 +4,9 @@ import time from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, determine_ext, float_or_none, int_or_none, @@ -13,7 +14,6 @@ parse_age_limit, parse_duration, url_or_none, - ExtractorError ) @@ -45,7 +45,7 @@ class CrackleIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': [ - 'Trying with a list of known countries' + 'Trying with a list of known countries', ], }, { 'url': 'https://www.sonycrackle.com/thanksgiving/2510064', @@ -89,7 +89,7 @@ def _real_extract(self, url): for num, country in enumerate(countries): if num == 1: # start hard-coded list self.report_warning('%s. Trying with a list of known countries' % ( - 'Unable to obtain video formats from %s API' % geo_bypass_country if geo_bypass_country + f'Unable to obtain video formats from {geo_bypass_country} API' if geo_bypass_country else 'No country code was given using --geo-bypass-country')) elif num == num_countries: # end of list geo_info = self._download_json( @@ -99,28 +99,28 @@ def _real_extract(self, url): country = geo_info.get('CountryCode') if country is None: continue - self.to_screen('%s identified country as %s' % (self.IE_NAME, country)) + self.to_screen(f'{self.IE_NAME} identified country as {country}') if country in countries: - self.to_screen('Downloading from %s API was already attempted. Skipping...' % country) + self.to_screen(f'Downloading from {country} API was already attempted. 
Skipping...') continue if country is None: continue try: media = self._download_json( - 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country), - video_id, note='Downloading media JSON from %s API' % country, + f'https://web-api-us.crackle.com/Service.svc/details/media/{video_id}/{country}?disableProtocols=true', + video_id, note=f'Downloading media JSON from {country} API', errnote='Unable to download media JSON') except ExtractorError as e: # 401 means geo restriction, trying next country - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: continue raise status = media.get('status') if status.get('messageCode') != '0': raise ExtractorError( - '%s said: %s %s - %s' % ( + '{} said: {} {} - {}'.format( self.IE_NAME, status.get('messageCodeDescription'), status.get('messageCode'), status.get('message')), expected=True) diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py index 307bfb9460..0d7d759ab7 100644 --- a/yt_dlp/extractor/craftsy.py +++ b/yt_dlp/extractor/craftsy.py @@ -1,16 +1,17 @@ +import json + from .brightcove import BrightcoveNewIE from .common import InfoExtractor - from ..utils import ( - dict_get, - get_element_by_id, - js_to_json, - traverse_obj, + extract_attributes, + get_element_html_by_class, + get_element_text_and_html_by_tag, ) +from ..utils.traversal import traverse_obj class CraftsyIE(InfoExtractor): - _VALID_URL = r'https?://www.craftsy.com/class/(?P[a-z0-9_-]+)/' + _VALID_URL = r'https?://www\.craftsy\.com/class/(?P[\w-]+)' _TESTS = [{ 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', 'info_dict': { @@ -41,28 +42,34 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_data = self._parse_json(self._search_regex( - r'class_video_player_vars\s*=\s*({.*})\s*;', - get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), - 'video data'), video_id, transform_source=js_to_json) + video_player = get_element_html_by_class('class-video-player', webpage) + video_data = traverse_obj(video_player, ( + {extract_attributes}, 'wire:snapshot', {json.loads}, 'data', {dict})) or {} + video_js = traverse_obj(video_player, ( + {lambda x: get_element_text_and_html_by_tag('video-js', x)}, 1, {extract_attributes})) or {} - account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + has_access = video_data.get('userHasAccess') + lessons = traverse_obj(video_data, ('lessons', ..., ..., lambda _, v: v['video_id'])) - entries = [] - class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) - if class_preview: - v_id = class_preview.get('video_id') - entries.append(self.url_result( - f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', - BrightcoveNewIE, v_id, class_preview.get('title'))) + preview_id = video_js.get('data-video-id') + if preview_id and preview_id not in traverse_obj(lessons, (..., 'video_id')): + if not lessons and not has_access: + self.report_warning( + 'Only extracting preview. For the full class, pass cookies ' + f'from an account that has access. 
{self._login_hint()}') + lessons.append({'video_id': preview_id}) - if dict_get(video_data, ('is_free', 'user_has_access')): - entries += [ - self.url_result( + if not lessons and not has_access: + self.raise_login_required('You do not have access to this class') + + account_id = video_data.get('accountId') or video_js['data-account'] + + def entries(lessons): + for lesson in lessons: + yield self.url_result( f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', BrightcoveNewIE, lesson['video_id'], lesson.get('title')) - for lesson in video_data['lessons']] return self.playlist_result( - entries, video_id, video_data.get('class_title'), + entries(lessons), video_id, self._html_search_meta(('og:title', 'twitter:title'), webpage), self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/yt_dlp/extractor/crooksandliars.py b/yt_dlp/extractor/crooksandliars.py index 4de7e3d530..abd3322a95 100644 --- a/yt_dlp/extractor/crooksandliars.py +++ b/yt_dlp/extractor/crooksandliars.py @@ -21,7 +21,7 @@ class CrooksAndLiarsIE(InfoExtractor): 'upload_date': '20150405', 'uploader': 'Heather', 'duration': 236, - } + }, }, { 'url': 'http://embed.crooksandliars.com/v/MTE3MjUtMzQ2MzA', 'only_matching': True, @@ -31,12 +31,9 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://embed.crooksandliars.com/embed/%s' % video_id, video_id) + f'http://embed.crooksandliars.com/embed/{video_id}', video_id) - manifest = self._parse_json( - self._search_regex( - r'var\s+manifest\s*=\s*({.+?})\n', webpage, 'manifest JSON'), - video_id) + manifest = self._search_json(r'var\s+manifest\s*=', webpage, 'manifest JSON', video_id) quality = qualities(('webm_low', 'mp4_low', 'webm_high', 'mp4_high')) diff --git a/yt_dlp/extractor/crowdbunker.py b/yt_dlp/extractor/crowdbunker.py index d83c01560c..bf814570fe 100644 --- a/yt_dlp/extractor/crowdbunker.py +++ b/yt_dlp/extractor/crowdbunker.py @@ -24,15 +24,16 @@ class CrowdBunkerIE(InfoExtractor): 'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ', 'like_count': int, 'upload_date': '20211218', - 'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg' + 'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://api.divulg.org/post/{id}/details', - id, headers={'accept': 'application/json, text/plain, */*'}) + video_id = self._match_id(url) + data_json = self._download_json( + f'https://api.divulg.org/post/{video_id}/details', video_id, + headers={'accept': 'application/json, text/plain, */*'}) video_json = data_json['video'] formats, subtitles = [], {} for sub in video_json.get('captions') or []: @@ -45,12 +46,12 @@ def _real_extract(self, url): mpd_url = try_get(video_json, lambda x: x['dashManifest']['url']) if mpd_url: - fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id) + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url']) if m3u8_url: - fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, id) + fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, video_id) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) @@ -61,7 +62,7 
@@ def _real_extract(self, url):
         } for image in video_json.get('thumbnails') or [] if image.get('url')]

         return {
-            'id': id,
+            'id': video_id,
             'title': video_json.get('title'),
             'description': video_json.get('description'),
             'view_count': video_json.get('viewCount'),
@@ -87,23 +88,24 @@ class CrowdBunkerChannelIE(InfoExtractor):
         },
     }]

-    def _entries(self, id):
+    def _entries(self, playlist_id):
         last = None
         for page in itertools.count():
             channel_json = self._download_json(
-                f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'},
+                f'https://api.divulg.org/organization/{playlist_id}/posts', playlist_id,
+                headers={'accept': 'application/json, text/plain, */*'},
                 query={'after': last} if last else {}, note=f'Downloading Page {page}')
             for item in channel_json.get('items') or []:
                 v_id = item.get('uid')
                 if not v_id:
                     continue
                 yield self.url_result(
-                    'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id)
+                    f'https://crowdbunker.com/v/{v_id}', ie=CrowdBunkerIE.ie_key(), video_id=v_id)
             last = channel_json.get('last')
             if not last:
                 break

     def _real_extract(self, url):
-        id = self._match_id(url)
-        return self.playlist_result(self._entries(id), playlist_id=id)
+        playlist_id = self._match_id(url)
+        return self.playlist_result(self._entries(playlist_id), playlist_id=playlist_id)
diff --git a/yt_dlp/extractor/crtvg.py b/yt_dlp/extractor/crtvg.py
new file mode 100644
index 0000000000..6d9a77824e
--- /dev/null
+++ b/yt_dlp/extractor/crtvg.py
@@ -0,0 +1,53 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import make_archive_id, remove_end
+
+
+class CrtvgIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?crtvg\.es/tvg/a-carta/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.crtvg.es/tvg/a-carta/os-caimans-do-tea-5839623',
+        'md5': 'c0958d9ff90e4503a75544358758921d',
+        'info_dict': {
+            'id': 'os-caimans-do-tea-5839623',
+            'title': 'Os caimáns do Tea',
+            'ext': 'mp4',
+            'description': 'md5:f71cfba21ae564f0a6f415b31de1f842',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            '_old_archive_ids': ['crtvg 5839623'],
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.crtvg.es/tvg/a-carta/a-parabolica-love-story',
+        'md5': '9a47b95a1749db7b7eb3214904624584',
+        'info_dict': {
+            'id': 'a-parabolica-love-story',
+            'title': 'A parabólica / Trabuco, o can mordedor / Love Story',
+            'ext': 'mp4',
+            'description': 'md5:f71cfba21ae564f0a6f415b31de1f842',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_url = self._search_regex(r'var\s+url\s*=\s*["\']([^"\']+)', webpage, 'video url')
+        formats = self._extract_m3u8_formats(video_url + '/playlist.m3u8', video_id, fatal=False)
+        formats.extend(self._extract_mpd_formats(video_url + '/manifest.mpd', video_id, fatal=False))
+
+        old_video_id = None
+        if mobj := re.fullmatch(r'[^/#?]+-(?P<old_id>\d{7})', video_id):
+            old_video_id = [make_archive_id(self, mobj.group('old_id'))]
+
+        return {
+            'id': video_id,
+            '_old_archive_ids': old_video_id,
+            'formats': formats,
+            'title': remove_end(self._html_search_meta(
+                ['og:title', 'twitter:title'], webpage, 'title', default=None), ' | CRTVG'),
+            'description': self._html_search_meta('description', webpage, 'description', default=None),
+            'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=None),
+        }
diff --git 
a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 1abffcd745..1b124c6557 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,108 +1,279 @@ import base64 -import urllib.parse +import uuid from .common import InfoExtractor +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, format_field, - join_nonempty, + int_or_none, + jwt_decode_hs256, + parse_age_limit, + parse_count, parse_iso8601, qualities, + time_seconds, traverse_obj, - try_get, + url_or_none, + urlencode_postdata, ) class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - params = None + _SWITCH_USER_AGENT = 'Crunchyroll/1.8.0 Nintendo Switch/12.3.12.0 UE4/4.27' + _REFRESH_TOKEN = None + _AUTH_HEADERS = None + _AUTH_EXPIRY = None + _API_ENDPOINT = None + _BASIC_AUTH = 'Basic ' + base64.b64encode(':'.join(( + 't-kdgp2h8c3jub8fn0fq', + 'yfLDfMfrYvKXh4JXS1LEI2cCqu1v5Wan', + )).encode()).decode() + _IS_PREMIUM = None + _LOCALE_LOOKUP = { + 'ar': 'ar-SA', + 'de': 'de-DE', + '': 'en-US', + 'es': 'es-419', + 'es-es': 'es-ES', + 'fr': 'fr-FR', + 'it': 'it-IT', + 'pt-br': 'pt-BR', + 'pt-pt': 'pt-PT', + 'ru': 'ru-RU', + 'hi': 'hi-IN', + } - @property - def is_logged_in(self): - return self._get_cookies(self._LOGIN_URL).get('etp_rt') + def _set_auth_info(self, response): + CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(response, ('access_token', {jwt_decode_hs256}, 'benefits', ...)) + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': response['token_type'] + ' ' + response['access_token']} + CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10) + + def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'): + try: + return self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote, + headers=headers, data=urlencode_postdata(data), impersonate=True) + except ExtractorError as error: + if not isinstance(error.cause, HTTPError) or error.cause.status != 403: + raise + if target := error.cause.response.extensions.get('impersonate'): + raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"') + raise ExtractorError( + 'Request blocked by Cloudflare. 
' + 'Install the required impersonation dependency if possible, ' + 'or else navigate to Crunchyroll in your browser, ' + 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' + 'and your browser\'s User-Agent (with --user-agent)', expected=True) def _perform_login(self, username, password): - if self.is_logged_in: + if not CrunchyrollBaseIE._REFRESH_TOKEN: + CrunchyrollBaseIE._REFRESH_TOKEN = self.cache.load(self._NETRC_MACHINE, username) + if CrunchyrollBaseIE._REFRESH_TOKEN: return - upsell_response = self._download_json( - f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', - query={ - 'sess_id': 1, - 'device_id': 'whatvalueshouldbeforweb', - 'device_type': 'com.crunchyroll.static', - 'access_token': 'giKq5eY27ny3cqz', - 'referer': self._LOGIN_URL - }) - if upsell_response['code'] != 'ok': - raise ExtractorError('Could not get session id') - session_id = upsell_response['data']['session_id'] + try: + login_response = self._request_token( + headers={'Authorization': self._BASIC_AUTH}, data={ + 'username': username, + 'password': password, + 'grant_type': 'password', + 'scope': 'offline_access', + }, note='Logging in', errnote='Failed to log in') + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 401: + raise ExtractorError('Invalid username and/or password', expected=True) + raise - login_response = self._download_json( - f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urllib.parse.urlencode({ - 'account': username, - 'password': password, - 'session_id': session_id - }).encode('ascii')) - if login_response['code'] != 'ok': - raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True) - if not self.is_logged_in: - raise ExtractorError('Login succeeded but did not set etp_rt cookie') + CrunchyrollBaseIE._REFRESH_TOKEN = login_response['refresh_token'] + self.cache.store(self._NETRC_MACHINE, username, CrunchyrollBaseIE._REFRESH_TOKEN) + self._set_auth_info(login_response) - def _get_embedded_json(self, webpage, display_id): - initial_state = self._parse_json(self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) - app_config = self._parse_json(self._search_regex( - r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) - return initial_state, app_config + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_EXPIRY > time_seconds(): + return - def _get_params(self, lang): - if not CrunchyrollBaseIE.params: - if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): - grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' - else: - grant_type, key = 'client_id', 'anonClientId' - - initial_state, app_config = self._get_embedded_json(self._download_webpage( - f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') - - auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={ - 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') - }, data=f'grant_type={grant_type}'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', None, note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + 
auth_response['access_token'] - }) - cms = policy_response.get('cms_web') - bucket = cms['bucket'] - params = { - 'Policy': cms['policy'], - 'Signature': cms['signature'], - 'Key-Pair-Id': cms['key_pair_id'] + auth_headers = {'Authorization': self._BASIC_AUTH} + if CrunchyrollBaseIE._REFRESH_TOKEN: + data = { + 'refresh_token': CrunchyrollBaseIE._REFRESH_TOKEN, + 'grant_type': 'refresh_token', + 'scope': 'offline_access', } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale - CrunchyrollBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBaseIE.params + else: + data = {'grant_type': 'client_id'} + auth_headers['ETP-Anonymous-ID'] = uuid.uuid4() + try: + auth_response = self._request_token(auth_headers, data) + except ExtractorError as error: + username, password = self._get_login_info() + if not username or not isinstance(error.cause, HTTPError) or error.cause.status != 400: + raise + self.to_screen('Refresh token has expired. Re-logging in') + CrunchyrollBaseIE._REFRESH_TOKEN = None + self.cache.store(self._NETRC_MACHINE, username, None) + self._perform_login(username, password) + return + + self._set_auth_info(auth_response) + + def _locale_from_language(self, language): + config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) + return config_locale[0] if config_locale else self._LOCALE_LOOKUP.get(language) + + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): + self._update_auth() + + if not endpoint.startswith('/'): + endpoint = f'/{endpoint}' + + query = query.copy() + locale = self._locale_from_language(lang) + if locale: + query['locale'] = locale + + return self._download_json( + f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', + headers=CrunchyrollBaseIE._AUTH_HEADERS, query=query) + + def _call_api(self, path, internal_id, lang, note='api', query={}): + if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): + path = f'/content/v2/{self._API_ENDPOINT}/{path}' + + try: + result = self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + return None + raise + + if not result: + raise ExtractorError(f'Unexpected response when downloading {note} JSON') + return result + + def _extract_chapters(self, internal_id): + # if no skip events are available, a 403 xml error is returned + skip_events = self._download_json( + f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) + if not skip_events: + return None + + chapters = [] + for event in ('recap', 'intro', 'credits', 'preview'): + start = traverse_obj(skip_events, (event, 'start', {float_or_none})) + end = traverse_obj(skip_events, (event, 'end', {float_or_none})) + # some chapters have no start and/or ending time, they will just be ignored + if start is None or end is None: + continue + chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end}) + + return chapters + + def _extract_stream(self, identifier, display_id=None): + if not display_id: + display_id = identifier + + self._update_auth() + headers = {**CrunchyrollBaseIE._AUTH_HEADERS, 'User-Agent': self._SWITCH_USER_AGENT} + try: + stream_response = self._download_json( + 
f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', + display_id, note='Downloading stream info', errnote='Failed to download stream info', headers=headers) + except ExtractorError as error: + if self.get_param('ignore_no_formats_error'): + self.report_warning(error.orig_msg) + return [], {} + elif isinstance(error.cause, HTTPError) and error.cause.status == 420: + raise ExtractorError( + 'You have reached the rate-limit for active streams; try again later', expected=True) + raise + + available_formats = {'': ('', '', stream_response['url'])} + for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])): + available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url']) + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + hardsub_langs = [lang for lang in available_formats if lang] + if hardsub_langs and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}') + self.to_screen( + 'To extract formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=". ' + 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + audio_locale = traverse_obj(stream_response, ('audioLocale', {str})) + hardsub_preference = qualities(requested_hardsubs[::-1]) + formats, subtitles = [], {} + for format_id, hardsub_lang, stream_url in available_formats.values(): + if hardsub_lang.lower() in full_format_langs: + adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles( + stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS, + fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest') + self._merge_subtitles(dash_subs, target=subtitles) + else: + continue # XXX: Update this if meta mpd formats work; will be tricky with token invalidation + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + + for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)): + subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})) + + # Invalidate stream token to avoid rate-limit + error_msg = 'Unable to invalidate stream token; you may experience rate-limiting' + if stream_token := stream_response.get('token'): + self._request_webpage(Request( + f'https://cr-play-service.prd.crunchyrollsvc.com/v1/token/{identifier}/{stream_token}/inactive', + headers=headers, method='PATCH'), display_id, 'Invalidating stream token', error_msg, fatal=False) + else: + self.report_warning(error_msg) + + return formats, subtitles -class CrunchyrollBetaIE(CrunchyrollBaseIE): +class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): + _API_ENDPOINT = 'cms' + _CMS_EXPIRY = None + + def _call_cms_api_signed(self, path, internal_id, lang, note='api'): + if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY <= time_seconds(): + response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web'] + CrunchyrollCmsBaseIE._CMS_QUERY = { + 'Policy': response['policy'], + 'Signature': response['signature'], + 'Key-Pair-Id': 
response['key_pair_id'], + } + CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket'] + CrunchyrollCmsBaseIE._CMS_EXPIRY = parse_iso8601(response['expires']) - 10 + + if not path.startswith('/cms/v2'): + path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}' + + return self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY) + + +class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ - (?P(?:\w{2}(?:-\w{2})?/)?) - watch/(?P\w+) - (?:/(?P[\w-]+))?/?(?:[?#]|$)''' + https?://(?:beta\.|www\.)?crunchyroll\.com/ + (?:(?P\w{2}(?:-\w{2})?)/)? + watch/(?!concert|musicvideo)(?P\w+)''' _TESTS = [{ + # Premium only 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { 'id': 'GY2P1Q98Y', @@ -119,11 +290,19 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'season_number': 1, 'episode': 'To the Future', 'episode_number': 73, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'chapters': 'count:2', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': 'm3u8', + 'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}}, + 'format': 'bv[format_id~=hardsub]', }, - 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, }, { + # Premium only 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', 'info_dict': { 'id': 'GYE5WKQGR', @@ -131,7 +310,7 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'duration': 366.459, 'timestamp': 1476788400, 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', - 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'title': 'SHELTER – Porter Robinson presents Shelter the Animation', 'upload_date': '20161018', 'series': 'SHELTER', 'series_id': 'GYGG09WWY', @@ -140,137 +319,192 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'season_number': 1, 'episode': 'Porter Robinson presents Shelter the Animation', 'episode_number': 0, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', - 'chapters': 'count:0', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, }, 'params': {'skip_download': True}, - 'skip': 'Video is Premium only', }, { - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard', + 'info_dict': { + 'id': 'GJWU2VKK3', + 'ext': 'mp4', + 'duration': 1420.054, + 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd', + 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard', + 'series': 'The Ice Guy and His Cool Female Colleague', + 'series_id': 'GW4HM75NP', + 'season': 'The Ice Guy and His Cool Female Colleague', + 'season_id': 'GY9PC21VE', + 'season_number': 1, + 'episode': 'Cherry Blossom Meeting and a Coming Blizzard', + 'episode_number': 1, + 'chapters': 'count:2', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'timestamp': 1672839000, + 'upload_date': '20230104', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ', + 'info_dict': { + 'id': 'GM8F313NQ', + 'ext': 'mp4', + 'title': 'Garakowa -Restore the World-', + 'description': 
'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'duration': 3996.104, + 'age_limit': 13, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'no longer exists', + }, { + 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', + 'info_dict': { + 'id': 'G62PEZ2E6', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'age_limit': 13, + 'duration': 65.138, + 'title': 'Garakowa -Restore the World-', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.crunchyroll.com/de/watch/GY2P1Q98Y', 'only_matching': True, }, { 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, }] + # We want to support lazy playlist filtering and movie listings cannot be inside a playlist + _RETURN_TYPE = 'video' def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') - episode_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not bucket.endswith('crunchyroll'): - if self.is_logged_in: - raise ExtractorError('This video is for premium members only', expected=True) - else: - self.raise_login_required('This video is for premium members only') + # We need to use unsigned API call to allow ratings query string + response = traverse_obj(self._call_api( + f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) - stream_response = self._download_json( - f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, - note='Retrieving stream info', query=params) - get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() + object_type = response.get('type') + if object_type == 'episode': + result = self._transform_episode_response(response) - requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] - hardsub_preference = qualities(requested_hardsubs[::-1]) - requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + elif object_type == 'movie': + result = self._transform_movie_response(response) - available_formats = {} - for stream_type, streams in get_streams('streams'): - if stream_type not in requested_formats: - continue - for stream in streams.values(): - if not stream.get('url'): - continue - hardsub_lang = stream.get('hardsub_locale') or '' - format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) - available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + elif object_type == 'movie_listing': + first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id')) + if not self._yes_playlist(internal_id, first_movie_id): + return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id) + + def entries(): + movies = self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list') + for movie_response in traverse_obj(movies, ('data', ...)): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}', 
+ CrunchyrollBetaIE, **self._transform_movie_response(movie_response)) + + return self.playlist_result(entries(), **self._transform_movie_response(response)) - if '' in available_formats and 'all' not in requested_hardsubs: - full_format_langs = set(requested_hardsubs) - self.to_screen( - 'To get all formats of a hardsub language, use ' - '"--extractor-args crunchyrollbeta:hardsub=". ' - 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info', - only_once=True) else: - full_format_langs = set(map(str.lower, available_formats)) + raise ExtractorError(f'Unknown object type {object_type}') - formats = [] - for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): - if stream_type.endswith('hls'): - if hardsub_lang.lower() in full_format_langs: - adaptive_formats = self._extract_m3u8_formats( - stream_url, display_id, 'mp4', m3u8_id=format_id, - fatal=False, note=f'Downloading {format_id} HLS manifest') - else: - adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) - elif stream_type.endswith('dash'): - adaptive_formats = self._extract_mpd_formats( - stream_url, display_id, mpd_id=format_id, - fatal=False, note=f'Downloading {format_id} MPD manifest') + if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + message = f'This {object_type} is for premium members only' + if CrunchyrollBaseIE._REFRESH_TOKEN: + self.raise_no_formats(message, expected=True, video_id=internal_id) else: - self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) - continue - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) + self.raise_login_required(message, method='password', metadata_available=True) + else: + result['formats'], result['subtitles'] = self._extract_stream(internal_id) - chapters = None - # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json(f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - display_id, fatal=False, errnote=False) - if isinstance(intro_chapter, dict): - chapters = [{ - 'title': 'Intro', - 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')) - }] + result['chapters'] = self._extract_chapters(internal_id) + def calculate_count(item): + return parse_count(''.join((item['displayed'], item.get('unit') or ''))) + + result.update(traverse_obj(response, ('rating', { + 'like_count': ('up', {calculate_count}), + 'dislike_count': ('down', {calculate_count}), + }))) + + return result + + @staticmethod + def _transform_episode_response(data): + metadata = traverse_obj(data, (('episode_metadata', None), {dict}), get_all=False) or {} return { - 'id': internal_id, - 'title': '%s Episode %s – %s' % ( - episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'timestamp': parse_iso8601(episode_response.get('upload_date')), - 'series': episode_response.get('series_title'), - 'series_id': episode_response.get('series_id'), - 'season': episode_response.get('season_title'), - 'season_id': episode_response.get('season_id'), - 
'season_number': episode_response.get('season_number'), - 'episode': episode_response.get('title'), - 'episode_number': episode_response.get('sequence_number'), - 'formats': formats, - 'thumbnails': [{ - 'url': thumb.get('source'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], - 'subtitles': { - lang: [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] for lang, subtitle_data in get_streams('subtitles') - }, - 'chapters': chapters + 'id': data['id'], + 'title': ' \u2013 '.join(( + ('{}{}'.format( + format_field(metadata, 'season_title'), + format_field(metadata, 'episode', ' Episode %s'))), + format_field(data, 'title'))), + **traverse_obj(data, { + 'episode': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', ({int}, {float_or_none})), + 'episode_number': ('sequence_number', ({int}, {float_or_none})), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'language': ('audio_locale', {str}), + }, get_all=False), + } + + @staticmethod + def _transform_movie_response(data): + metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {} + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ (?P(?:\w{2}(?:-\w{2})?/)?) 
- series/(?P<id>\w+) - (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' + series/(?P<id>\w+)''' _TESTS = [{ 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', + 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750', + # XXX: `thumbnail` does not get set from `thumbnails` in playlist + # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, }, 'playlist_mincount': 10, }, { @@ -279,41 +513,180 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) - - series_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, - note='Retrieving series metadata', query=params) - - seasons_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, - note='Retrieving season list', query=params) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') def entries(): - for season in seasons_response['items']: - episodes_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, - note=f'Retrieving episode list for {season.get("slug_title")}', query=params) - for episode in episodes_response['items']: - episode_id = episode['id'] - episode_display_id = episode['slug_title'] - yield { - '_type': 'url', - 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', - 'ie_key': CrunchyrollBetaIE.ie_key(), - 'id': episode_id, - 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), - 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode.get('duration_ms'), 1000), - 'series': episode.get('series_title'), - 'series_id': episode.get('series_id'), - 'season': episode.get('season_title'), - 'season_id': episode.get('season_id'), - 'season_number': episode.get('season_number'), - 'episode': episode.get('title'), - 'episode_number': episode.get('sequence_number'), - 'language': episode.get('audio_locale'), - } + seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons') + for season in traverse_obj(seasons_response, ('items', ..., {dict})): + episodes_response = self._call_cms_api_signed( + f'episodes?season_id={season["id"]}', season['id'], lang, 'episode list') + for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}', + CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response)) - return self.playlist_result(entries(), internal_id, series_response.get('title')) + return self.playlist_result( + entries(), internal_id, + **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, { + 'title': ('title', {str}), + 'description': ('description', {lambda x: x.replace(r'\r\n', '\n')}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'thumbnails': ('images', ..., ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }))) + + +class CrunchyrollMusicIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:music' + _VALID_URL = r'''(?x) +
https?://(?:www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + watch/(?P<type>concert|musicvideo)/(?P<id>\w+)''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV5B02C79', + 'display_id': 'egaono-hana', + 'title': 'Egaono Hana', + 'track': 'Egaono Hana', + 'artists': ['Goose house'], + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genres': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV88BB7F2C', + 'display_id': 'crossing-field', + 'title': 'Crossing Field', + 'track': 'Crossing Field', + 'artists': ['LiSA'], + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genres': ['Anime'], + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'no longer exists', + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MC2E2AC135', + 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', + 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'artists': ['LiSA'], + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'description': 'md5:747444e7e6300907b7a43f0a0503072e', + 'genres': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79/egaono-hana', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type') + path, name = { + 'concert': ('concerts', 'concert info'), + 'musicvideo': ('music_videos', 'music video info'), + }[object_type] + response = traverse_obj(self._call_api(f'{path}/{internal_id}', internal_id, lang, name), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + result = self._transform_music_response(response) + + if not self._IS_PREMIUM and response.get('isPremiumOnly'): + message = f'This {response.get("type") or "media"} is for premium members only' + if CrunchyrollBaseIE._REFRESH_TOKEN: + self.raise_no_formats(message, expected=True, video_id=internal_id) + else: + self.raise_login_required(message, method='password', metadata_available=True) + else: + result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) + + return result + + @staticmethod + def _transform_music_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'display_id': 'slug', + 'title': 'title', + 'track': 'title', + 'artists': ('artist', 'name', all), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genres': ('genres', ..., 'displayValue'), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } + + +class CrunchyrollArtistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:artist' + _VALID_URL = r'''(?x) +
https?://(?:www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + artist/(?P<id>\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D', + 'info_dict': { + 'id': 'MA179CB50D', + 'title': 'LiSA', + 'genres': ['Anime', 'J-Pop', 'Rock'], + 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', + }, + 'playlist_mincount': 83, + }, { + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + response = traverse_obj(self._call_api( + f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0)) + + def entries(): + for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]: + for internal_id in traverse_obj(response, (attribute, ...)): + yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id) + + return self.playlist_result(entries(), **self._transform_artist_response(response)) + + @staticmethod + def _transform_artist_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': 'name', + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genres': ('genres', ..., 'displayValue'), + }), + }
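A note on the `('images', ..., ..., ...)` paths used in the Crunchyroll transforms above: each `...` in a `traverse_obj` path branches into every value at that level, which is what flattens the API's nested image arrays into a flat `thumbnails` list. A minimal standalone sketch of that behaviour (not part of the diff; the sample data is hypothetical, merely shaped like the API response):

from yt_dlp.utils import traverse_obj

# Hypothetical response shape: image kind -> list of lists of image dicts
data = {'images': {'poster_tall': [[
    {'source': 'https://img.example/small.jpg', 'width': 320, 'height': 180},
    {'source': 'https://img.example/large.jpg', 'width': 1920, 'height': 1080},
]]}}

# 'images' -> each kind -> each outer list -> each image dict; the trailing
# dict template then builds one thumbnail entry per matched image
thumbnails = traverse_obj(data, ('images', ..., ..., ..., {
    'url': 'source',
    'width': 'width',
    'height': 'height',
}))
assert thumbnails == [
    {'url': 'https://img.example/small.jpg', 'width': 320, 'height': 180},
    {'url': 'https://img.example/large.jpg', 'width': 1920, 'height': 1080},
]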
diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index 0075680e8f..e940c2dea3 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -1,10 +1,12 @@ import re from .common import InfoExtractor +from .senategov import SenateISVPIE +from .ustream import UstreamIE from ..compat import compat_HTMLParseError from ..utils import ( - determine_ext, ExtractorError, + determine_ext, extract_attributes, find_xpath_attr, get_element_by_attribute, @@ -19,8 +21,6 @@ str_to_int, unescapeHTML, ) -from .senategov import SenateISVPIE -from .ustream import UstreamIE class CSpanIE(InfoExtractor): @@ -43,7 +43,7 @@ class CSpanIE(InfoExtractor): 'ext': 'mp4', 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', - } + }, }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', 'info_dict': { @@ -61,7 +61,7 @@ class CSpanIE(InfoExtractor): }, 'params': { 'skip_download': True, # m3u8 downloads - } + }, }, { # Ustream embedded video 'url': 'https://www.c-span.org/video/?114917-1/armed-services', @@ -151,7 +151,7 @@ def add_referer(formats): # Obsolete # We first look for clipid, because clipprog always appears before - patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] + patterns = [rf'id=\'clip({t})\'\s*value=\'([0-9]+)\'' for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) if results: matches = results[0] @@ -183,13 +183,13 @@ def get_text_attr(d, attr): return d.get(attr, {}).get('#text') data = self._download_json( - 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + f'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5={video_type}&id={video_id}', video_id)['video'] if data['@status'] != 'Success': - raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) + raise ExtractorError('{} said: {}'.format(self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc =
self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), + f'http://www.c-span.org/common/services/flashXml.php?{video_type}id={video_id}', video_id) description = self._html_search_meta('description', webpage) @@ -205,7 +205,7 @@ def get_text_attr(d, attr): formats = [] for quality in f.get('qualities', []): formats.append({ - 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'format_id': '{}-{}p'.format(get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), 'url': unescapeHTML(get_text_attr(quality, 'file')), 'height': int_or_none(get_text_attr(quality, 'height')), 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), @@ -216,13 +216,13 @@ def get_text_attr(d, attr): continue formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] + m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path}] add_referer(formats) entries.append({ - 'id': '%s_%d' % (video_id, partnum + 1), + 'id': f'{video_id}_{partnum + 1}', 'title': ( title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), + f'{title} part {partnum + 1}'), 'formats': formats, 'description': description, 'thumbnail': thumbnail, @@ -230,7 +230,7 @@ def get_text_attr(d, attr): 'subtitles': { 'en': [{ 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') + 'ext': determine_ext(capfile, 'dfxp'), }], } if capfile else None, }) @@ -257,8 +257,8 @@ class CSpanCongressIE(InfoExtractor): 'title': 'Congressional Chronicle - Members of Congress, Hearings and More', 'description': 'md5:54c264b7a8f219937987610243305a84', 'thumbnail': r're:https://ximage.c-spanvideo.org/.+', - 'ext': 'mp4' - } + 'ext': 'mp4', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/ctsnews.py b/yt_dlp/extractor/ctsnews.py index cec178f034..b249c7b553 100644 --- a/yt_dlp/extractor/ctsnews.py +++ b/yt_dlp/extractor/ctsnews.py @@ -1,6 +1,6 @@ from .common import InfoExtractor -from ..utils import unified_timestamp from .youtube import YoutubeIE +from ..utils import unified_timestamp class CtsNewsIE(InfoExtractor): @@ -16,7 +16,7 @@ class CtsNewsIE(InfoExtractor): 'description': '以色列和黎巴嫩真主黨,爆發五年最嚴重衝突,雙方砲轟交火,兩名以軍死亡,還有一名西班牙籍的聯合國維和人員也不幸罹難。大陸陝西、河南、安徽、江蘇和湖北五個省份出現大暴雪,嚴重影響陸空交通,不過九華山卻出現...', 'timestamp': 1422528540, 'upload_date': '20150129', - } + }, }, { # News count not appear on page but still available in database 'url': 'http://news.cts.com.tw/cts/international/201309/201309031304098.html', @@ -29,7 +29,7 @@ class CtsNewsIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1378205880, 'upload_date': '20130903', - } + }, }, { # With Youtube embedded video 'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html', diff --git a/yt_dlp/extractor/ctv.py b/yt_dlp/extractor/ctv.py index f125c1ce99..a41dab11b1 100644 --- a/yt_dlp/extractor/ctv.py +++ b/yt_dlp/extractor/ctv.py @@ -41,9 +41,9 @@ def _real_extract(self, url): } } } -}''' % display_id, +}''' % display_id, # noqa: UP031 })['data']['resolvedPath']['lastSegment']['content'] video_id = content['axisId'] return self.url_result( - '9c9media:%s:%s' % (content['videoPlayerDestCode'], video_id), + '9c9media:{}:{}'.format(content['videoPlayerDestCode'], video_id), 'NineCNineMedia', video_id) diff --git a/yt_dlp/extractor/ctvnews.py b/yt_dlp/extractor/ctvnews.py index ad3f0d8e4d..ebed9eb2d3 100644 --- a/yt_dlp/extractor/ctvnews.py +++
b/yt_dlp/extractor/ctvnews.py @@ -16,7 +16,7 @@ class CTVNewsIE(InfoExtractor): 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', 'timestamp': 1467286284, 'upload_date': '20160630', - } + }, }, { 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', 'info_dict': @@ -49,14 +49,14 @@ def ninecninemedia_url_result(clip_id): return { '_type': 'url_transparent', 'id': clip_id, - 'url': '9c9media:ctvnews_web:%s' % clip_id, + 'url': f'9c9media:ctvnews_web:{clip_id}', 'ie_key': 'NineCNineMedia', } if page_id.isdigit(): return ninecninemedia_url_result(page_id) else: - webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ + webpage = self._download_webpage(f'http://www.ctvnews.ca/{page_id}', page_id, query={ 'ot': 'example.AjaxPageLayout.ot', 'maxItemsPerPage': 1000000, }) diff --git a/yt_dlp/extractor/cultureunplugged.py b/yt_dlp/extractor/cultureunplugged.py index 2fb22800f3..8e6579c355 100644 --- a/yt_dlp/extractor/cultureunplugged.py +++ b/yt_dlp/extractor/cultureunplugged.py @@ -1,10 +1,8 @@ import time from .common import InfoExtractor -from ..utils import ( - int_or_none, - HEADRequest, -) +from ..networking import HEADRequest +from ..utils import int_or_none class CultureUnpluggedIE(InfoExtractor): @@ -22,7 +20,7 @@ class CultureUnpluggedIE(InfoExtractor): 'creator': 'Coldstream Creative', 'duration': 2203, 'view_count': int, - } + }, }, { 'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662', 'only_matching': True, @@ -37,7 +35,7 @@ def _real_extract(self, url): self._request_webpage(HEADRequest( 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( - 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) + f'http://www.cultureunplugged.com/movie-data/cu-{video_id}.json', display_id) video_url = movie_data['url'] title = movie_data['title'] @@ -48,11 +46,11 @@ def _real_extract(self, url): view_count = int_or_none(movie_data.get('views')) thumbnails = [{ - 'url': movie_data['%s_thumb' % size], + 'url': movie_data[f'{size}_thumb'], 'id': size, 'preference': preference, } for preference, size in enumerate(( - 'small', 'large')) if movie_data.get('%s_thumb' % size)] + 'small', 'large')) if movie_data.get(f'{size}_thumb')] return { 'id': video_id, diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 941cf4e79c..f5a2c3c311 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -2,7 +2,6 @@ import urllib.parse from .common import InfoExtractor -from ..compat import compat_str from ..utils import ExtractorError, int_or_none, urlencode_postdata @@ -16,7 +15,7 @@ def _handle_errors(self, result): if isinstance(error, dict): error = ', '.join(error.values()) raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) + f'{self.IE_NAME} said: {error}', expected=True) def _call_api(self, path, video_id, query=None): headers = {} @@ -59,7 +58,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'series_id': '2', 'thumbnail': r're:https://img.curiositystream.com/.+\.jpg', 'tags': [], - 'duration': 158 + 'duration': 158, }, 'params': { # m3u8 download @@ -157,10 +156,10 @@ def _real_extract(self, url): collection = self._call_api(collection_id, collection_id) entries = [] for media in collection.get('media', []): - media_id = compat_str(media.get('id')) + media_id = str(media.get('id')) media_type, ie = ('series', 
CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) entries.append(self.url_result( - 'https://curiositystream.com/%s/%s' % (media_type, media_id), + f'https://curiositystream.com/{media_type}/{media_id}', ie=ie.ie_key(), video_id=media_id)) return self.playlist_result( entries, collection_id, diff --git a/yt_dlp/extractor/cwtv.py b/yt_dlp/extractor/cwtv.py index 9b83264ee1..870d4f39e3 100644 --- a/yt_dlp/extractor/cwtv.py +++ b/yt_dlp/extractor/cwtv.py @@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor): 'timestamp': 1444107300, 'age_limit': 14, 'uploader': 'CWTV', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'chapters': 'count:4', + 'episode': 'Episode 20', + 'season': 'Season 11', }, 'params': { # m3u8 download @@ -71,7 +75,7 @@ def _real_extract(self, url): raise ExtractorError(data['msg'], expected=True) video_data = data['video'] title = video_data['title'] - mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id + mpx_url = video_data.get('mpx_url') or f'http://link.theplatform.com/s/cwtv/media/guid/2703454149/{video_id}?formats=M3U' season = str_or_none(video_data.get('season')) episode = str_or_none(video_data.get('episode')) @@ -91,5 +95,5 @@ def _real_extract(self, url): 'timestamp': parse_iso8601(video_data.get('start_time')), 'age_limit': parse_age_limit(video_data.get('rating')), 'ie_key': 'ThePlatform', - 'thumbnail': video_data.get('large_thumbnail') + 'thumbnail': video_data.get('large_thumbnail'), } diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py index 73f2439b31..59c8ab473d 100644 --- a/yt_dlp/extractor/cybrary.py +++ b/yt_dlp/extractor/cybrary.py @@ -45,7 +45,7 @@ def _get_vimeo_id(self, activity_id): class CybraryIE(CybraryBaseIE): - _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' + _VALID_URL = r'https?://app\.cybrary\.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', 'md5': '9ae12d37e555cb2ed554223a71a701d0', @@ -61,9 +61,9 @@ class CybraryIE(CybraryBaseIE): 'series': 'Cybrary Orientation', 'uploader': 'Cybrary', 'chapter': 'Cybrary Orientation Series', - 'chapter_id': '63110' + 'chapter_id': '63110', }, - 'expected_warnings': ['No authenticators for vimeo'] + 'expected_warnings': ['No authenticators for vimeo'], }, { 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686', 'md5': '62f26547dccc59c44363e2a13d4ad08d', @@ -79,9 +79,9 @@ class CybraryIE(CybraryBaseIE): 'series': 'AZ-500: Microsoft Azure Security Technologies', 'uploader': 'Cybrary', 'chapter': 'Implement Network Security', - 'chapter_id': '52693' + 'chapter_id': '52693', }, - 'expected_warnings': ['No authenticators for vimeo'] + 'expected_warnings': ['No authenticators for vimeo'], }] def _real_extract(self, url): @@ -93,7 +93,7 @@ def _real_extract(self, url): raise ExtractorError('The activity is not a video', expected=True) module = next((m for m in course.get('learning_modules') or [] - if int(activity_id) in traverse_obj(m, ('activities', ..., 'id') or [])), None) + if int(activity_id) in traverse_obj(m, ('activities', ..., 'id'))), None) vimeo_id = self._get_vimeo_id(activity_id) @@ -105,28 +105,28 @@ def _real_extract(self, url): 'chapter': module.get('title'), 'chapter_id': str_or_none(module.get('id')), 'title': activity.get('title'), - 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}}) +
'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'referer': 'https://api.cybrary.it'}), } class CybraryCourseIE(CybraryBaseIE): - _VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https?://app\.cybrary\.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' _TESTS = [{ 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', 'info_dict': { - 'id': 898, + 'id': '898', 'title': 'AZ-500: Microsoft Azure Security Technologies', - 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4' + 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4', }, - 'playlist_count': 59 + 'playlist_count': 59, }, { 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation', 'info_dict': { - 'id': 1245, + 'id': '1245', 'title': 'Cybrary Orientation', - 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e' + 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e', }, - 'playlist_count': 4 + 'playlist_count': 4, }] def _real_extract(self, url):
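One detail of the Cybrary `module = next(...)` change above deserves a callout: in the old code the `or []` sat inside the `traverse_obj()` call, so it applied to the path tuple rather than to the result — and a non-empty tuple is always truthy, making it a silent no-op. A small illustration (not from the diff):

from yt_dlp.utils import traverse_obj

module = {'activities': [{'id': 63102}, {'id': 63110}]}

# `or` binds to the tuple, not to the traverse_obj() result, so the old
# `('activities', ..., 'id') or []` always evaluated to the tuple itself
assert (('activities', ..., 'id') or []) == ('activities', ..., 'id')

# The fix simply drops the stray `or []`: a branching path that matches
# nothing already evaluates to an empty list, keeping the `in` test safe
assert traverse_obj({}, ('activities', ..., 'id')) == []
assert 63102 in traverse_obj(module, ('activities', ..., 'id'))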
diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py new file mode 100644 index 0000000000..4e81aa4a7b --- /dev/null +++ b/yt_dlp/extractor/dacast.py @@ -0,0 +1,158 @@ +import hashlib +import re +import time + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + classproperty, + float_or_none, + traverse_obj, + url_or_none, +) + + +class DacastBaseIE(InfoExtractor): + _URL_TYPE = None + + @classproperty + def _VALID_URL(cls): + return fr'https?://iframe\.dacast\.com/{cls._URL_TYPE}/(?P<user_id>[\w-]+)/(?P<id>[\w-]+)' + + @classproperty + def _EMBED_REGEX(cls): + return [rf'<iframe[^>]+\bsrc=["\'](?P<url>{cls._VALID_URL})'] + + _API_INFO_URL = 'https://playback.dacast.com/content/info' + + @classmethod + def _get_url_from_id(cls, content_id): + user_id, media_id = content_id.split(f'-{cls._URL_TYPE}-') + return f'https://iframe.dacast.com/{cls._URL_TYPE}/{user_id}/{media_id}' + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for content_id in re.findall( + rf'<script[^>]+\bsrc=["\']https://player\.dacast\.com/js/player\.js\?contentId=([\w-]+-{cls._URL_TYPE}-[\w-]+)["\']', webpage): + yield cls._get_url_from_id(content_id) + + +class DacastVODIE(DacastBaseIE): + _URL_TYPE = 'vod' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/vod/acae82153ef4d7a7344ae4eaa86af534/1c6143e3-5a06-371d-8695-19b96ea49090', + 'info_dict': { + 'id': '1c6143e3-5a06-371d-8695-19b96ea49090', + 'ext': 'mp4', + 'uploader_id': 'acae82153ef4d7a7344ae4eaa86af534', + 'title': '2_4||Adnexal mass characterisation: O-RADS US and MRI||N.
Bharwani, London/UK', + 'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/', + 'info_dict': { + 'id': 'b6674869-f08a-23c5-1d7b-81f5309e1a90', + 'ext': 'mp4', + 'title': '4-HowToEmbedVideo.mp4', + 'uploader_id': '3b67c4a9-3886-4eb1-d0eb-39b23b14bef3', + 'thumbnail': 'https://universe-files.dacast.com/d26ab48f-a52a-8783-c42e-a90290ba06b6.png', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://gist.githubusercontent.com/bashonly/4ad249ef2910346fbdf3809b220f11ee/raw/87349778d4af1a80b1fcc3beb9c88108de5858f5/dacast_embeds.html', + 'info_dict': { + 'id': 'e7df418e-a83b-7a7f-7b5e-1a667981e8fa', + 'ext': 'mp4', + 'title': 'Evening Service 2-5-23', + 'uploader_id': '943bb1ab3c03695ba85330d92d6d226e', + 'thumbnail': 'https://universe-files.dacast.com/337472b3-e92c-2ea4-7eb7-5700da477f67', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} + info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False) + access = self._download_json( + 'https://playback.dacast.com/content/access', video_id, + note='Downloading access JSON', query=query, expected_status=403) + + error = access.get('error') + if error in ('Broadcaster has been blocked', 'Content is offline'): + raise ExtractorError(error, expected=True) + elif error: + raise ExtractorError(f'Dacast API says "{error}"') + + hls_url = access['hls'] + hls_aes = {} + + if 'DRM_EXT' in hls_url: + self.report_drm(video_id) + elif '/uspaes/' in hls_url: + # From https://player.dacast.com/js/player.js + ts = int(time.time()) + signature = hashlib.sha1( + f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw').digest().hex() + hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}' + + for retry in self.RetryManager(): + try: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + # CDN will randomly respond with 403 + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + retry.error = e + continue + raise + + return { + 'id': video_id, + 'uploader_id': user_id, + 'formats': formats, + 'hls_aes': hls_aes or None, + **traverse_obj(info, ('contentInfo', { + 'title': 'title', + 'duration': ('duration', {float_or_none}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + })), + } + + +class DacastPlaylistIE(DacastBaseIE): + _URL_TYPE = 'playlist' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/playlist/943bb1ab3c03695ba85330d92d6d226e/b632eb053cac17a9c9a02bcfc827f2d8', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://gist.githubusercontent.com/bashonly/7efb606f49f3c6e07ea0327de5a661d1/raw/05a16eac830245ea301fb0a585023bec71e6093c/dacast_playlist_embed.html', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + + def _real_extract(self, url): + user_id, playlist_id = self._match_valid_url(url).group('user_id', 'id') + info = self._download_json( + self._API_INFO_URL, playlist_id, note='Downloading playlist JSON', query={ + 'contentId': 
f'{user_id}-playlist-{playlist_id}', + 'provider': 'universe', + })['contentInfo'] + + def entries(info): + for video in traverse_obj(info, ('features', 'playlist', 'contents', lambda _, v: v['id'])): + yield self.url_result( + DacastVODIE._get_url_from_id(video['id']), DacastVODIE, video['id'], video.get('title')) + + return self.playlist_result(entries(info), playlist_id, info.get('title')) diff --git a/yt_dlp/extractor/daftsex.py b/yt_dlp/extractor/daftsex.py deleted file mode 100644 index 551d5e3abe..0000000000 --- a/yt_dlp/extractor/daftsex.py +++ /dev/null @@ -1,141 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_b64decode -from ..utils import ( - int_or_none, - js_to_json, - parse_count, - parse_duration, - traverse_obj, - try_get, - unified_timestamp, -) - - -class DaftsexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)' - _TESTS = [{ - 'url': 'https://daftsex.com/watch/-35370899_456246186', - 'md5': 'd95135e6cea2d905bea20dbe82cda64a', - 'info_dict': { - 'id': '-35370899_456246186', - 'ext': 'mp4', - 'title': 'just relaxing', - 'description': 'just relaxing - Watch video Watch video in high quality', - 'upload_date': '20201113', - 'timestamp': 1605261911, - 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', - }, - }, { - 'url': 'https://daftsex.com/watch/-156601359_456242791', - 'info_dict': { - 'id': '-156601359_456242791', - 'ext': 'mp4', - 'title': 'Skye Blue - Dinner And A Show', - 'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality', - 'upload_date': '20200916', - 'timestamp': 1600250735, - 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('name', webpage, 'title') - timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) - description = self._html_search_meta('description', webpage, 'Description', default=None) - - duration = parse_duration(self._search_regex( - r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})', - webpage, 'duration', fatal=False)) - views = parse_count(self._search_regex( - r'Views: ([0-9 ]+)', - webpage, 'views', fatal=False)) - - player_hash = self._search_regex( - r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}', - webpage, 'player hash') - player_color = self._search_regex( - r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}', - webpage, 'player color', fatal=False) or '' - - embed_page = self._download_webpage( - 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), - video_id, headers={'Referer': url}) - video_params = self._parse_json( - self._search_regex( - r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>', - embed_page, 'video parameters'), - video_id, transform_source=js_to_json) - - server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8') - - cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {} - if cdn_files: - formats = [] - for format_id, format_data in cdn_files.items(): - ext, height = format_id.split('_') - formats.append({ - 'format_id': format_id, - 'url': f'{server_domain}/videos/{video_id.replace("_",
"/")}/{height}.mp4?extra={format_data.split(".")[-1]}', - 'height': int_or_none(height), - 'ext': ext, - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'duration': duration, - 'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')), - 'timestamp': timestamp, - 'view_count': views, - 'age_limit': 18, - } - - item = self._download_json( - f'{server_domain}/method/video.get/{video_id}', video_id, - headers={'Referer': url}, query={ - 'token': video_params['video']['access_token'], - 'videos': video_id, - 'ckey': video_params['c_key'], - 'credentials': video_params['video']['credentials'], - })['response']['items'][0] - - formats = [] - for f_id, f_url in item.get('files', {}).items(): - if f_id == 'external': - return self.url_result(f_url) - ext, height = f_id.split('_') - height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height)) - if height_extra_key: - formats.append({ - 'format_id': f'{height}p', - 'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', - 'height': int_or_none(height), - 'ext': ext, - }) - - thumbnails = [] - for k, v in item.items(): - if k.startswith('photo_') and v: - width = k.replace('photo_', '') - thumbnails.append({ - 'id': width, - 'url': v, - 'width': int_or_none(width), - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'comment_count': int_or_none(item.get('comments')), - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'view_count': views, - 'age_limit': 18, - } diff --git a/yt_dlp/extractor/dailymail.py b/yt_dlp/extractor/dailymail.py index 43401e1115..540676ac0f 100644 --- a/yt_dlp/extractor/dailymail.py +++ b/yt_dlp/extractor/dailymail.py @@ -1,8 +1,8 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - int_or_none, determine_protocol, + int_or_none, + join_nonempty, try_get, unescapeHTML, ) @@ -19,7 +19,7 @@ class DailyMailIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', - } + }, }, { 'url': 'http://www.dailymail.co.uk/embed/video/1295863.html', 'only_matching': True, @@ -35,8 +35,8 @@ def _real_extract(self, url): sources_url = (try_get( video_data, (lambda x: x['plugins']['sources']['url'], - lambda x: x['sources']['url']), compat_str) - or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) + lambda x: x['sources']['url']), str) + or f'http://www.dailymail.co.uk/api/player/{video_id}/video-sources.json') video_sources = self._download_json(sources_url, video_id) body = video_sources.get('body') @@ -53,7 +53,7 @@ def _real_extract(self, url): is_hls = container == 'M2TS' protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) formats.append({ - 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), + 'format_id': join_nonempty('hls' if is_hls else protocol, tbr), 'url': rendition_url, 'width': int_or_none(rendition.get('frameWidth')), 'height': int_or_none(rendition.get('frameHeight')), diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 2a44718fb5..632335e5b0 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -1,9 +1,10 @@ import functools import json import re +import urllib.parse from .common import 
diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 2a44718fb5..632335e5b0 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -1,9 +1,10 @@ import functools import json import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -44,36 +45,41 @@ def _real_initialize(self): self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self.get_param('age_limit')) self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off') + def _get_token(self, xid): + cookies = self._get_dailymotion_cookies() + token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') + if token: + return token + + data = { + 'client_id': 'f1a362d288c1b98099c7', + 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', + } + username, password = self._get_login_info() + if username: + data.update({ + 'grant_type': 'password', + 'password': password, + 'username': username, + }) + else: + data['grant_type'] = 'client_credentials' + try: + token = self._download_json( + 'https://graphql.api.dailymotion.com/oauth/token', + None, 'Downloading Access Token', + data=urlencode_postdata(data))['access_token'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError(self._parse_json( + e.cause.response.read().decode(), xid)['error_description'], expected=True) + raise + self._set_dailymotion_cookie('access_token' if username else 'client_token', token) + return token + def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): if not self._HEADERS.get('Authorization'): - cookies = self._get_dailymotion_cookies() - token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token') - if not token: - data = { - 'client_id': 'f1a362d288c1b98099c7', - 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5', - } - username, password = self._get_login_info() - if username: - data.update({ - 'grant_type': 'password', - 'password': password, - 'username': username, - }) - else: - data['grant_type'] = 'client_credentials' - try: - token = self._download_json( - 'https://graphql.api.dailymotion.com/oauth/token', - None, 'Downloading Access Token', - data=urlencode_postdata(data))['access_token'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError(self._parse_json( - e.cause.read().decode(), xid)['error_description'], expected=True) - raise - self._set_dailymotion_cookie('access_token' if username else 'client_token', token) - self._HEADERS['Authorization'] = 'Bearer ' + token + self._HEADERS['Authorization'] = f'Bearer {self._get_token(xid)}' resp = self._download_json( 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({ @@ -81,7 +87,7 @@ def _call_api(self, object_type, xid, object_fields, note, filter_extra=None): %s(xid: "%s"%s) { %s } -}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields), +}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields), # noqa: UP031 }).encode(), headers=self._HEADERS) obj = resp['data'][object_type] if not obj: @@ -93,7 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// (?: - (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)| + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)| (?:www\.)?lequipe\.fr/video ) [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
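To make the `_VALID_URL` widening in the hunk above concrete: `player(?:/\w+)?\.html\?` also accepts the keyed player pages exercised by the new `only_matching` tests below. A quick standalone check (the pattern is copied from the diff; the URLs are from the tests):

import re

_VALID_URL = r'''(?ix)
    https?://
        (?:
            (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)|
            (?:www\.)?lequipe\.fr/video
        )
        [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
'''

for url in ('https://geo.dailymotion.com/player.html?video=x89eyek',  # previously supported form
            'https://geo.dailymotion.com/player/x86gw.html?video=k46oCapRs4iikoz9DWy'):  # new keyed form
    assert re.match(_VALID_URL, url), url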
@@ -107,13 +113,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'id': 'x5kesuj', 'ext': 'mp4', 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', - 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', 'duration': 187, 'timestamp': 1493651285, 'upload_date': '20170501', 'uploader': 'Deadline', 'uploader_id': 'x1xm8ri', 'age_limit': 0, + 'view_count': int, + 'like_count': int, + 'tags': ['hollywood', 'celeb', 'celebrity', 'movies', 'red carpet'], + 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/K456B1aXqIx58LKWQ/x1080', }, }, { 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', @@ -132,8 +142,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'view_count': int, 'like_count': int, 'tags': ['en_quete_d_esprit'], - 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080', - } + 'thumbnail': r're:https://(?:s[12]\.)dmcdn\.net/v/Tncwi1YNg_RUl7ueu/x1080', + }, }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'md5': '2137c41a8e78554bb09225b8eb322406', @@ -201,6 +211,12 @@ class DailymotionIE(DailymotionBaseInfoExtractor): }, { 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw', 'only_matching': True, + }, { + 'url': 'https://geo.dailymotion.com/player/x86gw.html?video=k46oCapRs4iikoz9DWy', + 'only_matching': True, + }, { + 'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video', + 'only_matching': True, }] _GEO_BYPASS = False _COMMON_MEDIA_FIELDS = '''description @@ -244,8 +260,8 @@ def _real_extract(self, url): %s audienceCount isOnAir - }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata', - 'password: "%s"' % self.get_param('videopassword') if password else None) + }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata', # noqa: UP031 + 'password: "{}"'.format(self.get_param('videopassword')) if password else None) xid = media['xid'] metadata = self._download_json( @@ -261,7 +277,7 @@ def _real_extract(self, url): allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list) self.raise_geo_restricted(msg=title, countries=allowed_countries) raise ExtractorError( - '%s said: %s' % (self.IE_NAME, title), expected=True) + f'{self.IE_NAME} said: {title}', expected=True) title = metadata['title'] is_live = media.get('isOnAir') @@ -347,7 +363,7 @@ def _fetch_page(self, playlist_id, page): } } }''' % ('false' if self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page), - 'Downloading page %d' % page)['videos'] + f'Downloading page {page}')['videos'] for edge in videos['edges']: node = edge['node'] yield self.url_result( @@ -380,12 +396,58 @@ def _extract_embed_urls(cls, url, webpage): r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage): for p in re.findall(r'list\[\]=/playlist/([^/]+)/', unescapeHTML(mobj.group('url'))): - yield '//dailymotion.com/playlist/%s' % p + yield f'//dailymotion.com/playlist/{p}' + + +class DailymotionSearchIE(DailymotionPlaylistBaseIE): + IE_NAME = 'dailymotion:search' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/search/(?P<id>[^/?#]+)/videos' + _PAGE_SIZE = 20 + _TESTS = [{ + 'url':
'http://www.dailymotion.com/search/king of turtles/videos', + 'info_dict': { + 'id': 'king of turtles', + 'title': 'king of turtles', + }, + 'playlist_mincount': 90, + }] + _SEARCH_QUERY = 'query SEARCH_QUERY( $query: String! $page: Int $limit: Int ) { search { videos( query: $query first: $limit page: $page ) { edges { node { xid } } } } } ' + + def _call_search_api(self, term, page, note): + if not self._HEADERS.get('Authorization'): + self._HEADERS['Authorization'] = f'Bearer {self._get_token(term)}' + resp = self._download_json( + 'https://graphql.api.dailymotion.com/', None, note, data=json.dumps({ + 'operationName': 'SEARCH_QUERY', + 'query': self._SEARCH_QUERY, + 'variables': { + 'limit': 20, + 'page': page, + 'query': term, + }, + }).encode(), headers=self._HEADERS) + obj = traverse_obj(resp, ('data', 'search', {dict})) + if not obj: + raise ExtractorError( + traverse_obj(resp, ('errors', 0, 'message', {str})) or 'Could not fetch search data') + + return obj + + def _fetch_page(self, term, page): + page += 1 + response = self._call_search_api(term, page, f'Searching "{term}" page {page}') + for xid in traverse_obj(response, ('videos', 'edges', ..., 'node', 'xid')): + yield self.url_result(f'https://www.dailymotion.com/video/{xid}', DailymotionIE, xid) + + def _real_extract(self, url): + term = urllib.parse.unquote_plus(self._match_id(url)) + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, term), self._PAGE_SIZE), term, term) class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P<user>[^/?#]+)' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', 'info_dict': { diff --git a/yt_dlp/extractor/dailywire.py b/yt_dlp/extractor/dailywire.py index f177c9d9cb..3d5bb257ce 100644 --- a/yt_dlp/extractor/dailywire.py +++ b/yt_dlp/extractor/dailywire.py @@ -35,7 +35,7 @@ class DailyWireIE(DailyWireBaseIE): 'creator': 'Caroline Roberts', 'series_id': 'ckzplm0a097fn0826r2vc3j7h', 'series': 'China: The Enemy Within', - } + }, }, { 'url': 'https://www.dailywire.com/episode/ep-124-bill-maher', 'info_dict': { @@ -48,7 +48,7 @@ class DailyWireIE(DailyWireBaseIE): 'description': 'md5:adb0de584bcfa9c41374999d9e324e98', 'series_id': 'cjzvep7270hp00786l9hwccob', 'series': 'The Sunday Special', - } + }, }, { 'url': 'https://www.dailywire.com/videos/the-hyperions', 'only_matching': True, @@ -95,7 +95,7 @@ class DailyWirePodcastIE(DailyWireBaseIE): 'description': 'md5:c4afbadda4e1c38a4496f6d62be55634', 'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1639506575562.jpg', 'duration': 900.117667, - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/damtomo.py b/yt_dlp/extractor/damtomo.py index 0e08e4f651..9ac0b6f2f4 100644 --- a/yt_dlp/extractor/damtomo.py +++ b/yt_dlp/extractor/damtomo.py @@ -2,7 +2,6 @@ from .common import InfoExtractor from ..utils import ExtractorError, clean_html, int_or_none, try_get, unified_strdate -from ..compat import compat_str class DamtomoBaseIE(InfoExtractor): @@ -32,7 +31,7 @@ def _real_extract(self, url): # and never likely to happen in the future transform_source=lambda x: re.sub(r'\s*encoding="[^"]+?"', '', x)) m3u8_url = try_get(stream_tree, lambda x: x.find( - './/d:streamingUrl', {'d':
self._DKML_XML_NS}).text.strip(), compat_str) + './/d:streamingUrl', {'d': self._DKML_XML_NS}).text.strip(), str) if not m3u8_url: raise ExtractorError('Failed to obtain m3u8 URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') @@ -68,7 +67,7 @@ class DamtomoVideoIE(DamtomoBaseIE): 'track': 'Get Wild', 'artist': 'TM NETWORK(TMN)', 'upload_date': '20201226', - } + }, }] @@ -83,7 +82,6 @@ class DamtomoRecordIE(DamtomoBaseIE): 'info_dict': { 'id': '27376862', 'title': 'イカSUMMER [良音]', - 'description': None, 'uploader': 'NANA', 'uploader_id': 'MzAyMDExNTY', 'upload_date': '20210721', @@ -91,7 +89,7 @@ class DamtomoRecordIE(DamtomoBaseIE): 'like_count': 1, 'track': 'イカSUMMER [良音]', 'artist': 'ORANGE RANGE', - } + }, }, { 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27489418', 'info_dict': { @@ -105,5 +103,5 @@ class DamtomoRecordIE(DamtomoBaseIE): 'like_count': 3, 'track': '心みだれて〜say it with flowers〜(生音)', 'artist': '小林明子', - } + }, }] diff --git a/yt_dlp/extractor/dangalplay.py b/yt_dlp/extractor/dangalplay.py new file mode 100644 index 0000000000..50e4136b57 --- /dev/null +++ b/yt_dlp/extractor/dangalplay.py @@ -0,0 +1,197 @@ +import hashlib +import json +import re +import time + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ExtractorError, int_or_none, join_nonempty, url_or_none +from ..utils.traversal import traverse_obj + + +class DangalPlayBaseIE(InfoExtractor): + _NETRC_MACHINE = 'dangalplay' + _OTV_USER_ID = None + _LOGIN_HINT = 'Pass credentials as -u "token" -p "USER_ID" where USER_ID is the `otv_user_id` in browser local storage' + _API_BASE = 'https://ottapi.dangalplay.com' + _AUTH_TOKEN = 'jqeGWxRKK7FK5zEk3xCM' # from https://www.dangalplay.com/main.48ad19e24eb46acccef3.js + _SECRET_KEY = 'f53d31a4377e4ef31fa0' # same as above + + def _perform_login(self, username, password): + if self._OTV_USER_ID: + return + if username != 'token' or not re.fullmatch(r'[\da-f]{32}', password): + raise ExtractorError(self._LOGIN_HINT, expected=True) + self._OTV_USER_ID = password + + def _real_initialize(self): + if not self._OTV_USER_ID: + self.raise_login_required(f'Login required. 
{self._LOGIN_HINT}', method=None) + + def _extract_episode_info(self, metadata, episode_slug, series_slug): + return { + 'display_id': episode_slug, + 'episode_number': int_or_none(self._search_regex( + r'ep-(?:number-)?(\d+)', episode_slug, 'episode number', default=None)), + 'season_number': int_or_none(self._search_regex( + r'season-(\d+)', series_slug, 'season number', default='1')), + 'series': series_slug, + **traverse_obj(metadata, { + 'id': ('content_id', {str}), + 'title': ('display_title', {str}), + 'episode': ('title', {str}), + 'series': ('show_name', {str}, {lambda x: x or None}), + 'series_id': ('catalog_id', {str}), + 'duration': ('duration', {int_or_none}), + 'release_timestamp': ('release_date_uts', {int_or_none}), + }), + } + + def _call_api(self, path, display_id, note='Downloading JSON metadata', fatal=True, query={}): + return self._download_json( + f'{self._API_BASE}/{path}', display_id, note, fatal=fatal, + headers={'Accept': 'application/json'}, query={ + 'auth_token': self._AUTH_TOKEN, + 'region': 'IN', + **query, + }) + + +class DangalPlayIE(DangalPlayBaseIE): + IE_NAME = 'dangalplay' + _VALID_URL = r'https?://(?:www\.)?dangalplay.com/shows/(?P<series>[^/?#]+)/(?P<id>(?!episodes)[^/?#]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.dangalplay.com/shows/kitani-mohabbat-hai-season-2/kitani-mohabbat-hai-season-2-ep-number-01', + 'info_dict': { + 'id': '647c61dc1e7171310dcd49b4', + 'ext': 'mp4', + 'release_timestamp': 1262304000, + 'episode_number': 1, + 'episode': 'EP 1 | KITANI MOHABBAT HAI SEASON 2', + 'series': 'kitani-mohabbat-hai-season-2', + 'season_number': 2, + 'title': 'EP 1 | KITANI MOHABBAT HAI SEASON 2', + 'release_date': '20100101', + 'duration': 2325, + 'season': 'Season 2', + 'display_id': 'kitani-mohabbat-hai-season-2-ep-number-01', + 'series_id': '645c9ea41e717158ca574966', + }, + }, { + 'url': 'https://www.dangalplay.com/shows/milke-bhi-hum-na-mile/milke-bhi-hum-na-mile-ep-number-01', + 'info_dict': { + 'id': '65d31d9ba73b9c3abd14a7f3', + 'ext': 'mp4', + 'episode': 'EP 1 | MILKE BHI HUM NA MILE', + 'release_timestamp': 1708367411, + 'episode_number': 1, + 'season': 'Season 1', + 'title': 'EP 1 | MILKE BHI HUM NA MILE', + 'duration': 156048, + 'release_date': '20240219', + 'season_number': 1, + 'series': 'MILKE BHI HUM NA MILE', + 'series_id': '645c9ea41e717158ca574966', + 'display_id': 'milke-bhi-hum-na-mile-ep-number-01', + }, + }] + + def _generate_api_data(self, data): + catalog_id = data['catalog_id'] + content_id = data['content_id'] + timestamp = str(int(time.time())) + unhashed = ''.join((catalog_id, content_id, self._OTV_USER_ID, timestamp, self._SECRET_KEY)) + + return json.dumps({ + 'catalog_id': catalog_id, + 'content_id': content_id, + 'category': '', + 'region': 'IN', + 'auth_token': self._AUTH_TOKEN, + 'id': self._OTV_USER_ID, + 'md5': hashlib.md5(unhashed.encode()).hexdigest(), + 'ts': timestamp, + }, separators=(',', ':')).encode() + + def _real_extract(self, url): + series_slug, episode_slug = self._match_valid_url(url).group('series', 'id') + metadata = self._call_api( + f'catalogs/shows/{series_slug}/episodes/{episode_slug}.gzip', + episode_slug, query={'item_language': ''})['data'] + + try: + details = self._download_json( + f'{self._API_BASE}/v2/users/get_all_details.gzip', episode_slug, + 'Downloading playback details JSON', headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }, data=self._generate_api_data(metadata))['data'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and
e.cause.status == 422: + error_info = traverse_obj(e.cause.response.read().decode(), ({json.loads}, 'error', {dict})) or {} + if error_info.get('code') == '1016': + self.raise_login_required( + f'Your token has expired or is invalid. {self._LOGIN_HINT}', method=None) + elif msg := error_info.get('message'): + raise ExtractorError(msg) + raise + + m3u8_url = traverse_obj(details, ( + ('adaptive_url', ('adaptive_urls', 'hd', 'hls', ..., 'playback_url')), {url_or_none}, any)) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, episode_slug, 'mp4') + + return { + 'formats': formats, + 'subtitles': subtitles, + **self._extract_episode_info(metadata, episode_slug, series_slug), + } + + +class DangalPlaySeasonIE(DangalPlayBaseIE): + IE_NAME = 'dangalplay:season' + _VALID_URL = r'https?://(?:www\.)?dangalplay.com/shows/(?P<id>[^/?#]+)(?:/(?P<sub>ep-[^/?#]+)/episodes)?/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.dangalplay.com/shows/kitani-mohabbat-hai-season-1', + 'playlist_mincount': 170, + 'info_dict': { + 'id': 'kitani-mohabbat-hai-season-1', + }, + }, { + 'url': 'https://www.dangalplay.com/shows/kitani-mohabbat-hai-season-1/ep-01-30-1/episodes', + 'playlist_count': 30, + 'info_dict': { + 'id': 'kitani-mohabbat-hai-season-1-ep-01-30-1', + }, + }, { + # 1 season only, series page is season page + 'url': 'https://www.dangalplay.com/shows/milke-bhi-hum-na-mile', + 'playlist_mincount': 15, + 'info_dict': { + 'id': 'milke-bhi-hum-na-mile', + }, + }] + + def _entries(self, subcategories, series_slug): + for subcategory in subcategories: + data = self._call_api( + f'catalogs/shows/items/{series_slug}/subcategories/{subcategory}/episodes.gzip', + series_slug, f'Downloading episodes JSON for {subcategory}', fatal=False, query={ + 'order_by': 'asc', + 'status': 'published', + }) + for ep in traverse_obj(data, ('data', 'items', lambda _, v: v['friendly_id'])): + episode_slug = ep['friendly_id'] + yield self.url_result( + f'https://www.dangalplay.com/shows/{series_slug}/{episode_slug}', + DangalPlayIE, **self._extract_episode_info(ep, episode_slug, series_slug)) + + def _real_extract(self, url): + series_slug, subcategory = self._match_valid_url(url).group('id', 'sub') + subcategories = [subcategory] if subcategory else traverse_obj( + self._call_api( + f'catalogs/shows/items/{series_slug}.gzip', series_slug, + 'Downloading season info JSON', query={'item_language': ''}), + ('data', 'subcategories', ..., 'friendly_id', {str})) + + return self.playlist_result( + self._entries(subcategories, series_slug), join_nonempty(series_slug, subcategory)) diff --git a/yt_dlp/extractor/daum.py b/yt_dlp/extractor/daum.py index 3ef5140658..ee84449141 100644 --- a/yt_dlp/extractor/daum.py +++ b/yt_dlp/extractor/daum.py @@ -1,9 +1,7 @@ import itertools +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, -) from ..utils import parse_qs @@ -27,7 +25,7 @@ class DaumIE(DaumBaseIE): 'duration': 2117, 'view_count': int, 'comment_count': int, - 'uploader_id': 186139, + 'uploader_id': '186139', 'uploader': '콘간지', 'timestamp': 1387310323, }, @@ -44,7 +42,7 @@ class DaumIE(DaumBaseIE): 'view_count': int, 'comment_count': int, 'uploader': 'MBC 예능', - 'uploader_id': 132251, + 'uploader_id': '132251', 'timestamp': 1421604228, }, }, { @@ -63,7 +61,7 @@ class DaumIE(DaumBaseIE): 'view_count': int, 'comment_count': int, 'uploader': '까칠한 墮落始祖 황비홍님의', - 'uploader_id': 560824, + 'uploader_id': '560824', 'timestamp': 1203770745, }, }, { @@ -77,13 +75,13 @@ class
DaumIE(DaumBaseIE): 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'upload_date': '20170129', 'uploader': '쇼! 음악중심', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'timestamp': 1485684628, }, }] def _real_extract(self, url): - video_id = compat_urllib_parse_unquote(self._match_id(url)) + video_id = urllib.parse.unquote(self._match_id(url)) if not video_id.isdigit(): video_id += '@my' return self.url_result( @@ -107,7 +105,7 @@ class DaumClipIE(DaumBaseIE): 'duration': 3868, 'view_count': int, 'uploader': 'GOMeXP', - 'uploader_id': 6667, + 'uploader_id': '6667', 'timestamp': 1377911092, }, }, { @@ -117,7 +115,7 @@ class DaumClipIE(DaumBaseIE): @classmethod def suitable(cls, url): - return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super(DaumClipIE, cls).suitable(url) + return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super().suitable(url) def _real_extract(self, url): video_id = self._match_id(url) @@ -131,12 +129,12 @@ def _get_entries(self, list_id, list_id_type): entries = [] for pagenum in itertools.count(1): list_info = self._download_json( - 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page=%d&%s=%s' % ( - pagenum, list_id_type, list_id), list_id, 'Downloading list info - %s' % pagenum) + f'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page={pagenum}&{list_id_type}={list_id}', + list_id, f'Downloading list info - {pagenum}') entries.extend([ self.url_result( - 'http://tvpot.daum.net/v/%s' % clip['vid']) + 'http://tvpot.daum.net/v/{}'.format(clip['vid'])) for clip in list_info['clip_list'] ]) @@ -169,7 +167,7 @@ class DaumPlaylistIE(DaumListIE): 'id': '6213966', 'title': 'Woorissica Official', }, - 'playlist_mincount': 181 + 'playlist_mincount': 181, }, { 'note': 'Playlist url with clipid - noplaylist', 'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', @@ -182,12 +180,12 @@ class DaumPlaylistIE(DaumListIE): 'params': { 'noplaylist': True, 'skip_download': True, - } + }, }] @classmethod def suitable(cls, url): - return False if DaumUserIE.suitable(url) else super(DaumPlaylistIE, cls).suitable(url) + return False if DaumUserIE.suitable(url) else super().suitable(url) def _real_extract(self, url): list_id = self._match_id(url) @@ -211,7 +209,7 @@ class DaumUserIE(DaumListIE): 'id': 'o2scDLIVbHc0', 'title': '마이 리틀 텔레비전', }, - 'playlist_mincount': 213 + 'playlist_mincount': 213, }, { 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&clipid=73801156', 'info_dict': { @@ -219,12 +217,12 @@ class DaumUserIE(DaumListIE): 'ext': 'mp4', 'title': '[미공개] 김구라, 오만석이 부릅니다 \'오케피\' - 마이 리틀 텔레비전 20160116', 'upload_date': '20160117', - 'description': 'md5:5e91d2d6747f53575badd24bd62b9f36' + 'description': 'md5:5e91d2d6747f53575badd24bd62b9f36', }, 'params': { 'noplaylist': True, 'skip_download': True, - } + }, }, { 'note': 'Playlist url has ownerid and playlistid, playlistid takes precedence', 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&playlistid=6196631', @@ -232,7 +230,7 @@ class DaumUserIE(DaumListIE): 'id': '6196631', 'title': '마이 리틀 텔레비전 - 20160109', }, - 'playlist_count': 11 + 'playlist_count': 11, }, { 'url': 'http://tvpot.daum.net/mypot/Top.do?ownerid=o2scDLIVbHc0', 'only_matching': True, diff --git a/yt_dlp/extractor/dbtv.py b/yt_dlp/extractor/dbtv.py index 18be46f7e8..795fbacc41 100644 --- a/yt_dlp/extractor/dbtv.py +++ b/yt_dlp/extractor/dbtv.py @@ -18,7 
+18,7 @@ class DBTVIE(InfoExtractor): 'uploader_id': 'UCk5pvsyZJoYJBd7_oFPTlRQ', 'uploader': 'Dagbladet', }, - 'add_ie': ['Youtube'] + 'add_ie': ['Youtube'], }, { 'url': 'https://www.dagbladet.no/video/embed/xlGmyIeN9Jo/?autoplay=false', 'only_matching': True, diff --git a/yt_dlp/extractor/dctp.py b/yt_dlp/extractor/dctp.py index 24bb6aca25..09bdbf243a 100644 --- a/yt_dlp/extractor/dctp.py +++ b/yt_dlp/extractor/dctp.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( float_or_none, int_or_none, @@ -37,18 +36,18 @@ def _real_extract(self, url): display_id = self._match_id(url) version = self._download_json( - '%s/version.json' % self._BASE_URL, display_id, + f'{self._BASE_URL}/version.json', display_id, 'Downloading version JSON') - restapi_base = '%s/%s/restapi' % ( + restapi_base = '{}/{}/restapi'.format( self._BASE_URL, version['version_name']) info = self._download_json( - '%s/slugs/%s.json' % (restapi_base, display_id), display_id, + f'{restapi_base}/slugs/{display_id}.json', display_id, 'Downloading video info JSON') media = self._download_json( - '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])), + '{}/media/{}.json'.format(restapi_base, str(info['object_id'])), display_id, 'Downloading media JSON') uuid = media['uuid'] @@ -57,7 +56,7 @@ def _real_extract(self, url): formats = [] def add_formats(suffix): - templ = 'https://%%s/%s_dctp_%s.m4v' % (uuid, suffix) + templ = f'https://%s/{uuid}_dctp_{suffix}.m4v' formats.extend([{ 'format_id': 'hls-' + suffix, 'url': templ % 'cdn-segments.dctp.tv' + '/playlist.m3u8', diff --git a/yt_dlp/extractor/deezer.py b/yt_dlp/extractor/deezer.py index f61f12af02..2ca8be5ca0 100644 --- a/yt_dlp/extractor/deezer.py +++ b/yt_dlp/extractor/deezer.py @@ -22,7 +22,7 @@ def get_data(self, url): default=None) if geoblocking_msg is not None: raise ExtractorError( - 'Deezer said: %s' % geoblocking_msg, expected=True) + f'Deezer said: {geoblocking_msg}', expected=True) data_json = self._search_regex( (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*', @@ -67,7 +67,7 @@ def _real_extract(self, url): entries.append({ 'id': s.get('SNG_ID'), 'duration': int_or_none(s.get('DURATION')), - 'title': '%s - %s' % (artists, s.get('SNG_TITLE')), + 'title': '{} - {}'.format(artists, s.get('SNG_TITLE')), 'uploader': s.get('ART_NAME'), 'uploader_id': s.get('ART_ID'), 'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0, @@ -119,7 +119,7 @@ def _real_extract(self, url): entries.append({ 'id': s.get('SNG_ID'), 'duration': int_or_none(s.get('DURATION')), - 'title': '%s - %s' % (artists, s.get('SNG_TITLE')), + 'title': '{} - {}'.format(artists, s.get('SNG_TITLE')), 'uploader': s.get('ART_NAME'), 'uploader_id': s.get('ART_ID'), 'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0, diff --git a/yt_dlp/extractor/defense.py b/yt_dlp/extractor/defense.py deleted file mode 100644 index 7d73ea862e..0000000000 --- a/yt_dlp/extractor/defense.py +++ /dev/null @@ -1,37 +0,0 @@ -from .common import InfoExtractor - - -class DefenseGouvFrIE(InfoExtractor): - IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)' - - _TEST = { - 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', - 'md5': '75bba6124da7e63d2d60b5244ec9430c', - 'info_dict': { - 'id': '11213', - 'ext': 'mp4', - 'title': 'attaque-chimique-syrienne-du-21-aout-2013-1' - } - } - - def _real_extract(self, url):
- title = self._match_id(url) - webpage = self._download_webpage(url, title) - - video_id = self._search_regex( - r"flashvars.pvg_id=\"(\d+)\";", - webpage, 'ID') - - json_url = ( - 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' % - video_id) - info = self._download_json(json_url, title, 'Downloading JSON config') - video_url = info['renditions'][0]['url'] - - return { - 'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - } diff --git a/yt_dlp/extractor/democracynow.py b/yt_dlp/extractor/democracynow.py index 1624d085c1..80c56b4d45 100644 --- a/yt_dlp/extractor/democracynow.py +++ b/yt_dlp/extractor/democracynow.py @@ -1,11 +1,11 @@ -import re import os.path +import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( - url_basename, remove_start, + url_basename, ) @@ -52,7 +52,7 @@ def _real_extract(self, url): media_url = json_data.get(key, '') if not media_url: continue - media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) + media_url = re.sub(r'\?.*', '', urllib.parse.urljoin(url, media_url)) video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') formats.append({ 'url': media_url, @@ -70,13 +70,13 @@ def add_subtitle_item(lang, info_dict): # chapter_file are not subtitles if 'caption_file' in json_data: add_subtitle_item(default_lang, { - 'url': compat_urlparse.urljoin(url, json_data['caption_file']), + 'url': urllib.parse.urljoin(url, json_data['caption_file']), }) for subtitle_item in json_data.get('captions', []): lang = subtitle_item.get('language', '').lower() or default_lang add_subtitle_item(lang, { - 'url': compat_urlparse.urljoin(url, subtitle_item['url']), + 'url': urllib.parse.urljoin(url, subtitle_item['url']), }) description = self._og_search_description(webpage, default=None) diff --git a/yt_dlp/extractor/detik.py b/yt_dlp/extractor/detik.py index f148054246..5097759194 100644 --- a/yt_dlp/extractor/detik.py +++ b/yt_dlp/extractor/detik.py @@ -17,8 +17,8 @@ class DetikEmbedIE(InfoExtractor): 'tags': ['raja charles', ' raja charles iii', ' ratu elizabeth', ' ratu elizabeth meninggal dunia', ' raja inggris', ' inggris'], 'release_timestamp': 1662869995, 'release_date': '20220911', - 'uploader': 'REUTERS' - } + 'uploader': 'REUTERS', + }, }, { # 20.detik 'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', @@ -36,8 +36,8 @@ class DetikEmbedIE(InfoExtractor): 'release_timestamp': 1656926321, 'release_date': '20220704', 'age_limit': 0, - 'uploader': 'Ridwan Arifin ' # TODO: strip trailling whitespace at uploader - } + 'uploader': 'Ridwan Arifin ', # TODO: strip trailling whitespace at uploader + }, }, { # pasangmata.detik 'url': 'https://pasangmata.detik.com/contribution/366649', @@ -49,7 +49,7 @@ class DetikEmbedIE(InfoExtractor): 'age_limit': 0, 'tags': 'count:17', 'thumbnail': 'https://akcdn.detik.net.id/community/data/media/thumbs-pasangmata/2022/09/08/366649-16626229351533009620.mp4-03.jpg', - } + }, }, { # insertlive embed 'url': 'https://www.insertlive.com/embed/video/290482', @@ -64,7 +64,7 @@ class DetikEmbedIE(InfoExtractor): 'title': 'Diincar Leonardo DiCaprio, Gigi Hadid Ngaku Tertarik Tapi Belum Cinta', 'tags': ['leonardo dicaprio', ' gigi hadid', ' hollywood'], 'uploader': '!nsertlive', - } + }, }, { # beautynesia embed 'url': 'https://www.beautynesia.id/embed/video/261636', @@ -79,7 +79,7 @@ class DetikEmbedIE(InfoExtractor): 'tags': ['zodiac 
update', ' zodiak', ' ramalan bintang', ' zodiak beruntung 2022', ' zodiak hoki september 2022', ' zodiak beruntung september 2022'], 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/05/3-zodiak-paling-beruntung-selama-september-2022_169.jpeg?w=600&q=90', 'uploader': 'amh', - } + }, }, { # cnbcindonesia embed 'url': 'https://www.cnbcindonesia.com/embed/video/371839', @@ -91,7 +91,7 @@ class DetikEmbedIE(InfoExtractor): 'age_limit': 0, 'thumbnail': 'https://awsimages.detik.net.id/visual/2022/09/13/cnbc-indonesia-tv-3_169.png?w=600&q=80', 'description': 'md5:8b9111e37555fcd95fe549a9b4ae6fdc', - } + }, }, { # detik shortlink (we can get it from https://dtk.id/?) 'url': 'https://dtk.id/NkISKr', @@ -110,7 +110,7 @@ class DetikEmbedIE(InfoExtractor): 'timestamp': 1663139688, 'duration': 213.0, 'tags': ['hacker bjorka', 'bjorka', 'hacker bjorka bocorkan data rahasia presiden jokowi', 'jokowi'], - } + }, }] def _extract_from_webpage(self, url, webpage): @@ -142,7 +142,7 @@ def _extract_from_webpage(self, url, webpage): 'timestamp': int_or_none(self._html_search_meta('dtk:createdateunix', webpage, fatal=False, default=None), 1000), 'uploader': self._search_regex( r'([^-]+)', self._html_search_meta('dtk:author', webpage, default='').strip(), 'uploader', - default=None) + default=None), } formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) diff --git a/yt_dlp/extractor/deuxm.py b/yt_dlp/extractor/deuxm.py index 74a6da6c68..c8ce32ca90 100644 --- a/yt_dlp/extractor/deuxm.py +++ b/yt_dlp/extractor/deuxm.py @@ -12,8 +12,8 @@ class DeuxMIE(InfoExtractor): 'id': '6351d439b15e1a613b3debe8', 'ext': 'mp4', 'title': 'Grand Angle : Jeudi 20 Octobre 2022', - 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' - } + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$', + }, }, { 'url': 'https://2m.ma/fr/replay/single/635c0aeab4eec832622356da', 'md5': 'ad6af2f5e4d5b2ad2194a84b6e890b4c', @@ -21,8 +21,8 @@ class DeuxMIE(InfoExtractor): 'id': '635c0aeab4eec832622356da', 'ext': 'mp4', 'title': 'Journal Amazigh : Vendredi 28 Octobre 2022', - 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' - } + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$', + }, }] def _real_extract(self, url): @@ -49,8 +49,8 @@ class DeuxMNewsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Kan Ya Mkan d\u00e9poussi\u00e8re l\u2019histoire du phare du Cap Beddouza', 'description': 'md5:99dcf29b82f1d7f2a4acafed1d487527', - 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' - } + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$', + }, }, { 'url': 'https://2m.ma/fr/news/Interview-Casablanca-hors-des-sentiers-battus-avec-Abderrahim-KASSOU-Replay--20221017', 'md5': '7aca29f02230945ef635eb8290283c0c', @@ -59,8 +59,8 @@ class DeuxMNewsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Interview: Casablanca hors des sentiers battus avec Abderrahim KASSOU (Replay) ', 'description': 'md5:3b8e78111de9fcc6ef7f7dd6cff2430c', - 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' - } + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/dfb.py b/yt_dlp/extractor/dfb.py index c4fb5c2a42..b397ed9097 100644 --- a/yt_dlp/extractor/dfb.py +++ b/yt_dlp/extractor/dfb.py @@ -22,7 +22,7 @@ def 
_real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() player_info = self._download_xml( - 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, + f'http://tv.dfb.de/server/hd_video.php?play={video_id}', display_id) video_info = player_info.find('video') stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) @@ -46,7 +46,7 @@ def _real_extract(self, url): 'id': video_id, 'display_id': display_id, 'title': video_info.find('title').text, - 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, + 'thumbnail': f'http://tv.dfb.de/images/{video_id}_640x360.jpg', 'upload_date': unified_strdate(video_info.find('time_date').text), 'formats': formats, } diff --git a/yt_dlp/extractor/dhm.py b/yt_dlp/extractor/dhm.py index 3d42fc2b0c..a5f5f794cb 100644 --- a/yt_dlp/extractor/dhm.py +++ b/yt_dlp/extractor/dhm.py @@ -3,6 +3,7 @@ class DHMIE(InfoExtractor): + _WORKING = False IE_DESC = 'Filmarchiv - Deutsches Historisches Museum' _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)' diff --git a/yt_dlp/extractor/digg.py b/yt_dlp/extractor/digg.py deleted file mode 100644 index 86e8a6facb..0000000000 --- a/yt_dlp/extractor/digg.py +++ /dev/null @@ -1,54 +0,0 @@ -from .common import InfoExtractor -from ..utils import js_to_json - - -class DiggIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P<id>[^/?#&]+)' - _TESTS = [{ - # JWPlatform via provider - 'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out', - 'info_dict': { - 'id': 'LcqvmS0b', - 'ext': 'mp4', - 'title': "'Get Out' Star Daniel Kaluuya Goes On 'Moby Dick'-Like Journey In Sci-Fi Short 'Jonah'", - 'description': 'md5:541bb847648b6ee3d6514bc84b82efda', - 'upload_date': '20180109', - 'timestamp': 1515530551, - }, - 'params': { - 'skip_download': True, - }, - }, { - # Youtube via provider - 'url': 'http://digg.com/video/dog-boat-seal-play', - 'only_matching': True, - }, { - # vimeo as regular embed - 'url': 'http://digg.com/video/dream-girl-short-film', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - info = self._parse_json( - self._search_regex( - r'(?s)video_info\s*=\s*({.+?});\n', webpage, 'video info', - default='{}'), display_id, transform_source=js_to_json, - fatal=False) - - video_id = info.get('video_id') - - if video_id: - provider = info.get('provider_name') - if provider == 'youtube': - return self.url_result( - video_id, ie='Youtube', video_id=video_id) - elif provider == 'jwplayer': - return self.url_result( - 'jwplatform:%s' % video_id, ie='JWPlatform', - video_id=video_id) - - return self.url_result(url, 'Generic') diff --git a/yt_dlp/extractor/digitalconcerthall.py b/yt_dlp/extractor/digitalconcerthall.py index 3461e36eb6..edb6fa9c08 100644 --- a/yt_dlp/extractor/digitalconcerthall.py +++ b/yt_dlp/extractor/digitalconcerthall.py @@ -1,18 +1,20 @@ from .common import InfoExtractor - +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, - parse_resolution, - traverse_obj, + parse_codecs, try_get, + url_or_none, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' - _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert|work)/(?P<id>[0-9]+)-?(?P<part>[0-9]+)?'
_OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' + _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15' _ACCESS_TOKEN = None _NETRC_MACHINE = 'digitalconcerthall' _TESTS = [{ @@ -27,7 +29,8 @@ class DigitalConcertHallIE(InfoExtractor): 'upload_date': '20210624', 'timestamp': 1624548600, 'duration': 2798, - 'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler', + 'album_artists': ['Members of the Berliner Philharmoniker', 'Simon Rössler'], + 'composers': ['Kurt Weill'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -35,65 +38,102 @@ class DigitalConcertHallIE(InfoExtractor): 'url': 'https://www.digitalconcerthall.com/en/concert/53785', 'info_dict': { 'id': '53785', - 'album_artist': 'Berliner Philharmoniker / Kirill Petrenko', + 'album_artists': ['Berliner Philharmoniker', 'Kirill Petrenko'], 'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', }, 'params': {'skip_download': 'm3u8'}, 'playlist_count': 3, + }, { + 'url': 'https://www.digitalconcerthall.com/en/film/388', + 'info_dict': { + 'id': '388', + 'ext': 'mp4', + 'title': 'The Berliner Philharmoniker and Frank Peter Zimmermann', + 'description': 'md5:cfe25a7044fa4be13743e5089b5b5eb2', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20220714', + 'timestamp': 1657785600, + 'album_artists': ['Frank Peter Zimmermann', 'Benedikt von Bernstorff', 'Jakob von Bernstorff'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Concert with several works and an interview', + 'url': 'https://www.digitalconcerthall.com/en/work/53785-1', + 'info_dict': { + 'id': '53785', + 'album_artists': ['Berliner Philharmoniker', 'Kirill Petrenko'], + 'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + 'playlist_count': 1, }] def _perform_login(self, username, password): - token_response = self._download_json( + login_token = self._download_json( self._OAUTH_URL, None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({ 'affiliate': 'none', 'grant_type': 'device', 'device_vendor': 'unknown', + # device_model 'Safari' gets split streams of 4K/HEVC video and lossless/FLAC audio + 'device_model': 'unknown' if self._configuration_arg('prefer_combined_hls') else 'Safari', 'app_id': 'dch.webapp', - 'app_version': '1.0.0', + 'app_distributor': 'berlinphil', + 'app_version': '1.84.0', 'client_secret': '2ySLN+2Fwb', }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - self._ACCESS_TOKEN = token_response['access_token'] + 'Accept': 'application/json', + 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', + 'User-Agent': self._USER_AGENT, + })['access_token'] try: - self._download_json( + login_response = self._download_json( self._OAUTH_URL, None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({ 'grant_type': 'password', 'username': username, 'password': password, }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', + 'Accept': 'application/json', + 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 'Referer': 'https://www.digitalconcerthall.com', - 'Authorization': f'Bearer {self._ACCESS_TOKEN}' + 'Authorization': f'Bearer 
{login_token}', + 'User-Agent': self._USER_AGENT, }) - except ExtractorError: - self.raise_login_required(msg='Login info incorrect') + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 401: + raise ExtractorError('Invalid username or password', expected=True) + raise + self._ACCESS_TOKEN = login_response['access_token'] def _real_initialize(self): if not self._ACCESS_TOKEN: self.raise_login_required(method='password') - def _entries(self, items, language, **kwargs): + def _entries(self, items, language, type_, **kwargs): for item in items: video_id = item['id'] stream_info = self._download_json( self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={ 'Accept': 'application/json', 'Authorization': f'Bearer {self._ACCESS_TOKEN}', - 'Accept-Language': language + 'Accept-Language': language, + 'User-Agent': self._USER_AGENT, }) - m3u8_url = traverse_obj( - stream_info, ('channel', lambda k, _: k.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) + formats = [] + for m3u8_url in traverse_obj(stream_info, ('channel', ..., 'stream', ..., 'url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for fmt in formats: + if fmt.get('format_note') and fmt.get('vcodec') == 'none': + fmt.update(parse_codecs(fmt['format_note'])) yield { 'id': video_id, 'title': item.get('title'), 'composer': item.get('name_composer'), - 'url': m3u8_url, 'formats': formats, 'duration': item.get('duration_total'), 'timestamp': traverse_obj(item, ('date', 'published')), @@ -103,35 +143,38 @@ def _entries(self, items, language, **kwargs): 'start_time': chapter.get('time'), 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), 'title': chapter.get('text'), - } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } for chapter in item['cuepoints']] if item.get('cuepoints') and type_ == 'concert' else None, } def _real_extract(self, url): - language, video_id = self._match_valid_url(url).group('language', 'id') + language, type_, video_id, part = self._match_valid_url(url).group('language', 'type', 'id', 'part') if not language: language = 'en' - thumbnail_url = self._html_search_regex( - r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)', - self._download_webpage(url, video_id), 'thumbnail') - thumbnails = [{ - 'url': thumbnail_url, - **parse_resolution(thumbnail_url) - }] - + api_type = 'concert' if type_ == 'work' else type_ vid_info = self._download_json( - f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + f'https://api.digitalconcerthall.com/v2/{api_type}/{video_id}', video_id, headers={ 'Accept': 'application/json', - 'Accept-Language': language + 'Accept-Language': language, + 'User-Agent': self._USER_AGENT, + 'Authorization': f'Bearer {self._ACCESS_TOKEN}', }) - album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, ('_embedded', ..., ...)) + + if type_ == 'work': + videos = [videos[int(part) - 1]] + + album_artists = traverse_obj(vid_info, ('_links', 'artist', ..., 'name', {str})) + thumbnail = traverse_obj(vid_info, ( + 'image', ..., {self._proto_relative_url}, {url_or_none}, + {lambda x: x.format(width=0, height=0)}, any)) # NB: 0x0 is the original size return { '_type': 'playlist', 'id': 
video_id, 'title': vid_info.get('title'), - 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, - thumbnails=thumbnails, album_artist=album_artist), - 'thumbnails': thumbnails, - 'album_artist': album_artist, + 'entries': self._entries( + videos, language, type_, thumbnail=thumbnail, album_artists=album_artists), + 'thumbnail': thumbnail, + 'album_artists': album_artists, } diff --git a/yt_dlp/extractor/digiteka.py b/yt_dlp/extractor/digiteka.py index 912e33ba7b..e56ec63e86 100644 --- a/yt_dlp/extractor/digiteka.py +++ b/yt_dlp/extractor/digiteka.py @@ -65,7 +65,7 @@ def _real_extract(self, url): video_type = 'musique' deliver_info = self._download_json( - 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type), + f'http://www.ultimedia.com/deliver/video?video={video_id}&topic={video_type}', video_id) yt_id = deliver_info.get('yt_id') diff --git a/yt_dlp/extractor/discogs.py b/yt_dlp/extractor/discogs.py new file mode 100644 index 0000000000..048c62288c --- /dev/null +++ b/yt_dlp/extractor/discogs.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class DiscogsReleasePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discogs\.com/(?Prelease|master)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.discogs.com/release/1-The-Persuader-Stockholm', + 'info_dict': { + 'id': 'release1', + 'title': 'Stockholm', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.discogs.com/master/113-Vince-Watson-Moments-In-Time', + 'info_dict': { + 'id': 'master113', + 'title': 'Moments In Time', + }, + 'playlist_mincount': 53, + }] + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + display_id = f'{playlist_type}{playlist_id}' + response = self._download_json( + f'https://api.discogs.com/{playlist_type}s/{playlist_id}', display_id) + + entries = [ + self.url_result(video['uri'], YoutubeIE, video_title=video.get('title')) + for video in traverse_obj(response, ('videos', lambda _, v: YoutubeIE.suitable(v['uri'])))] + + return self.playlist_result(entries, display_id, response.get('title')) diff --git a/yt_dlp/extractor/discovery.py b/yt_dlp/extractor/discovery.py deleted file mode 100644 index e6e109d5c5..0000000000 --- a/yt_dlp/extractor/discovery.py +++ /dev/null @@ -1,115 +0,0 @@ -import random -import string - -from .discoverygo import DiscoveryGoBaseIE -from ..compat import compat_urllib_parse_unquote -from ..utils import ExtractorError -from ..compat import compat_HTTPError - - -class DiscoveryIE(DiscoveryGoBaseIE): - _VALID_URL = r'''(?x)https?:// - (?P - go\.discovery| - www\. - (?: - investigationdiscovery| - discoverylife| - animalplanet| - ahctv| - destinationamerica| - sciencechannel| - tlc - )| - watch\. 
- (?: - hgtv| - foodnetwork| - travelchannel| - diynetwork| - cookingchanneltv| - motortrend - ) - )\.com/tv-shows/(?P[^/]+)/(?:video|full-episode)s/(?P[^./?#]+)''' - _TESTS = [{ - 'url': 'https://go.discovery.com/tv-shows/cash-cab/videos/riding-with-matthew-perry', - 'info_dict': { - 'id': '5a2f35ce6b66d17a5026e29e', - 'ext': 'mp4', - 'title': 'Riding with Matthew Perry', - 'description': 'md5:a34333153e79bc4526019a5129e7f878', - 'duration': 84, - }, - 'params': { - 'skip_download': True, # requires ffmpeg - } - }, { - 'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision', - 'only_matching': True, - }, { - 'url': 'https://go.discovery.com/tv-shows/alaskan-bush-people/videos/follow-your-own-road', - 'only_matching': True, - }, { - # using `show_slug` is important to get the correct video data - 'url': 'https://www.sciencechannel.com/tv-shows/mythbusters-on-science/full-episodes/christmas-special', - 'only_matching': True, - }] - _GEO_COUNTRIES = ['US'] - _GEO_BYPASS = False - _API_BASE_URL = 'https://api.discovery.com/v1/' - - def _real_extract(self, url): - site, show_slug, display_id = self._match_valid_url(url).groups() - - access_token = None - cookies = self._get_cookies(url) - - # prefer Affiliate Auth Token over Anonymous Auth Token - auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn') - if auth_storage_cookie and auth_storage_cookie.value: - auth_storage = self._parse_json(compat_urllib_parse_unquote( - compat_urllib_parse_unquote(auth_storage_cookie.value)), - display_id, fatal=False) or {} - access_token = auth_storage.get('a') or auth_storage.get('access_token') - - if not access_token: - access_token = self._download_json( - 'https://%s.com/anonymous' % site, display_id, - 'Downloading token JSON metadata', query={ - 'authRel': 'authorization', - 'client_id': '3020a40c2356a645b4b4', - 'nonce': ''.join(random.choices(string.ascii_letters, k=32)), - 'redirectUri': 'https://www.discovery.com/', - })['access_token'] - - headers = self.geo_verification_headers() - headers['Authorization'] = 'Bearer ' + access_token - - try: - video = self._download_json( - self._API_BASE_URL + 'content/videos', - display_id, 'Downloading content JSON metadata', - headers=headers, query={ - 'embed': 'show.name', - 'fields': 'authenticated,description.detailed,duration,episodeNumber,id,name,parental.rating,season.number,show,tags', - 'slug': display_id, - 'show_slug': show_slug, - })[0] - video_id = video['id'] - stream = self._download_json( - self._API_BASE_URL + 'streaming/video/' + video_id, - display_id, 'Downloading streaming JSON metadata', headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): - e_description = self._parse_json( - e.cause.read().decode(), display_id)['description'] - if 'resource not available for country' in e_description: - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - if 'Authorized Networks' in e_description: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. 
You may want to use --cookies.', expected=True) - raise ExtractorError(e_description) - raise - - return self._extract_video_info(video, stream, display_id) diff --git a/yt_dlp/extractor/discoverygo.py b/yt_dlp/extractor/discoverygo.py deleted file mode 100644 index 1f3d8e31c5..0000000000 --- a/yt_dlp/extractor/discoverygo.py +++ /dev/null @@ -1,172 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - ExtractorError, - int_or_none, - parse_age_limit, - remove_end, - unescapeHTML, - url_or_none, -) - - -class DiscoveryGoBaseIE(InfoExtractor): - _VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?: - discovery| - investigationdiscovery| - discoverylife| - animalplanet| - ahctv| - destinationamerica| - sciencechannel| - tlc| - velocitychannel - )go\.com/%s(?P[^/?#&]+)''' - - def _extract_video_info(self, video, stream, display_id): - title = video['name'] - - if not stream: - if video.get('authenticated') is True: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.', expected=True) - else: - raise ExtractorError('Unable to find stream') - STREAM_URL_SUFFIX = 'streamUrl' - formats = [] - for stream_kind in ('', 'hds'): - suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX - stream_url = stream.get('%s%s' % (stream_kind, suffix)) - if not stream_url: - continue - if stream_kind == '': - formats.extend(self._extract_m3u8_formats( - stream_url, display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif stream_kind == 'hds': - formats.extend(self._extract_f4m_formats( - stream_url, display_id, f4m_id=stream_kind, fatal=False)) - - video_id = video.get('id') or display_id - description = video.get('description', {}).get('detailed') - duration = int_or_none(video.get('duration')) - - series = video.get('show', {}).get('name') - season_number = int_or_none(video.get('season', {}).get('number')) - episode_number = int_or_none(video.get('episodeNumber')) - - tags = video.get('tags') - age_limit = parse_age_limit(video.get('parental', {}).get('rating')) - - subtitles = {} - captions = stream.get('captions') - if isinstance(captions, list): - for caption in captions: - subtitle_url = url_or_none(caption.get('fileUrl')) - if not subtitle_url or not subtitle_url.startswith('http'): - continue - lang = caption.get('fileLang', 'en') - ext = determine_ext(subtitle_url) - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - 'ext': 'ttml' if ext == 'xml' else ext, - }) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'duration': duration, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'tags': tags, - 'age_limit': age_limit, - 'formats': formats, - 'subtitles': subtitles, - } - - -class DiscoveryGoIE(DiscoveryGoBaseIE): - _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' - _GEO_COUNTRIES = ['US'] - _TEST = { - 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/', - 'info_dict': { - 'id': '58c167d86b66d12f2addeb01', - 'ext': 'mp4', - 'title': 'Reaper Madness', - 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', - 'duration': 2519, - 'series': 'Bering Sea Gold', - 'season_number': 8, - 'episode_number': 6, - 'age_limit': 14, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = 
self._download_webpage(url, display_id) - - container = extract_attributes( - self._search_regex( - r'(]+class=["\']video-player-container[^>]+>)', - webpage, 'video container')) - - video = self._parse_json( - container.get('data-video') or container.get('data-json'), - display_id) - - stream = video.get('stream') - - return self._extract_video_info(video, stream, display_id) - - -class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE): - _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % '' - _TEST = { - 'url': 'https://www.discoverygo.com/bering-sea-gold/', - 'info_dict': { - 'id': 'bering-sea-gold', - 'title': 'Bering Sea Gold', - 'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e', - }, - 'playlist_mincount': 6, - } - - @classmethod - def suitable(cls, url): - return False if DiscoveryGoIE.suitable(url) else super( - DiscoveryGoPlaylistIE, cls).suitable(url) - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - entries = [] - for mobj in re.finditer(r'data-json=(["\'])(?P{.+?})\1', webpage): - data = self._parse_json( - mobj.group('json'), display_id, - transform_source=unescapeHTML, fatal=False) - if not isinstance(data, dict) or data.get('type') != 'episode': - continue - episode_url = data.get('socialUrl') - if not episode_url: - continue - entries.append(self.url_result( - episode_url, ie=DiscoveryGoIE.ie_key(), - video_id=data.get('id'))) - - return self.playlist_result( - entries, display_id, - remove_end(self._og_search_title( - webpage, fatal=False), ' | Discovery GO'), - self._og_search_description(webpage)) diff --git a/yt_dlp/extractor/disney.py b/yt_dlp/extractor/disney.py index 430de326f4..a90f12389e 100644 --- a/yt_dlp/extractor/disney.py +++ b/yt_dlp/extractor/disney.py @@ -2,10 +2,10 @@ from .common import InfoExtractor from ..utils import ( - int_or_none, - unified_strdate, determine_ext, + int_or_none, join_nonempty, + unified_strdate, update_url_query, ) @@ -26,7 +26,7 @@ class DisneyIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, }, { # Grill.burger 'url': 'http://www.starwars.com/video/rogue-one-a-star-wars-story-intro-featurette', @@ -40,7 +40,7 @@ class DisneyIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, }, { 'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2', 'only_matching': True, @@ -84,7 +84,7 @@ def _real_extract(self, url): video_data = page_data['data'][0] else: webpage = self._download_webpage( - 'http://%s/embed/%s' % (domain, video_id), video_id) + f'http://{domain}/embed/{video_id}', video_id) page_data = self._parse_json(self._search_regex( r'Disney\.EmbedVideo\s*=\s*({.+});', webpage, 'embed data'), video_id) @@ -132,7 +132,7 @@ def _real_extract(self, url): }) if not formats and video_data.get('expired'): self.raise_no_formats( - '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), + '{} said: {}'.format(self.IE_NAME, page_data['translations']['video_expired']), expected=True) subtitles = {} diff --git a/yt_dlp/extractor/dispeak.py b/yt_dlp/extractor/dispeak.py index 37f89b9bc0..89c27e0b55 100644 --- a/yt_dlp/extractor/dispeak.py +++ b/yt_dlp/extractor/dispeak.py @@ -55,7 +55,7 @@ def _parse_mp4(self, metadata): if video_root is None: http_host = xpath_text(metadata, 'httpHost', default=None) if http_host: - video_root = 'http://%s/' % http_host + video_root = f'http://{http_host}/' if video_root is None: # Hard-coded in 
http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js # Works for GPUTechConf, too @@ -86,7 +86,7 @@ def _parse_flv(self, metadata): audios = metadata.findall('./audios/audio') for audio in audios: formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'url': f'rtmp://{akamai_url}/ondemand?ovpfv=1.1', 'play_path': remove_end(audio.get('url'), '.flv'), 'ext': 'flv', 'vcodec': 'none', @@ -95,14 +95,14 @@ def _parse_flv(self, metadata): }) for video_key, format_id, preference in ( ('slide', 'slides', -2), ('speaker', 'speaker', -1)): - video_path = xpath_text(metadata, './%sVideo' % video_key) + video_path = xpath_text(metadata, f'./{video_key}Video') if not video_path: continue formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'url': f'rtmp://{akamai_url}/ondemand?ovpfv=1.1', 'play_path': remove_end(video_path, '.flv'), 'ext': 'flv', - 'format_note': '%s video' % video_key, + 'format_note': f'{video_key} video', 'quality': preference, 'format_id': format_id, }) diff --git a/yt_dlp/extractor/dlf.py b/yt_dlp/extractor/dlf.py new file mode 100644 index 0000000000..eac2190139 --- /dev/null +++ b/yt_dlp/extractor/dlf.py @@ -0,0 +1,192 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + traverse_obj, + url_or_none, +) + + +class DLFBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/' + _BUTTON_REGEX = r'(]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)' + + def _parse_button_attrs(self, button, audio_id=None): + attrs = extract_attributes(button) + audio_id = audio_id or attrs['data-audio-diraid'] + + url = traverse_obj( + attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference', + 'data-audio-src', expected_type=url_or_none) + ext = determine_ext(url) + + return { + 'id': audio_id, + 'extractor_key': DLFIE.ie_key(), + 'extractor': DLFIE.IE_NAME, + **traverse_obj(attrs, { + 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}), + 'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}), + 'thumbnail': ('data-audioimage', {url_or_none}), + 'uploader': 'data-audio-producer', + 'series': 'data-audio-series', + 'channel': 'data-audio-origin-site-name', + 'webpage_url': ('data-audio-download-tracking-path', {url_or_none}), + }, get_all=False), + 'formats': (self._extract_m3u8_formats(url, audio_id, fatal=False) + if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}]), + } + + +class DLFIE(DLFBaseIE): + IE_NAME = 'dlf' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P[\da-f]{8})-100\.html' + _TESTS = [ + # Audio as an HLS stream + { + 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html', + 'info_dict': { + 'id': '03a3eb19', + 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien', + 'ext': 'm4a', + 'duration': 3298, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'On Stage', + 'channel': 'deutschlandfunk', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'skip': 'This webpage no longer exists', + }, { + 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html', + 'info_dict': { + 'id': 'd9cc1856', + 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher 
Türöffner', + 'ext': 'mp3', + 'duration': 291, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'Kommentare und Themen der Woche', + 'channel': 'deutschlandfunk', + }, + }, + ] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + + return self._parse_button_attrs( + self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id) + + +class DLFCorpusIE(DLFBaseIE): + IE_NAME = 'dlf:corpus' + IE_DESC = 'DLF Multi-feed Archives' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html' + _TESTS = [ + # Recorded news broadcast with referrals to related broadcasts + { + 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html', + 'info_dict': { + 'id': 'fechten-russland-belarus-ukraine-protest-100', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad', + }, + 'playlist_mincount': 5, + 'playlist': [{ + 'info_dict': { + 'id': '1fc5d64a', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'ext': 'mp3', + 'duration': 252, + 'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport', + 'channel': 'deutschlandfunk', + }, + }, { + 'info_dict': { + 'id': '2ada145f', + 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten', + 'ext': 'mp3', + 'duration': 336, + 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005', + 'uploader': 'Deutschlandfunk', + 'series': 'Deutschlandfunk Nova', + 'channel': 'deutschlandfunk-nova', + }, + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk', + }, + }, { + 'info_dict': { + 'id': '47e1a096', + 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"', + 'ext': 'mp3', + 'duration': 602, + 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk', + }, + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk', + }, + }], + }, + # Podcast feed with tag buttons, playlist count fluctuates + { + 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html', + 'info_dict': { + 'id': 'kommentare-und-themen-der-woche-100', + 'title': 'Meinung - Kommentare und Themen der Woche', + 'description': 
'md5:2901bbd65cd2d45e116d399a099ce5d5', + }, + 'playlist_mincount': 10, + }, + # Podcast feed with no description + { + 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html', + 'info_dict': { + 'id': 'podcast-tolle-idee-100', + 'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?', + }, + 'playlist_mincount': 11, + }, + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage, default=None), + 'title': self._html_search_meta( + ['og:title', 'twitter:title'], webpage, default=None), + 'entries': map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)), + } diff --git a/yt_dlp/extractor/dlive.py b/yt_dlp/extractor/dlive.py index 30fcf9fcef..157d06c403 100644 --- a/yt_dlp/extractor/dlive.py +++ b/yt_dlp/extractor/dlive.py @@ -16,7 +16,7 @@ class DLiveVODIE(InfoExtractor): 'upload_date': '20190701', 'timestamp': 1562011015, 'uploader_id': 'pdp', - } + }, }, { 'url': 'https://dlive.tv/p/pdpreplay+D-RD-xSZg', 'only_matching': True, @@ -36,7 +36,7 @@ def _real_extract(self, url): thumbnailUrl viewCount } -}''' % (uploader_id, vod_id)}).encode())['data']['pastBroadcast'] +}''' % (uploader_id, vod_id)}).encode())['data']['pastBroadcast'] # noqa: UP031 title = broadcast['title'] formats = self._extract_m3u8_formats( broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') @@ -71,12 +71,12 @@ def _real_extract(self, url): } username } -}''' % display_name}).encode())['data']['userByDisplayName'] +}''' % display_name}).encode())['data']['userByDisplayName'] # noqa: UP031 livestream = user['livestream'] title = livestream['title'] username = user['username'] formats = self._extract_m3u8_formats( - 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, + f'https://live.prd.dlive.tv/hls/live/{username}.m3u8', display_name, 'mp4') return { 'id': display_name, diff --git a/yt_dlp/extractor/dotsub.py b/yt_dlp/extractor/dotsub.py deleted file mode 100644 index 079f837500..0000000000 --- a/yt_dlp/extractor/dotsub.py +++ /dev/null @@ -1,81 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, -) - - -class DotsubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P[^/]+)' - _TESTS = [{ - 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09', - 'md5': '21c7ff600f545358134fea762a6d42b6', - 'info_dict': { - 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09', - 'ext': 'flv', - 'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever', - 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6', - 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p', - 'duration': 198, - 'uploader': 'liuxt', - 'timestamp': 1385778501.104, - 'upload_date': '20131130', - 'view_count': int, - } - }, { - 'url': 'https://dotsub.com/view/747bcf58-bd59-45b7-8c8c-ac312d084ee6', - 'md5': '2bb4a83896434d5c26be868c609429a3', - 'info_dict': { - 'id': '168006778', - 'ext': 'mp4', - 'title': 'Apartments and flats in Raipur the white symphony', - 'description': 'md5:784d0639e6b7d1bc29530878508e38fe', - 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', - 'duration': 290, - 'timestamp': 1476767794.2809999, - 'upload_date': '20161018', - 'uploader': 'parthivi001', - 'uploader_id': 'user52596202', - 'view_count': int, - }, - 
'add_ie': ['Vimeo'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - info = self._download_json( - 'https://dotsub.com/api/media/%s/metadata' % video_id, video_id) - video_url = info.get('mediaURI') - - if not video_url: - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - [r']+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'], - webpage, 'video url', default=None) - info_dict = { - 'id': video_id, - 'url': video_url, - 'ext': 'flv', - } - - if not video_url: - setup_data = self._parse_json(self._html_search_regex( - r'(?s)data-setup=([\'"])(?P(?!\1).+?)\1', - webpage, 'setup data', group='content'), video_id) - info_dict = { - '_type': 'url_transparent', - 'url': setup_data['src'], - } - - info_dict.update({ - 'title': info['title'], - 'description': info.get('description'), - 'thumbnail': info.get('screenshotURI'), - 'duration': int_or_none(info.get('duration'), 1000), - 'uploader': info.get('user'), - 'timestamp': float_or_none(info.get('dateCreated'), 1000), - 'view_count': int_or_none(info.get('numberOfViews')), - }) - - return info_dict diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index fa40844df5..e36eac9193 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -1,31 +1,74 @@ -import time import hashlib -import re +import time import urllib +import uuid from .common import InfoExtractor +from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, + UserNotLive, + determine_ext, + int_or_none, + js_to_json, + parse_resolution, + str_or_none, + traverse_obj, unescapeHTML, - unified_strdate, + url_or_none, + urlencode_postdata, urljoin, ) -class DouyuTVIE(InfoExtractor): - IE_DESC = '斗鱼' +class DouyuBaseIE(InfoExtractor): + def _download_cryptojs_md5(self, video_id): + for url in [ + # XXX: Do NOT use cdn.bootcdn.net; ref: https://sansec.io/research/polyfill-supply-chain-attack + 'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js', + 'https://unpkg.com/cryptojslib@3.1.2/rollups/md5.js', + ]: + js_code = self._download_webpage( + url, video_id, note='Downloading signing dependency', fatal=False) + if js_code: + self.cache.store('douyu', 'crypto-js-md5', js_code) + return js_code + raise ExtractorError('Unable to download JS dependency (crypto-js/md5)') + + def _get_cryptojs_md5(self, video_id): + return self.cache.load( + 'douyu', 'crypto-js-md5', min_ver='2024.07.04') or self._download_cryptojs_md5(video_id) + + def _calc_sign(self, sign_func, video_id, a): + b = uuid.uuid4().hex + c = round(time.time()) + js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))' + phantom = PhantomJSwrapper(self) + result = phantom.execute(js_script, video_id, + note='Executing JS signing script').strip() + return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()} + + def _search_js_sign_func(self, webpage, fatal=True): + # The greedy look-behind ensures last possible script tag is matched + return self._search_regex( + r'(?:]*>(.*?ub98484234.*?)', webpage, 'JS sign func', fatal=fatal) + + +class DouyuTVIE(DouyuBaseIE): + IE_DESC = '斗鱼直播' _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P[A-Za-z0-9]+)' _TESTS = [{ - 'url': 'http://www.douyutv.com/iseven', + 'url': 'https://www.douyu.com/pigff', 'info_dict': { - 'id': '17732', - 'display_id': 'iseven', - 'ext': 'flv', - 'title': 're:^清晨醒脑!根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': r're:.*m7show@163\.com.*', - 'thumbnail': r're:^https?://.*\.png', - 'uploader': '7师傅', + 'id': '24422', + 'display_id': 'pigff', + 'ext': 'mp4', + 'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群', + 'thumbnail': str, + 'uploader': 'pigff', 'is_live': True, + 'live_status': 'is_live', }, 'params': { 'skip_download': True, @@ -85,15 +128,43 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] + def _get_sign_func(self, room_id, video_id): + return self._download_json( + f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id, + note='Getting signing script')['data'][f'room{room_id}'] + + def _extract_stream_formats(self, stream_formats): + formats = [] + for stream_info in traverse_obj(stream_formats, (..., 'data')): + stream_url = urljoin( + traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live')) + if stream_url: + rate_id = traverse_obj(stream_info, ('rate', {int_or_none})) + rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False) + ext = determine_ext(stream_url) + formats.append({ + 'url': stream_url, + 'format_id': str_or_none(rate_id), + 'ext': 'mp4' if ext == 'm3u8' else ext, + 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', + 'quality': rate_id % -10000 if rate_id is not None else None, + **traverse_obj(rate_info, { + 'format': ('name', {str_or_none}), + 'tbr': ('bit', {int_or_none}), + }), + }) + return formats + def _real_extract(self, url): video_id = self._match_id(url) - if video_id.isdigit(): - room_id = video_id - else: - page = self._download_webpage(url, video_id) - room_id = self._html_search_regex( - r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + webpage = self._download_webpage(url, video_id) + room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id') + + if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1': + raise UserNotLive('The channel is auto-playing VODs', video_id=video_id) + if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2': + raise UserNotLive(video_id=video_id) # Grab metadata from API params = { @@ -102,110 +173,136 @@ def _real_extract(self, url): 'time': int(time.time()), } params['auth'] = hashlib.md5( - f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() - room = self._download_json( + f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() + room = traverse_obj(self._download_json( f'http://www.douyutv.com/api/v1/room/{room_id}', video_id, - note='Downloading room info', query=params)['data'] + note='Downloading room info', query=params, fatal=False), 'data') # 1 = live, 2 = offline - if room.get('show_status') == '2': - raise ExtractorError('Live stream is offline', expected=True) + if traverse_obj(room, 'show_status') == '2': + raise UserNotLive(video_id=video_id) - video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL')) - formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id) + js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id) + form_data = { + 'rate': 0, + **self._calc_sign(js_sign_func, video_id, room_id), + } + stream_formats = [self._download_json( + f'https://www.douyu.com/lapi/live/getH5Play/{room_id}', + 
video_id, note='Downloading livestream format', + data=urlencode_postdata(form_data))] - title = unescapeHTML(room['room_name']) - description = room.get('show_details') - thumbnail = room.get('room_src') - uploader = room.get('nickname') + for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')): + if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')): + form_data['rate'] = rate_id + stream_formats.append(self._download_json( + f'https://www.douyu.com/lapi/live/getH5Play/{room_id}', + video_id, note=f'Downloading livestream format {rate_id}', + data=urlencode_postdata(form_data))) return { 'id': room_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, + 'formats': self._extract_stream_formats(stream_formats), 'is_live': True, - 'subtitles': subs, - 'formats': formats, + **traverse_obj(room, { + 'display_id': ('url', {str}, {lambda i: i[1:]}), + 'title': ('room_name', {unescapeHTML}), + 'description': ('show_details', {str}), + 'uploader': ('nickname', {str}), + 'thumbnail': ('room_src', {url_or_none}), + }), } -class DouyuShowIE(InfoExtractor): +class DouyuShowIE(DouyuBaseIE): _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P[0-9a-zA-Z]+)' _TESTS = [{ - 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', - 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY', 'info_dict': { - 'id': 'rjNBdvnVXNzvE2yw', + 'id': 'mPyq7oVNe5Yv1gLY', 'ext': 'mp4', - 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', - 'duration': 7150.08, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '陈一发儿', - 'uploader_id': 'XrZwYelr5wbK', - 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', - 'upload_date': '20170402', + 'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃', + 'duration': 633, + 'thumbnail': str, + 'uploader': '美食作家王刚V', + 'uploader_id': 'OVAO4NVx1m7Q', + 'timestamp': 1661850002, + 'upload_date': '20220830', + 'view_count': int, + 'tags': ['美食', '美食综合'], }, }, { 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', 'only_matching': True, }] + _FORMATS = { + 'super': '原画', + 'high': '超清', + 'normal': '高清', + } + + _QUALITIES = { + 'super': -1, + 'high': -2, + 'normal': -3, + } + + _RESOLUTIONS = { + 'super': '1920x1080', + 'high': '1280x720', + 'normal': '852x480', + } + def _real_extract(self, url): url = url.replace('vmobile.', 'v.') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - room_info = self._parse_json(self._search_regex( - r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + video_info = self._search_json( + r' - ''' % PLAYER_JS_RE, webpage) + ''' % PLAYER_JS_RE, webpage) # noqa: UP031 if mobj is not None: - return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] + return [add_referer('eagleplatform:{host}:{id}'.format(**mobj.groupdict()))] @staticmethod def _handle_error(response): @@ -108,11 +108,11 @@ def _handle_error(response): def _download_json(self, url_or_request, video_id, *args, **kwargs): try: - response = super(EaglePlatformIE, self)._download_json( + response = super()._download_json( url_or_request, video_id, *args, **kwargs) except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + if isinstance(ee.cause, HTTPError): + response = self._parse_json(ee.cause.response.read().decode('utf-8'), video_id) self._handle_error(response) raise return response @@ -137,7 +137,7 @@ def 
_real_extract(self, url): query['referrer'] = referrer player_data = self._download_json( - 'http://%s/api/player_data' % host, video_id, + f'http://{host}/api/player_data', video_id, headers=headers, query=query) media = player_data['data']['playlist']['viewports'][0]['medialist'][0] @@ -186,7 +186,7 @@ def _real_extract(self, url): }) else: f = { - 'format_id': 'http-%s' % format_id, + 'format_id': f'http-{format_id}', 'height': int_or_none(format_id), } f['url'] = format_url @@ -212,4 +212,4 @@ def _extract_embed_urls(cls, url, webpage): mobj = re.search( r']+src="https?://(?Pmedia\.clipyou\.ru)/index/player\?.*\brecord_id=(?P\d+).*"', webpage) if mobj is not None: - yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url}) + yield smuggle_url('eagleplatform:{host}:{id}'.format(**mobj.groupdict()), {'referrer': url}) diff --git a/yt_dlp/extractor/ebaumsworld.py b/yt_dlp/extractor/ebaumsworld.py index 0854d03443..ac766b3809 100644 --- a/yt_dlp/extractor/ebaumsworld.py +++ b/yt_dlp/extractor/ebaumsworld.py @@ -18,7 +18,7 @@ class EbaumsWorldIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) config = self._download_xml( - 'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id) + f'http://www.ebaumsworld.com/video/player/{video_id}', video_id) video_url = config.find('file').text return { diff --git a/yt_dlp/extractor/ebay.py b/yt_dlp/extractor/ebay.py index d0eb9fc51c..f1d122f887 100644 --- a/yt_dlp/extractor/ebay.py +++ b/yt_dlp/extractor/ebay.py @@ -11,7 +11,7 @@ class EbayIE(InfoExtractor): 'ext': 'mp4', 'title': 'WiFi internal antenna adhesive for wifi 2.4GHz wifi 5 wifi 6 wifi 6E full bands', }, - 'params': {'skip_download': 'm3u8'} + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): @@ -32,5 +32,5 @@ def _real_extract(self, url): return { 'id': video_id, 'title': remove_end(self._html_extract_title(webpage), ' | eBay'), - 'formats': formats + 'formats': formats, } diff --git a/yt_dlp/extractor/echomsk.py b/yt_dlp/extractor/echomsk.py deleted file mode 100644 index 850eabbff0..0000000000 --- a/yt_dlp/extractor/echomsk.py +++ /dev/null @@ -1,43 +0,0 @@ -import re - -from .common import InfoExtractor - - -class EchoMskIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P\d+)' - _TEST = { - 'url': 'http://www.echo.msk.ru/sounds/1464134.html', - 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', - 'info_dict': { - 'id': '1464134', - 'ext': 'mp3', - 'title': 'Особое мнение - 29 декабря 2014, 19:08', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - audio_url = self._search_regex( - r'', webpage, 'audio URL') - - title = self._html_search_regex( - r'([^<]+)', - webpage, 'title') - - air_date = self._html_search_regex( - r'(?s)
<div class="date">(.+?)</div>
', - webpage, 'date', fatal=False, default=None) - - if air_date: - air_date = re.sub(r'(\s)\1+', r'\1', air_date) - if air_date: - title = '%s - %s' % (title, air_date) - - return { - 'id': video_id, - 'url': audio_url, - 'title': title, - } diff --git a/yt_dlp/extractor/egghead.py b/yt_dlp/extractor/egghead.py index a4b2a12f68..62d2e544c9 100644 --- a/yt_dlp/extractor/egghead.py +++ b/yt_dlp/extractor/egghead.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, @@ -13,13 +12,13 @@ class EggheadBaseIE(InfoExtractor): def _call_api(self, path, video_id, resource, fatal=True): return self._download_json( 'https://app.egghead.io/api/v1/' + path, - video_id, 'Downloading %s JSON' % resource, fatal=fatal) + video_id, f'Downloading {resource} JSON', fatal=fatal) class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' - _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, @@ -46,7 +45,7 @@ def _real_extract(self, url): continue lesson_id = lesson.get('id') if lesson_id: - lesson_id = compat_str(lesson_id) + lesson_id = str(lesson_id) entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) @@ -55,7 +54,7 @@ def _real_extract(self, url): playlist_id = course.get('id') if playlist_id: - playlist_id = compat_str(playlist_id) + playlist_id = str(playlist_id) return self.playlist_result( entries, playlist_id, course.get('title'), @@ -65,7 +64,7 @@ def _real_extract(self, url): class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { @@ -98,7 +97,7 @@ def _real_extract(self, url): lesson = self._call_api( 'lessons/' + display_id, display_id, 'lesson') - lesson_id = compat_str(lesson['id']) + lesson_id = str(lesson['id']) title = lesson['title'] formats = [] @@ -129,6 +128,6 @@ def _real_extract(self, url): 'view_count': int_or_none(lesson.get('plays_count')), 'tags': try_get(lesson, lambda x: x['tag_list'], list), 'series': try_get( - lesson, lambda x: x['series']['title'], compat_str), + lesson, lambda x: x['series']['title'], str), 'formats': formats, } diff --git a/yt_dlp/extractor/ehow.py b/yt_dlp/extractor/ehow.py deleted file mode 100644 index 74469ce36f..0000000000 --- a/yt_dlp/extractor/ehow.py +++ /dev/null @@ -1,36 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class EHowIE(InfoExtractor): - IE_NAME = 'eHow' - _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' - _TEST = { - 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', - 'md5': '9809b4e3f115ae2088440bcb4efbf371', - 'info_dict': { - 'id': '12245069', - 'ext': 'flv', - 'title': 'Hardwood Flooring Basics', - 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. 
Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...', - 'uploader': 'Erick Nathan', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL') - final_url = compat_urllib_parse_unquote(video_url) - uploader = self._html_search_meta('uploader', webpage) - title = self._og_search_title(webpage).replace(' | eHow', '') - - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), - 'uploader': uploader, - } diff --git a/yt_dlp/extractor/eighttracks.py b/yt_dlp/extractor/eighttracks.py index 3dd9ab1b31..3ac4c56ae0 100644 --- a/yt_dlp/extractor/eighttracks.py +++ b/yt_dlp/extractor/eighttracks.py @@ -2,9 +2,6 @@ import random from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( ExtractorError, ) @@ -29,8 +26,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885610', 'ext': 'm4a', 'title': "youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': '4ab26f05c1f7291ea460a3920be8021f', @@ -38,8 +35,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885608', 'ext': 'm4a', 'title': "youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': 'd30b5b5f74217410f4689605c35d1fd7', @@ -47,8 +44,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885679', 'ext': 'm4a', 'title': "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': '4eb0a669317cd725f6bbd336a29f923a', @@ -56,8 +53,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885680', 'ext': 'm4a', 'title': "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': '1893e872e263a2705558d1d319ad19e8', @@ -65,8 +62,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885682', 'ext': 'm4a', 'title': "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': 'b673c46f47a216ab1741ae8836af5899', @@ -74,8 +71,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885683', 'ext': 'm4a', 'title': "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': '1d74534e95df54986da7f5abf7d842b7', @@ -83,8 +80,8 @@ class EightTracksIE(InfoExtractor): 'id': '11885684', 'ext': 'm4a', 'title': "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } + 'uploader_id': 'ytdl', + }, }, { 'md5': 'f081f47af8f6ae782ed131d38b9cd1c0', @@ -92,10 +89,10 @@ class EightTracksIE(InfoExtractor): 'id': '11885685', 'ext': 'm4a', 'title': "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad", - 'uploader_id': 'ytdl' - } - } - ] + 'uploader_id': 'ytdl', + }, + }, + ], } def _real_extract(self, url): @@ -105,7 +102,7 @@ def _real_extract(self, url): data = self._parse_json( self._search_regex( - r"(?s)PAGE\.mix\s*=\s*({.+?});\n", webpage, 'trax information'), + r'(?s)PAGE\.mix\s*=\s*({.+?});\n', webpage, 'trax information'), playlist_id) session = str(random.randint(0, 1000000000)) @@ -116,7 +113,7 @@ def _real_extract(self, url): # duration is sometimes negative, 
    use predefined avg duration if avg_song_duration <= 0: avg_song_duration = 300 - first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) + first_url = f'http://8tracks.com/sets/{session}/play?player=sm&mix_id={mix_id}&format=jsonh' next_url = first_url entries = [] @@ -140,7 +137,7 @@ def _real_extract(self, url): api_data = json.loads(api_json) track_data = api_data['set']['track'] info = { - 'id': compat_str(track_data['id']), + 'id': str(track_data['id']), 'url': track_data['track_file_stream_url'], 'title': track_data['performer'] + ' - ' + track_data['name'], 'raw_title': track_data['name'], @@ -149,12 +146,12 @@ def _real_extract(self, url): } entries.append(info) - next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % ( + next_url = 'http://8tracks.com/sets/{}/next?player=sm&mix_id={}&format=jsonh&track_id={}'.format( session, mix_id, track_data['id']) return { '_type': 'playlist', 'entries': entries, - 'id': compat_str(mix_id), + 'id': str(mix_id), 'display_id': playlist_id, 'title': data.get('name'), 'description': data.get('description'), diff --git a/yt_dlp/extractor/einthusan.py b/yt_dlp/extractor/einthusan.py deleted file mode 100644 index 53bc2535d0..0000000000 --- a/yt_dlp/extractor/einthusan.py +++ /dev/null @@ -1,105 +0,0 @@ -import json - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_str, - compat_urlparse, -) -from ..utils import ( - extract_attributes, - ExtractorError, - get_elements_by_class, - urlencode_postdata, -) - - -class EinthusanIE(InfoExtractor): - _VALID_URL = r'https?://(?P<host>einthusan\.(?:tv|com|ca))/movie/watch/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://einthusan.tv/movie/watch/9097/', - 'md5': 'ff0f7f2065031b8a2cf13a933731c035', - 'info_dict': { - 'id': '9097', - 'ext': 'mp4', - 'title': 'Ae Dil Hai Mushkil', - 'description': 'md5:33ef934c82a671a94652a9b4e54d931b', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi', - 'only_matching': True, - }, { - 'url': 'https://einthusan.com/movie/watch/9097/', - 'only_matching': True, - }, { - 'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi', - 'only_matching': True, - }] - - # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js - def _decrypt(self, encrypted_data, video_id): - return self._parse_json(compat_b64decode(( - encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] - )).decode('utf-8'), video_id) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - host = mobj.group('host') - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex(r'<h3>([^<]+)</h3>
    
    ', webpage, 'title') - - player_params = extract_attributes(self._search_regex( - r'(<section[^>]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters')) - - page_id = self._html_search_regex( - '<html[^>]+data-pageid="([^"]+)"', webpage, 'page ID') - video_data = self._download_json( - 'https://%s/ajax/movie/watch/%s/' % (host, video_id), video_id, - data=urlencode_postdata({ - 'xEvent': 'UIVideoPlayer.PingOutcome', - 'xJson': json.dumps({ - 'EJOutcomes': player_params['data-ejpingables'], - 'NativeHLS': False - }), - 'arcVersion': 3, - 'appVersion': 59, - 'gorilla.csrf.Token': page_id, - }))['Data'] - - if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'): - raise ExtractorError( - 'Download rate reached. Please try again later.', expected=True) - - ej_links = self._decrypt(video_data['EJLinks'], video_id) - - formats = [] - - m3u8_url = ej_links.get('HLSLink') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) - - mp4_url = ej_links.get('MP4Link') - if mp4_url: - formats.append({ - 'url': mp4_url, - }) - - description = get_elements_by_class('synopsis', webpage)[0] - thumbnail = self._html_search_regex( - r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''', - webpage, 'thumbnail url', fatal=False, group='url') - if thumbnail is not None: - thumbnail = compat_urlparse.urljoin(url, thumbnail) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/yt_dlp/extractor/eitb.py b/yt_dlp/extractor/eitb.py index bd027da6b4..18b802eb15 100644 --- a/yt_dlp/extractor/eitb.py +++ b/yt_dlp/extractor/eitb.py @@ -1,9 +1,10 @@ from .common import InfoExtractor +from ..networking import Request from ..utils import ( float_or_none, int_or_none, + join_nonempty, parse_iso8601, - sanitized_Request, ) @@ -30,7 +31,7 @@ def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, + f'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/{video_id}/', video_id, 'Downloading video JSON') media = video['web_media'][0] @@ -41,12 +42,9 @@ def _real_extract(self, url): if not video_url: continue tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000) - format_id = 'http' - if tbr: - format_id += '-%d' % int(tbr) formats.append({ 'url': rendition['PMD_URL'], - 'format_id': format_id, + 'format_id': join_nonempty('http', int_or_none(tbr)), 'width': int_or_none(rendition.get('FRAME_WIDTH')), 'height': int_or_none(rendition.get('FRAME_HEIGHT')), 'tbr': tbr, @@ -54,7 +52,7 @@ def _real_extract(self, url): hls_url = media.get('HLS_SURL') if hls_url: - request = sanitized_Request( + request = Request( 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', headers={'Referer': url}) token_data = self._download_json( @@ -63,12 +61,12 @@ def _real_extract(self, url): token = token_data.get('token') if token: formats.extend(self._extract_m3u8_formats( - '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False)) + f'{hls_url}?hdnts={token}', video_id, m3u8_id='hls', fatal=False)) hds_url = media.get('HDS_SURL') if hds_url: formats.extend(self._extract_f4m_formats( - '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), + '{}?hdcore=3.7.0'.format(hds_url.replace('euskalsvod', 'euskalvod')), video_id, f4m_id='hds', fatal=False)) return { diff --git a/yt_dlp/extractor/elementorembed.py 
    
    b/yt_dlp/extractor/elementorembed.py new file mode 100644 index 0000000000..638893f6f6 --- /dev/null +++ b/yt_dlp/extractor/elementorembed.py @@ -0,0 +1,72 @@ +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from .youtube import YoutubeIE +from ..utils import unescapeHTML, url_or_none +from ..utils.traversal import traverse_obj + + +class ElementorEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + 'url': 'https://capitaltv.cy/2023/12/14/υγεια-και-ζωη-14-12-2023-δρ-ξενια-κωσταντινιδο/', + 'info_dict': { + 'id': 'KgzuxwuQwM4', + 'ext': 'mp4', + 'title': 'ΥΓΕΙΑ ΚΑΙ ΖΩΗ 14 12 2023 ΔΡ ΞΕΝΙΑ ΚΩΣΤΑΝΤΙΝΙΔΟΥ', + 'thumbnail': 'https://i.ytimg.com/vi/KgzuxwuQwM4/maxresdefault.jpg', + 'playable_in_embed': True, + 'tags': 'count:16', + 'like_count': int, + 'channel': 'Capital TV Cyprus', + 'channel_id': 'UCR8LwVKTLGEXt4ZAErpCMrg', + 'availability': 'public', + 'description': 'md5:7a3308a22881aea4612358c4ba121f77', + 'duration': 2891, + 'upload_date': '20231214', + 'uploader_id': '@capitaltvcyprus6389', + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCR8LwVKTLGEXt4ZAErpCMrg', + 'uploader_url': 'https://www.youtube.com/@capitaltvcyprus6389', + 'uploader': 'Capital TV Cyprus', + 'age_limit': 0, + 'categories': ['News & Politics'], + 'view_count': int, + 'channel_follower_count': int, + }, + }, { + 'url': 'https://elementor.com/academy/theme-builder-collection/?playlist=76011151&video=9e59909', + 'info_dict': { + 'id': '?playlist=76011151&video=9e59909', + 'title': 'Theme Builder Collection - Academy', + 'age_limit': 0, + 'timestamp': 1702196984.0, + 'upload_date': '20231210', + 'description': 'md5:7f52c52715ee9e54fd7f82210511673d', + 'thumbnail': 'https://elementor.com/academy/wp-content/uploads/2021/07/Theme-Builder-1.png', + }, + 'playlist_count': 11, + 'params': { + 'skip_download': True, + }, + }] + _WIDGET_REGEX = r'<div[^>]+class="[^"]*elementor-widget-video(?:-playlist)?[^"]*"[^>]*data-settings="([^"]*)"' + + def _extract_from_webpage(self, url, webpage): + for data_settings in re.findall(self._WIDGET_REGEX, webpage): + data = self._parse_json(data_settings, None, fatal=False, transform_source=unescapeHTML) + if youtube_url := traverse_obj(data, ('youtube_url', {url_or_none})): + yield self.url_result(youtube_url, ie=YoutubeIE) + + for video in traverse_obj(data, ('tabs', lambda _, v: v['_id'], {dict})): + if youtube_url := traverse_obj(video, ('youtube_url', {url_or_none})): + yield self.url_result(youtube_url, ie=YoutubeIE) + if vimeo_url := traverse_obj(video, ('vimeo_url', {url_or_none})): + yield self.url_result(vimeo_url, ie=VimeoIE) + for direct_url in traverse_obj(video, (('hosted_url', 'external_url'), 'url', {url_or_none})): + yield { + 'id': video['_id'], + 'url': direct_url, + 'title': video.get('title'), + } diff --git a/yt_dlp/extractor/ellentube.py b/yt_dlp/extractor/ellentube.py deleted file mode 100644 index 6eb00f9c98..0000000000 --- a/yt_dlp/extractor/ellentube.py +++ /dev/null @@ -1,130 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - extract_attributes, - float_or_none, - int_or_none, - try_get, -) - - -class EllenTubeBaseIE(InfoExtractor): - def _extract_data_config(self, webpage, video_id): - details = self._search_regex( - r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?>
    
    )', webpage, - 'details') - return self._parse_json( - extract_attributes(details)['data-config'], video_id) - - def _extract_video(self, data, video_id): - title = data['title'] - - formats = [] - duration = None - for entry in data.get('media'): - if entry.get('id') == 'm3u8': - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - entry['url'], video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - duration = int_or_none(entry.get('duration')) - break - - def get_insight(kind): - return int_or_none(try_get( - data, lambda x: x['insight']['%ss' % kind])) - - return { - 'extractor_key': EllenTubeIE.ie_key(), - 'id': video_id, - 'title': title, - 'description': data.get('description'), - 'duration': duration, - 'thumbnail': data.get('thumbnail'), - 'timestamp': float_or_none(data.get('publishTime'), scale=1000), - 'view_count': get_insight('view'), - 'like_count': get_insight('like'), - 'formats': formats, - 'subtitles': subtitles, - } - - -class EllenTubeIE(EllenTubeBaseIE): - _VALID_URL = r'''(?x) - (?: - ellentube:| - https://api-prod\.ellentube\.com/ellenapi/api/item/ - ) - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) - ''' - _TESTS = [{ - 'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3', - 'md5': '2fabc277131bddafdd120e0fc0f974c9', - 'info_dict': { - 'id': '0822171c-3829-43bf-b99f-d77358ae75e3', - 'ext': 'mp4', - 'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck', - 'description': 'md5:76e3355e2242a78ad9e3858e5616923f', - 'thumbnail': r're:^https?://.+?', - 'duration': 514, - 'timestamp': 1508505120, - 'upload_date': '20171020', - 'view_count': int, - 'like_count': int, - } - }, { - 'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_json( - 'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id, - video_id) - return self._extract_video(data, video_id) - - -class EllenTubeVideoIE(EllenTubeBaseIE): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+?)\.html' - _TEST = { - 'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html', - 'only_matching': True, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._extract_data_config(webpage, display_id)['id'] - return self.url_result( - 'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(), - video_id=video_id) - - -class EllenTubePlaylistIE(EllenTubeBaseIE): - _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+?)\.html' - _TESTS = [{ - 'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html', - 'info_dict': { - 'id': 'dax-shepard-jordan-fisher-haim', - 'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM", - 'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c', - }, - 'playlist_count': 6, - }, { - 'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - data = self._extract_data_config(webpage, display_id)['data'] - feed = self._download_json( - 'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' - % data['filter'], display_id) - entries = [ - self._extract_video(elem, elem['id']) - for elem in feed if elem.get('type') == 
    
    'VIDEO' and elem.get('id')] - return self.playlist_result( - entries, display_id, data.get('title'), - clean_html(data.get('description'))) diff --git a/yt_dlp/extractor/elpais.py b/yt_dlp/extractor/elpais.py index 7c6c880757..46fa29f8ae 100644 --- a/yt_dlp/extractor/elpais.py +++ b/yt_dlp/extractor/elpais.py @@ -15,7 +15,7 @@ class ElPaisIE(InfoExtractor): 'title': 'Tiempo nuevo, recetas viejas', 'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.', 'upload_date': '20140206', - } + }, }, { 'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t', 'md5': '3bd5b09509f3519d7d9e763179b013de', 'info_dict': { 'id': '1456340311_668921', 'ext': 'mp4', 'title': 'Cómo hacer el mejor café con cafetera italiana', 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.', 'upload_date': '20160303', - } + }, }, { 'url': 'http://elpais.com/elpais/2017/01/26/ciencia/1485456786_417876.html', 'md5': '9c79923a118a067e1a45789e1e0b0f9c', diff --git a/yt_dlp/extractor/eltrecetv.py b/yt_dlp/extractor/eltrecetv.py new file mode 100644 index 0000000000..71cf921812 --- /dev/null +++ b/yt_dlp/extractor/eltrecetv.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor + + +class ElTreceTVIE(InfoExtractor): + IE_DESC = 'El Trece TV (Argentina)' + _VALID_URL = r'https?://(?:www\.)?eltrecetv\.com\.ar/[\w-]+/capitulos/temporada-\d+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'https://www.eltrecetv.com.ar/ahora-caigo/capitulos/temporada-2023/programa-del-061023/', + 'md5': '71a66673dc63f9a5939d97bfe4b311ba', + 'info_dict': { + 'id': 'AHCA05102023145553329621094', + 'ext': 'mp4', + 'title': 'AHORA CAIGO - Programa 06/10/23', + 'thumbnail': 'https://thumbs.vodgc.net/AHCA05102023145553329621094.JPG?649339', + }, + }, + { + 'url': 'https://www.eltrecetv.com.ar/poco-correctos/capitulos/temporada-2023/programa-del-250923-invitada-dalia-gutmann/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/argentina-tierra-de-amor-y-venganza/capitulos/temporada-2023/atav-2-capitulo-121-del-250923/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/ahora-caigo/capitulos/temporada-2023/programa-del-250923/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/pasaplatos/capitulos/temporada-2023/pasaplatos-el-restaurante-del-250923/', + 'only_matching': True, + }, + { + 'url': 'https://www.eltrecetv.com.ar/el-galpon/capitulos/temporada-2023/programa-del-160923-invitado-raul-lavie/', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + slug = self._match_id(url) + webpage = self._download_webpage(url, slug) + config = self._search_json( + r'Fusion.globalContent\s*=', webpage, 'content', slug)['promo_items']['basic']['embed']['config'] + video_url = config['m3u8'] + video_id = self._search_regex(r'/(\w+)\.m3u8', video_url, 'video id', default=slug) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls') + formats.extend([{ + 'url': f['url'][:-23], + 'format_id': f['format_id'].replace('hls', 'http'), + 'width': f.get('width'), + 'height': f.get('height'), + } for f in formats if f['url'].endswith('/tracks-v1a1/index.m3u8') and f.get('height') != 1080]) + + return { + 'id': video_id, + 
    
    'title': config.get('title'), + 'thumbnail': config.get('thumbnail'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py index 458aaa0a08..a424b49df7 100644 --- a/yt_dlp/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py @@ -106,4 +106,4 @@ def _real_extract(self, url): return self.url_result(src, YoutubeTabIE) return self.url_result(smuggle_url( urllib.parse.unquote(traverse_obj(qs, ('src', 0), ('url', 0))), - {'http_headers': {'Referer': url}})) + {'referer': url})) diff --git a/yt_dlp/extractor/engadget.py b/yt_dlp/extractor/engadget.py deleted file mode 100644 index e7c5d7bf16..0000000000 --- a/yt_dlp/extractor/engadget.py +++ /dev/null @@ -1,15 +0,0 @@ -from .common import InfoExtractor - - -class EngadgetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)' - - _TESTS = [{ - # video with vidible ID - 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('aol-video:%s' % video_id) diff --git a/yt_dlp/extractor/epicon.py b/yt_dlp/extractor/epicon.py index 3bfcc5470c..696f3e11f0 100644 --- a/yt_dlp/extractor/epicon.py +++ b/yt_dlp/extractor/epicon.py @@ -14,7 +14,7 @@ class EpiconIE(InfoExtractor): 'title': 'Air Battle of Srinagar', 'description': 'md5:c4de2013af9bc05ae4392e4115d518d7', 'thumbnail': r're:^https?://.*\.jpg$', - } + }, }, { 'url': 'https://www.epicon.in/movies/krit', 'info_dict': { @@ -23,7 +23,7 @@ class EpiconIE(InfoExtractor): 'id': 'krit', 'title': 'Krit', 'description': 'md5:c12b35dad915d48ccff7f013c79bab4a', 'thumbnail': r're:^https?://.*\.jpg$', - } + }, }, { 'url': 'https://www.epicon.in/tv-shows/paapnaashini-ganga/season-1/vardaan', 'info_dict': { @@ -32,7 +32,7 @@ class EpiconIE(InfoExtractor): 'id': 'vardaan', 'title': 'Paapnaashini Ganga - Season 1 - Ep 1 - VARDAAN', 'description': 'md5:f517058c3d0402398eefa6242f4dd6ae', 'thumbnail': r're:^https?://.*\.jpg$', - } + }, }, { 'url': 'https://www.epicon.in/movies/jayadev', 'info_dict': { @@ -41,16 +41,17 @@ class EpiconIE(InfoExtractor): 'id': 'jayadev', 'title': 'Jayadev', 'description': 'md5:09e349eecd8e585a3b6466904f19df6c', 'thumbnail': r're:^https?://.*\.jpg$', - } + }, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) cid = self._search_regex(r'class=\"mylist-icon\ iconclick\"\ id=\"(\d+)', webpage, 'cid') headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'} data = f'cid={cid}&action=st&type=video'.encode() - data_json = self._parse_json(self._download_json('https://www.epicon.in/ajaxplayer/', id, headers=headers, data=data), id) + data_json = self._parse_json( + self._download_json('https://www.epicon.in/ajaxplayer/', video_id, headers=headers, data=data), video_id) if not data_json['success']: raise ExtractorError(data_json['message'], expected=True) @@ -58,7 +59,7 @@ def _real_extract(self, url): title = self._search_regex(r'setplaytitle=\"([^\"]+)', webpage, 'title') description = self._og_search_description(webpage) or None thumbnail = self._og_search_thumbnail(webpage) or None - formats = self._extract_m3u8_formats(data_json['url']['video_url'], id) + formats = self._extract_m3u8_formats(data_json['url']['video_url'], video_id) subtitles = {} for subtitle in data_json.get('subtitles', []): @@ -70,7 +71,7 @@ def _real_extract(self, url): }) return { 
    
    - 'id': id, + 'id': video_id, 'formats': formats, 'title': title, 'description': description, @@ -108,8 +109,8 @@ class EpiconSeriesIE(InfoExtractor): }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - episodes = re.findall(r'ct-tray-url=\"(tv-shows/%s/[^\"]+)' % id, webpage) - entries = [self.url_result('https://www.epicon.in/%s' % episode, ie=EpiconIE.ie_key()) for episode in episodes] - return self.playlist_result(entries, playlist_id=id) + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + episodes = re.findall(rf'ct-tray-url=\"(tv-shows/{playlist_id}/[^\"]+)', webpage) + entries = [self.url_result(f'https://www.epicon.in/{episode}', EpiconIE) for episode in episodes] + return self.playlist_result(entries, playlist_id=playlist_id) diff --git a/yt_dlp/extractor/epidemicsound.py b/yt_dlp/extractor/epidemicsound.py new file mode 100644 index 0000000000..75b0f052b2 --- /dev/null +++ b/yt_dlp/extractor/epidemicsound.py @@ -0,0 +1,124 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + join_nonempty, + orderedSet, + parse_iso8601, + parse_qs, + parse_resolution, + str_or_none, + traverse_obj, + url_or_none, +) + + +class EpidemicSoundIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?epidemicsound\.com/(?:(?P<sfx>sound-effects/tracks)|track)/(?P<id>[0-9a-zA-Z-]+)' + _TESTS = [{ + 'url': 'https://www.epidemicsound.com/track/yFfQVRpSPz/', + 'md5': 'd98ff2ddb49e8acab9716541cbc9dfac', + 'info_dict': { + 'id': '45014', + 'display_id': 'yFfQVRpSPz', + 'ext': 'mp3', + 'title': 'Door Knock Door 1', + 'alt_title': 'Door Knock Door 1', + 'tags': ['foley', 'door', 'knock', 'glass', 'window', 'glass door knock'], + 'categories': ['Misc. 
    
Door'], + 'duration': 1, + 'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/default-sfx/3000x3000.jpg', + 'timestamp': 1415320353, + 'upload_date': '20141107', + }, + }, { + 'url': 'https://www.epidemicsound.com/track/mj8GTTwsZd/', + 'md5': 'c82b745890f9baf18dc2f8d568ee3830', + 'info_dict': { + 'id': '148700', + 'display_id': 'mj8GTTwsZd', + 'ext': 'mp3', + 'title': 'Noplace', + 'tags': ['liquid drum n bass', 'energetic'], + 'categories': ['drum and bass'], + 'duration': 237, + 'timestamp': 1694426482, + 'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/11138/3000x3000.jpg', + 'upload_date': '20230911', + 'release_timestamp': 1700535606, + 'release_date': '20231121', + }, + }, { + 'url': 'https://www.epidemicsound.com/sound-effects/tracks/2f02f54b-9faa-4daf-abac-1cfe9e9cef69/', + 'md5': '35d7cf05bd8b614a84f0495a05de9388', + 'info_dict': { + 'id': '208931', + 'ext': 'mp3', + 'upload_date': '20240603', + 'timestamp': 1717436529, + 'categories': ['appliance'], + 'display_id': '6b2NXLURPr', + 'duration': 1.0, + 'title': 'Oven, Grill, Door Open 01', + 'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/default-sfx/3000x3000.jpg', + }, + }] + + @staticmethod + def _epidemic_parse_thumbnail(url: str): + if not url_or_none(url): + return None + + return { + 'url': url, + **(traverse_obj(url, ({parse_qs}, { + 'width': ('width', 0, {int_or_none}), + 'height': ('height', 0, {int_or_none}), + })) or parse_resolution(url)), + } + + @staticmethod + def _epidemic_fmt_or_none(f): + if not f.get('format'): + f['format'] = f.get('format_id') + elif not f.get('format_id'): + f['format_id'] = f['format'] + if not f['url'] or not f['format']: + return None + if f.get('format_note'): + f['format_note'] = f'track ID {f["format_note"]}' + if f['format'] != 'full': + f['preference'] = -2 + return f + + def _real_extract(self, url): + video_id, is_sfx = self._match_valid_url(url).group('id', 'sfx') + json_data = self._download_json(join_nonempty( + 'https://www.epidemicsound.com/json/track', + is_sfx and 'kosmos-id', video_id, delim='/'), video_id) + + thumbnails = traverse_obj(json_data, [('imageUrl', 'cover')]) + thumb_base_url = traverse_obj(json_data, ('coverArt', 'baseUrl', {url_or_none})) + if thumb_base_url: + thumbnails.extend(traverse_obj(json_data, ( + 'coverArt', 'sizes', ..., {thumb_base_url.__add__}))) + + return traverse_obj(json_data, { + 'id': ('id', {str_or_none}), + 'display_id': ('publicSlug', {str}), + 'title': ('title', {str}), + 'alt_title': ('oldTitle', {str}), + 'duration': ('length', {float_or_none}), + 'timestamp': ('added', {parse_iso8601}), + 'release_timestamp': ('releaseDate', {parse_iso8601}), + 'categories': ('genres', ..., 'tag', {str}), + 'tags': ('metadataTags', ..., {str}), + 'age_limit': ('isExplicit', {lambda b: 18 if b else None}), + 'thumbnails': ({lambda _: thumbnails}, {orderedSet}, ..., {self._epidemic_parse_thumbnail}), + 'formats': ('stems', {dict.items}, ..., { + 'format': (0, {str_or_none}), + 'format_note': (1, 's3TrackId', {str_or_none}), + 'format_id': (1, 'stemType', {str}), + 'url': (1, 'lqMp3Url', {url_or_none}), + }, {self._epidemic_fmt_or_none}), + }) diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py new file mode 100644 index 0000000000..d2ad5b441e --- /dev/null +++ b/yt_dlp/extractor/eplus.py @@ -0,0 +1,205 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_call, + 
    unified_timestamp, + urlencode_postdata, +) + + +class EplusIbIE(InfoExtractor): + _NETRC_MACHINE = 'eplus' + IE_NAME = 'eplus' + IE_DESC = 'e+ (イープラス)' + _VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)', + r'https?://live\.eplus\.jp/(?P<id>sample|\d+)'] + _TESTS = [{ + 'url': 'https://live.eplus.jp/ex/player?ib=41K6Wzbr3PlcMD%2FOKHFlC%2FcZCe2Eaw7FK%2BpJS1ooUHki8d0vGSy2mYqxillQBe1dSnOxU%2B8%2FzXKls4XPBSb3vw%3D%3D', + 'info_dict': { + 'id': '335699-0001-006', + 'title': '少女☆歌劇 レヴュースタァライト -The LIVE 青嵐- BLUE GLITTER <定点映像配信>【Streaming+(配信)】', + 'live_status': 'was_live', + 'release_date': '20201221', + 'release_timestamp': 1608544800, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'This event may not be accessible', + 'No video formats found', + 'Requested format is not available', + ], + }, { + 'url': 'https://live.eplus.jp/ex/player?ib=6QSsQdyRAwOFZrEHWlhRm7vocgV%2FO0YzBZ%2BaBEBg1XR%2FmbLn0R%2F048dUoAY038%2F%2F92MJ73BsoAtvUpbV6RLtDQ%3D%3D&show_id=2371511', + 'info_dict': { + 'id': '348021-0054-001', + 'title': 'ラブライブ!スーパースター!! Liella! First LoveLive! Tour ~Starlines~【東京/DAY.1】', + 'live_status': 'was_live', + 'release_date': '20220115', + 'release_timestamp': 1642233600, + 'description': str, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + ], + }, { + 'url': 'https://live.eplus.jp/sample', + 'info_dict': { + 'id': 'stream1ng20210719-test-005', + 'title': 'Online streaming test for DRM', + 'live_status': 'was_live', + 'release_date': '20210719', + 'release_timestamp': 1626703200, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + 'This video is DRM protected', + ], + }, { + 'url': 'https://live.eplus.jp/2053935', + 'info_dict': { + 'id': '331320-0001-001', + 'title': '丘みどり2020配信LIVE Vol.2 ~秋麗~ 【Streaming+(配信チケット)】', + 'live_status': 'was_live', + 'release_date': '20200920', + 'release_timestamp': 1600596000, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. 
    
This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + ], + }] + + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0' + + def _login(self, username, password, urlh): + if not self._get_cookies('https://live.eplus.jp/').get('ci_session'): + raise ExtractorError('Unable to get ci_session cookie') + + cltft_token = urlh.headers.get('X-CLTFT-Token') + if not cltft_token: + raise ExtractorError('Unable to get X-CLTFT-Token') + self._set_cookie('live.eplus.jp', 'X-CLTFT-Token', cltft_token) + + login_json = self._download_json( + 'https://live.eplus.jp/member/api/v1/FTAuth/idpw', None, + note='Sending pre-login info', errnote='Unable to send pre-login info', headers={ + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': urlh.url, + 'X-Cltft-Token': cltft_token, + 'Accept': '*/*', + }, data=json.dumps({ + 'loginId': username, + 'loginPassword': password, + }).encode()) + if not login_json.get('isSuccess'): + raise ExtractorError('Login failed: Invalid id or password', expected=True) + + self._request_webpage( + urlh.url, None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'loginId': username, + 'loginPassword': password, + 'Token.Default': cltft_token, + 'op': 'nextPage', + }), headers={'Referer': urlh.url}) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle( + url, video_id, headers={'User-Agent': self._USER_AGENT}) + if urlh.url.startswith('https://live.eplus.jp/member/auth'): + username, password = self._get_login_info() + if not username: + self.raise_login_required() + self._login(username, password, urlh) + webpage = self._download_webpage( + url, video_id, headers={'User-Agent': self._USER_AGENT}) + + data_json = self._search_json(r'') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + data.get('file_url') or data['stream_url'], video_id, 'm4a', m3u8_id='hls'), + 'age_limit': 18, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'release_timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'id', {str_or_none}), + 'uploader_url': ('user', 'permalink_url', {url_or_none}), + 'thumbnail': ('artwork_url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('plays', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + 'webpage_url': ('permalink_url', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/eroprofile.py b/yt_dlp/extractor/eroprofile.py index 2b61f3be7d..2067217e7c 100644 --- a/yt_dlp/extractor/eroprofile.py +++ b/yt_dlp/extractor/eroprofile.py @@ -1,7 +1,7 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, merge_dicts, @@ -38,7 +38,7 @@ class EroProfileIE(InfoExtractor): }] def _perform_login(self, username, password): - query = compat_urllib_parse_urlencode({ + query = urllib.parse.urlencode({ 'username': username, 'password': password, 'url': 'http://www.eroprofile.com/', @@ -91,7 +91,7 @@ class EroProfileAlbumIE(InfoExtractor): 'url': 'https://www.eroprofile.com/m/videos/album/BBW-2-893', 'info_dict': { 'id': 'BBW-2-893', - 'title': 'BBW 2' + 'title': 'BBW 2', }, 'playlist_mincount': 486, }, diff --git 
    a/yt_dlp/extractor/err.py b/yt_dlp/extractor/err.py new file mode 100644 index 0000000000..7896cdbdc0 --- /dev/null +++ b/yt_dlp/extractor/err.py @@ -0,0 +1,224 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class ERRJupiterIE(InfoExtractor): + _VALID_URL = r'https?://(?:jupiter(?:pluss)?|lasteekraan)\.err\.ee/(?P<id>\d+)' + _TESTS = [{ + 'note': 'Jupiter: Movie: siin-me-oleme', + 'url': 'https://jupiter.err.ee/1211107/siin-me-oleme', + 'md5': '9b45d1682a98853acaa1e1b0c791f425', + 'info_dict': { + 'id': '1211107', + 'ext': 'mp4', + 'title': 'Siin me oleme!', + 'alt_title': '', + 'description': 'md5:1825b795f5f7584241aeb59e5bbb4f70', + 'release_date': '20231226', + 'upload_date': '20201217', + 'modified_date': '20201217', + 'release_timestamp': 1703577600, + 'timestamp': 1608210000, + 'modified_timestamp': 1608220800, + 'release_year': 1978, + }, + }, { + 'note': 'Jupiter: Series: Impulss', + 'url': 'https://jupiter.err.ee/1609145945/impulss', + 'md5': 'a378486df07ed1ba74e46cc861886243', + 'info_dict': { + 'id': '1609145945', + 'ext': 'mp4', + 'title': 'Impulss', + 'alt_title': 'Loteriipilet hooldekodusse', + 'description': 'md5:fa8a2ed0cdccb130211513443ee4d571', + 'release_date': '20231107', + 'upload_date': '20231026', + 'modified_date': '20231118', + 'release_timestamp': 1699380000, + 'timestamp': 1698327601, + 'modified_timestamp': 1700311802, + 'series': 'Impulss', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Loteriipilet hooldekodusse', + 'episode_number': 6, + 'series_id': '1609108187', + 'release_year': 2023, + 'episode_id': '1609145945', + }, + }, { + 'note': 'Jupiter: Radio Show: mnemoturniir episode', + 'url': 'https://jupiter.err.ee/1037919/mnemoturniir', + 'md5': 'f1eb95fe66f9620ff84e81bbac37076a', + 'info_dict': { + 'id': '1037919', + 'ext': 'm4a', + 'title': 'Mnemoturniir', + 'alt_title': '', + 'description': 'md5:626db52394e7583c26ab74d6a34d9982', + 'release_date': '20240121', + 'upload_date': '20240108', + 'modified_date': '20240121', + 'release_timestamp': 1705827900, + 'timestamp': 1704675602, + 'modified_timestamp': 1705827601, + 'series': 'Mnemoturniir', + 'season': 'Season 0', + 'season_number': 0, + 'episode': 'Episode 0', + 'episode_number': 0, + 'series_id': '1037919', + 'release_year': 2024, + 'episode_id': '1609215101', + }, + }, { + 'note': 'Jupiter+: Clip: bolee-zelenyj-tallinn', + 'url': 'https://jupiterpluss.err.ee/1609180445/bolee-zelenyj-tallinn', + 'md5': '1b812270c4daf6ce51c06bfeaf33ed95', + 'info_dict': { + 'id': '1609180445', + 'ext': 'mp4', + 'title': 'Более зеленый Таллинн', + 'alt_title': '', + 'description': 'md5:fd34d9bf939c28c4a725b19a7f0d6320', + 'release_date': '20231224', + 'upload_date': '20231130', + 'modified_date': '20231207', + 'release_timestamp': 1703423400, + 'timestamp': 1701338400, + 'modified_timestamp': 1701967200, + 'release_year': 2023, + }, + }, { + 'note': 'Jupiter+: Series: The Sniffer', + 'url': 'https://jupiterpluss.err.ee/1608311387/njuhach', + 'md5': '2abdeb7131ce551bce49e8d0cea08536', + 'info_dict': { + 'id': '1608311387', + 'ext': 'mp4', + 'title': 'Нюхач', + 'alt_title': '', + 'description': 'md5:8c5c7d8f32ec6e54cd498c9e59ca83bc', + 'release_date': '20230601', + 'upload_date': '20210818', + 'modified_date': '20210903', + 'release_timestamp': 1685633400, + 'timestamp': 1629318000, + 'modified_timestamp': 1630686000, + 'release_year': 2013, + 'episode': 'Episode 1', + 'episode_id': 
    
'1608311390', + 'episode_number': 1, + 'season': 'Season 1', + 'season_number': 1, + 'series': 'Нюхач', + 'series_id': '1608311387', + }, + }, { + 'note': 'Jupiter+: Podcast: lesnye-istorii-aisty', + 'url': 'https://jupiterpluss.err.ee/1608990335/lesnye-istorii-aisty', + 'md5': '8b46d7e4510b254a14b7a52211b5bf96', + 'info_dict': { + 'id': '1608990335', + 'ext': 'm4a', + 'title': 'Лесные истории | Аисты', + 'alt_title': '', + 'description': 'md5:065e721623e271e7a63e6540d409ca6b', + 'release_date': '20230609', + 'upload_date': '20230527', + 'modified_date': '20230608', + 'release_timestamp': 1686308700, + 'timestamp': 1685145600, + 'modified_timestamp': 1686252600, + 'release_year': 2023, + 'episode': 'Episode 0', + 'episode_id': '1608990335', + 'episode_number': 0, + 'season': 'Season 0', + 'season_number': 0, + 'series': 'Лесные истории | Аисты', + 'series_id': '1037497', + }, + }, { + 'note': 'Lasteekraan: Pätu', + 'url': 'https://lasteekraan.err.ee/1092243/patu', + 'md5': 'a67eb9b9bcb3d201718c15d1638edf77', + 'info_dict': { + 'id': '1092243', + 'ext': 'mp4', + 'title': 'Pätu', + 'alt_title': '', + 'description': 'md5:64a7b5a80afd7042d3f8ec48c77befd9', + 'release_date': '20230614', + 'upload_date': '20200520', + 'modified_date': '20200520', + 'release_timestamp': 1686745800, + 'timestamp': 1589975640, + 'modified_timestamp': 1589975640, + 'release_year': 1990, + 'episode': 'Episode 1', + 'episode_id': '1092243', + 'episode_number': 1, + 'season': 'Season 1', + 'season_number': 1, + 'series': 'Pätu', + 'series_id': '1092236', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://services.err.ee/api/v2/vodContent/getContentPageData', video_id, + query={'contentId': video_id})['data']['mainContent'] + + media_data = traverse_obj(data, ('medias', ..., {dict}), get_all=False) + if traverse_obj(media_data, ('restrictions', 'drm', {bool})): + self.report_drm(video_id) + + formats, subtitles = [], {} + for format_url in set(traverse_obj(media_data, ('src', ('hls', 'hls2', 'hlsNew'), {url_or_none}))): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + for format_url in set(traverse_obj(media_data, ('src', ('dash', 'dashNew'), {url_or_none}))): + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + if format_url := traverse_obj(media_data, ('src', 'file', {url_or_none})): + formats.append({ + 'url': format_url, + 'format_id': 'http', + }) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('heading', {str}), + 'alt_title': ('subHeading', {str}), + 'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}), + 'timestamp': ('created', {int_or_none}), + 'modified_timestamp': ('updated', {int_or_none}), + 'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}), + 'release_year': ('year', {int_or_none}), + }, get_all=False), + **(traverse_obj(data, { + 'series': ('heading', {str}), + 'series_id': ('rootContentId', {str_or_none}), + 'episode': ('subHeading', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'episode_id': ('id', {str_or_none}), + }) if data.get('type') == 'episode' else {}), + } diff --git a/yt_dlp/extractor/ertgr.py 
b/yt_dlp/extractor/ertgr.py index 9ecdf5d3b7..864aa6dc5a 100644 --- a/yt_dlp/extractor/ertgr.py +++ b/yt_dlp/extractor/ertgr.py @@ -2,17 +2,16 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( + ExtractorError, clean_html, determine_ext, - ExtractorError, dict_get, int_or_none, merge_dicts, - parse_qs, parse_age_limit, parse_iso8601, + parse_qs, str_or_none, try_get, url_or_none, @@ -30,19 +29,19 @@ def _call_api( headers = headers or {} if data: headers['Content-Type'] = headers_as_param['Content-Type'] = 'application/json;charset=utf-8' - data = json.dumps(merge_dicts(platform_codename, data)).encode('utf-8') + data = json.dumps(merge_dicts(platform_codename, data)).encode() query = merge_dicts( {} if data else platform_codename, {'$headers': json.dumps(headers_as_param)}, params) response = self._download_json( - 'https://api.app.ertflix.gr/v%s/%s' % (str(api_version), method), + f'https://api.app.ertflix.gr/v{api_version!s}/{method}', video_id, fatal=False, query=query, data=data, headers=headers) if try_get(response, lambda x: x['Result']['Success']) is True: return response def _call_api_get_tiles(self, video_id, *tile_ids): - requested_tile_ids = [video_id] + list(tile_ids) + requested_tile_ids = [video_id, *tile_ids] requested_tiles = [{'Id': tile_id} for tile_id in requested_tile_ids] tiles_response = self._call_api( video_id, method='Tile/GetTiles', api_version=2, @@ -174,9 +173,9 @@ class ERTFlixIE(ERTFlixBaseIE): }] def _extract_episode(self, episode): - codename = try_get(episode, lambda x: x['Codename'], compat_str) + codename = try_get(episode, lambda x: x['Codename'], str) title = episode.get('Title') - description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription', ))) + description = clean_html(dict_get(episode, ('ShortDescription', 'TinyDescription'))) if not codename or not title or not episode.get('HasPlayableStream', True): return thumbnail = next(( @@ -195,7 +194,7 @@ def _extract_episode(self, episode): 'timestamp': parse_iso8601(episode.get('PublishDate')), 'duration': episode.get('DurationSeconds'), 'age_limit': self._parse_age_rating(episode), - 'url': 'ertflix:%s' % (codename, ), + 'url': f'ertflix:{codename}', } @staticmethod @@ -212,7 +211,7 @@ def _extract_series(self, video_id, season_titles=None, season_numbers=None): series_info = { 'age_limit': self._parse_age_rating(series), 'title': series.get('Title'), - 'description': dict_get(series, ('ShortDescription', 'TinyDescription', )), + 'description': dict_get(series, ('ShortDescription', 'TinyDescription')), } if season_numbers: season_titles = season_titles or [] @@ -281,7 +280,7 @@ class ERTWebtvEmbedIE(InfoExtractor): 'id': 'trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4', 'title': 'md5:914f06a73cd8b62fbcd6fb90c636e497', 'ext': 'mp4', - 'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg' + 'thumbnail': 'https://program.ert.gr/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', }, }] diff --git a/yt_dlp/extractor/escapist.py b/yt_dlp/extractor/escapist.py deleted file mode 100644 index 85a1cbf400..0000000000 --- a/yt_dlp/extractor/escapist.py +++ /dev/null @@ -1,108 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - clean_html, - int_or_none, - float_or_none, -) - - -def _decrypt_config(key, string): - a = '' - i = '' - r = '' - - while len(a) < (len(string) / 2): - a += key - - a = a[0:int(len(string) / 2)] - - 
    t = 0 - while t < len(string): - i += chr(int(string[t] + string[t + 1], 16)) - t += 2 - - icko = [s for s in i] - - for t, c in enumerate(a): - r += chr(ord(c) ^ ord(icko[t])) - - return r - - -class EscapistIE(InfoExtractor): - _VALID_URL = r'https?://?(?:(?:www|v1)\.)?escapistmagazine\.com/videos/view/[^/]+/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', - 'md5': 'ab3a706c681efca53f0a35f1415cf0d1', - 'info_dict': { - 'id': '6618', - 'ext': 'mp4', - 'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", - 'title': "Breaking Down Baldur's Gate", - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 264, - 'uploader': 'The Escapist', - } - }, { - 'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer', - 'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf', - 'info_dict': { - 'id': '10044', - 'ext': 'mp4', - 'description': 'This week, Zero Punctuation reviews Evolve.', - 'title': 'Evolve - One vs Multiplayer', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 304, - 'uploader': 'The Escapist', - } - }, { - 'url': 'http://escapistmagazine.com/videos/view/the-escapist-presents/6618', - 'only_matching': True, - }, { - 'url': 'https://v1.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - ims_video = self._parse_json( - self._search_regex( - r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'), - video_id) - video_id = ims_video['videoID'] - key = ims_video['hash'] - - config = self._download_webpage( - 'http://www.escapistmagazine.com/videos/vidconfig.php', - video_id, 'Downloading video config', headers={ - 'Referer': url, - }, query={ - 'videoID': video_id, - 'hash': key, - }) - - data = self._parse_json(_decrypt_config(key, config), video_id) - - video_data = data['videoData'] - - title = clean_html(video_data['title']) - - formats = [{ - 'url': video['src'], - 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']), - 'height': int_or_none(video.get('res')), - } for video in data['files']['videos']] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage) or data.get('poster'), - 'description': self._og_search_description(webpage), - 'duration': float_or_none(video_data.get('duration'), 1000), - 'uploader': video_data.get('publisher'), - 'series': video_data.get('show'), - } diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index f4b0134ab8..4e9b63524e 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -100,13 +100,13 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', 'only_matching': True, - }, ] + }] def _real_extract(self, url): video_id = self._match_id(url) clip = self._download_json( - 'http://api-app.espn.com/v1/video/clips/%s' % video_id, + f'http://api-app.espn.com/v1/video/clips/{video_id}', video_id)['videos'][0] title = clip['headline'] @@ -115,16 +115,16 @@ def _real_extract(self, url): formats = [] def traverse_source(source, base_source_id=None): - for source_id, source in source.items(): - if source_id == 'alert': + for src_id, src_item in source.items(): + if src_id == 'alert': continue - elif 
    
    isinstance(source, str): - extract_source(source, base_source_id) - elif isinstance(source, dict): + isinstance(src_item, str): + extract_source(src_item, base_source_id) + elif isinstance(src_item, dict): traverse_source( - source, - '%s-%s' % (base_source_id, source_id) - if base_source_id else source_id) + src_item, + f'{base_source_id}-{src_id}' + if base_source_id else src_id) def extract_source(source_url, source_id=None): if source_url in format_urls: @@ -209,7 +209,7 @@ def _real_extract(self, url): webpage, 'video id', group='id') return self.url_result( - 'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key()) + f'http://espn.go.com/video/clip?id={video_id}', ESPNIE.ie_key()) class FiveThirtyEightIE(InfoExtractor): @@ -240,7 +240,7 @@ def _real_extract(self, url): class ESPNCricInfoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/(?:cricket-)?videos?/[^#$&?/]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', 'info_dict': { @@ -251,16 +251,28 @@ class ESPNCricInfoIE(InfoExtractor): 'upload_date': '20211113', 'duration': 96, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { + 'url': 'https://www.espncricinfo.com/cricket-videos/daryl-mitchell-mitchell-santner-is-one-of-the-best-white-ball-spinners-india-vs-new-zealand-1356225', + 'info_dict': { + 'id': '1356225', + 'ext': 'mp4', + 'description': '"Santner has done it for a long time for New Zealand - we\'re lucky to have him"', + 'upload_date': '20230128', + 'title': 'Mitchell: \'Santner is one of the best white-ball spinners at the moment\'', + 'duration': 87, + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={id}', id)['video'] + video_id = self._match_id(url) + data_json = self._download_json( + f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={video_id}', video_id)['video'] formats, subtitles = [], {} for item in data_json.get('playbacks') or []: if item.get('type') == 'HLS' and item.get('url'): - m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], id) + m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], video_id) formats.extend(m3u8_frmts) subtitles = self._merge_subtitles(subtitles, m3u8_subs) elif item.get('type') == 'AUDIO' and item.get('url'): @@ -269,7 +281,7 @@ def _real_extract(self, url): 'vcodec': 'none', }) return { - 'id': id, + 'id': video_id, 'title': data_json.get('title'), 'description': data_json.get('summary'), 'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))), @@ -355,28 +367,28 @@ def _real_extract(self, url): 'subject_token': assertion, 'subject_token_type': 'urn:bamtech:params:oauth:token-type:device', 'platform': 'android', - 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange', })['access_token'] assertion = self._call_bamgrid_api( 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, headers={ 'Authorization': token, - 'Content-Type': 'application/json; charset=UTF-8' + 'Content-Type': 'application/json; charset=UTF-8', })['assertion'] token = self._call_bamgrid_api( 'token', video_id, payload={ 
    
    'subject_token': assertion, 'subject_token_type': 'urn:bamtech:params:oauth:token-type:account', 'platform': 'android', - 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange', })['access_token'] playback = self._download_json( video_data['videoHref'].format(scenario='browser~ssai'), video_id, headers={ 'Accept': 'application/vnd.media-service+json; version=5', - 'Authorization': token + 'Authorization': token, }) m3u8_url, headers = playback['stream']['complete'][0]['url'], {'authorization': token} diff --git a/yt_dlp/extractor/esri.py b/yt_dlp/extractor/esri.py deleted file mode 100644 index 02e7efaf0d..0000000000 --- a/yt_dlp/extractor/esri.py +++ /dev/null @@ -1,70 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - parse_filesize, - unified_strdate, -) - - -class EsriVideoIE(InfoExtractor): - _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications', - 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc', - 'info_dict': { - 'id': '1124', - 'ext': 'mp4', - 'title': 'ArcGIS Online - Developing Applications', - 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 185, - 'upload_date': '20120419', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - formats = [] - for width, height, content in re.findall( - r'(?s)
    
    <li><strong>(\d+)x(\d+):</strong>(.+?)</li>
    
    ', webpage): - for video_url, ext, filesize in re.findall( - r'<a[^>]+href="([^"]+)">([^<]+)</a> \(([^<]+)\)', content): - formats.append({ - 'url': compat_urlparse.urljoin(url, video_url), - 'ext': ext.lower(), - 'format_id': '%s-%s' % (ext.lower(), height), - 'width': int(width), - 'height': int(height), - 'filesize_approx': parse_filesize(filesize), - }) - - title = self._html_search_meta('title', webpage, 'title') - description = self._html_search_meta( - 'description', webpage, 'description', fatal=False) - - thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False) - if thumbnail: - thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail) - - duration = int_or_none(self._search_regex( - [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"], - webpage, 'duration', fatal=False)) - - upload_date = unified_strdate(self._html_search_meta( - 'last-modified', webpage, 'upload date', fatal=False)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'upload_date': upload_date, - 'formats': formats - } diff --git a/yt_dlp/extractor/ettutv.py b/yt_dlp/extractor/ettutv.py new file mode 100644 index 0000000000..73671776f5 --- /dev/null +++ b/yt_dlp/extractor/ettutv.py @@ -0,0 +1,60 @@ +from .common import InfoExtractor +from ..utils import bool_or_none, traverse_obj, unified_timestamp, url_or_none + + +class EttuTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ettu\.tv/[^?#]+/playerpage/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.ettu.tv/en-int/playerpage/1573849', + 'md5': '5874b7639a2aa866d1f6c3a4037c7c09', + 'info_dict': { + 'id': '1573849', + 'title': 'Ni Xia Lian - Shao Jieni', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677348600, + 'upload_date': '20230225', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.ettu.tv/en-int/playerpage/1573753', + 'md5': '1fc094bf96cf2d5ec0f434d3a6dec9aa', + 'info_dict': { + 'id': '1573753', + 'title': 'Qiu Dang - Jorgic Darko', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677423600, + 'upload_date': '20230226', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_settings = self._download_json( + f'https://www.ettu.tv/api/v3/contents/{video_id}/player-settings', video_id, query={ + 'language': 'en', + 'showTitle': 'true', + 'device': 'desktop', + }) + + stream_response = self._download_json(player_settings['streamAccess'], video_id, data=b'') + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream_response['data']['stream'], video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(player_settings, { + 'title': 'title', + 'description': ('metaInformation', 'competition'), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('date', {unified_timestamp}), + 'is_live': ('isLivestream', {bool_or_none}), + }), + } diff --git a/yt_dlp/extractor/europa.py b/yt_dlp/extractor/europa.py index 29daabe4a3..aa8baf2f78 100644 --- a/yt_dlp/extractor/europa.py +++ b/yt_dlp/extractor/europa.py @@ -6,12 +6,14 @@ parse_iso8601, parse_qs, qualities, + traverse_obj, unified_strdate, - xpath_text + xpath_text, ) class EuropaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)' _TESTS = [{ 'url': 
    
    'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', @@ -26,7 +28,7 @@ class EuropaIE(InfoExtractor): 'duration': 34, 'view_count': int, 'formats': 'mincount:3', - } + }, }, { 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', 'only_matching': True, @@ -39,11 +41,11 @@ def _real_extract(self, url): video_id = self._match_id(url) playlist = self._download_xml( - 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id) + f'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID={video_id}', video_id) def get_item(type_, preference): items = {} - for item in playlist.findall('./info/%s/item' % type_): + for item in playlist.findall(f'./info/{type_}/item'): lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) if lang and label: items[lang] = label.strip() @@ -75,7 +77,7 @@ def get_item(type_, preference): 'url': video_url, 'format_id': lang, 'format_note': xpath_text(file_, './lglabel'), - 'language_preference': language_preference(lang) + 'language_preference': language_preference(lang), }) return { @@ -86,52 +88,28 @@ def get_item(type_, preference): 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, - 'formats': formats + 'formats': formats, } class EuroParlWebstreamIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:multimedia|webstreaming)\.europarl\.europa\.eu/[^/#?]+/ - (?:embed/embed\.html\?event=|(?!video)[^/#?]+/[\w-]+_)(?P<id>[\w-]+) + https?://multimedia\.europarl\.europa\.eu/ + (?:\w+/)?webstreaming/(?:[\w-]+_)?(?P<id>[\w-]+) ''' _TESTS = [{ 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', + 'display_id': '20220914-0900-PLENARY', 'ext': 'mp4', - 'release_timestamp': 1663137900, 'title': 'Plenary session', + 'release_timestamp': 1663139069, 'release_date': '20220914', }, 'params': { 'skip_download': True, - } - }, { - 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/eu-cop27-un-climate-change-conference-in-sharm-el-sheikh-egypt-ep-delegation-meets-with-ngo-represen_20221114-1600-SPECIAL-OTHER', - 'info_dict': { - 'id': 'a8428de8-b9cd-6a2e-11e4-3805d9c9ff5c', - 'ext': 'mp4', - 'release_timestamp': 1668434400, - 'release_date': '20221114', - 'title': 'md5:d3550280c33cc70e0678652e3d52c028', }, - 'params': { - 'skip_download': True, - } - }, { - # embed webpage - 'url': 'https://webstreaming.europarl.europa.eu/ep/embed/embed.html?event=20220914-0900-PLENARY&language=en&autoplay=true&logo=true', - 'info_dict': { - 'id': 'bcaa1db4-76ef-7e06-8da7-839bd0ad1dbe', - 'ext': 'mp4', - 'title': 'Plenary session', - 'release_date': '20220914', - 'release_timestamp': 1663137900, - }, - 'params': { - 'skip_download': True, - } }, { # live webstream 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', @@ -143,31 +121,70 @@ class EuroParlWebstreamIE(InfoExtractor): 'release_date': '20221115', 'live_status': 'is_live', }, - 'skip': 'not live anymore' + 'skip': 'not live anymore', + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', + 'info_dict': { + 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', + 'display_id': '20230301-1130-COMMITTEE-CULT', + 'ext': 'mp4', + 'release_date': '20230301', + 'title': 'Committee on Culture and Education', + 'release_timestamp': 
    
1677666641, + }, + }, { + # live stream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'info_dict': { + 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9', + 'ext': 'mp4', + 'release_date': '20230524', + 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', + 'release_timestamp': 1684911541, + 'live_status': 'is_live', + }, + 'skip': 'Not live anymore', + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/20240320-1345-SPECIAL-PRESSER', + 'info_dict': { + 'id': 'c1f11567-5b52-470a-f3e1-08dc3c216ace', + 'display_id': '20240320-1345-SPECIAL-PRESSER', + 'ext': 'mp4', + 'release_date': '20240320', + 'title': 'md5:7c6c814cac55dea5e2d87bf8d3db2234', + 'release_timestamp': 1710939767, + }, + }, { + 'url': 'https://multimedia.europarl.europa.eu/webstreaming/briefing-for-media-on-2024-european-elections_20240429-1000-SPECIAL-OTHER', + 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps'] json_info = self._download_json( - 'https://vis-api.vuplay.co.uk/event/external', display_id, + 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id, query={ - 'player_key': 'europarl|718f822c-a48c-4841-9947-c9cb9bb1743c', - 'external_id': display_id, + 'api-version': 1.0, + 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968', + 'externalReference': display_id, }) - formats, subtitles = self._extract_mpd_formats_and_subtitles(json_info['streaming_url'], display_id) - fmts, subs = self._extract_m3u8_formats_and_subtitles( - json_info['streaming_url'].replace('.mpd', '.m3u8'), display_id) - - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) + formats, subtitles = [], {} + for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')): + fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id) + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) return { 'id': json_info['id'], - 'title': json_info.get('title'), + 'display_id': display_id, + 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), 'formats': formats, 'subtitles': subtitles, - 'release_timestamp': parse_iso8601(json_info.get('published_start')), - 'is_live': 'LIVE' in json_info.get('state', '') + 'release_timestamp': parse_iso8601(json_info.get('startDateTime')), + 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live', } diff --git a/yt_dlp/extractor/europeantour.py b/yt_dlp/extractor/europeantour.py index 1995a745d0..a5503dbc52 100644 --- a/yt_dlp/extractor/europeantour.py +++ b/yt_dlp/extractor/europeantour.py @@ -17,16 +17,16 @@ class EuropeanTourIE(InfoExtractor): 'uploader_id': '5136026580001', 'tags': ['prod-imported'], 'thumbnail': 'md5:fdac52bc826548860edf8145ee74e71a', - 'upload_date': '20211220' + 'upload_date': '20211220', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) vid, aid = 
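# Illustrative sketch (not part of the patch): the rewritten
# EuroParlWebstreamIE accumulates formats and subtitles across every HLS
# playlist the FullMeeting API returns ('meetingVideo' for a single stream,
# 'meetingVideos' for a list). A plain-dict stand-in for the merge step:
def merge_subtitles(new, target):
    # extend each language's track list instead of overwriting it
    for lang, tracks in new.items():
        target.setdefault(lang, []).extend(tracks)

subtitles = {}
merge_subtitles({'en': [{'url': 'https://example.invalid/a.vtt'}]}, subtitles)
merge_subtitles({'en': [{'url': 'https://example.invalid/b.vtt'}]}, subtitles)
assert len(subtitles['en']) == 2  # tracks from both playlists survive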
re.search(r'(?s)brightcove-player\s?video-id="([^"]+)".*"ACCOUNT_ID":"([^"]+)"', webpage).groups() if not aid: aid = '5136026580001' diff --git a/yt_dlp/extractor/eurosport.py b/yt_dlp/extractor/eurosport.py index 654e112064..0c5e1238d9 100644 --- a/yt_dlp/extractor/eurosport.py +++ b/yt_dlp/extractor/eurosport.py @@ -3,7 +3,7 @@ class EurosportIE(InfoExtractor): - _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?Pvid\d+)' + _VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?Pvid\d+)' _TESTS = [{ 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', 'info_dict': { @@ -16,7 +16,7 @@ class EurosportIE(InfoExtractor): 'display_id': 'vid1694147', 'timestamp': 1654446698, 'upload_date': '20220605', - } + }, }, { 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/watch-the-top-five-shots-from-men-s-final-as-rafael-nadal-beats-casper-ruud-to-seal-14th-french-open_vid1694283/video.shtml', 'info_dict': { @@ -29,7 +29,7 @@ class EurosportIE(InfoExtractor): 'display_id': 'vid1694283', 'timestamp': 1654456090, 'upload_date': '20220605', - } + }, }, { # geo-fence but can bypassed by xff 'url': 'https://www.eurosport.com/cycling/tour-de-france-femmes/2022/incredible-ride-marlen-reusser-storms-to-stage-4-win-at-tour-de-france-femmes_vid1722221/video.shtml', @@ -43,7 +43,33 @@ class EurosportIE(InfoExtractor): 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/07/27/3423347-69852108-2560-1440.jpg', 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', 'upload_date': '20220727', - } + }, + }, { + 'url': 'https://www.eurosport.com/football/champions-league/2022-2023/pep-guardiola-emotionally-destroyed-after-manchester-city-win-over-bayern-munich-in-champions-league_vid1896254/video.shtml', + 'info_dict': { + 'id': '3096477', + 'ext': 'mp4', + 'title': 'md5:82edc17370124c7a19b3cf518517583b', + 'duration': 84.0, + 'description': 'md5:b3f44ef7f5b5b95b24a273b163083feb', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/04/12/3682873-74947393-2560-1440.jpg', + 'timestamp': 1681292028, + 'upload_date': '20230412', + 'display_id': 'vid1896254', + }, + }, { + 'url': 'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml', + 'info_dict': { + 'id': '3149108', + 'ext': 'mp4', + 'title': '\'Last year\'s semi-final pain was still there\' - Pep Guardiola after Man City reach CL final', + 'description': 'md5:89ef142fe0170a66abab77fac2955d8e', + 'display_id': 'vid1914115', + 'timestamp': 1684403618, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/05/18/3707254-75435008-2560-1440.jpg', + 'duration': 105.0, + 'upload_date': '20230518', + }, }] _TOKEN = None diff --git a/yt_dlp/extractor/euscreen.py b/yt_dlp/extractor/euscreen.py index 65a1dc7c50..f08938fc9b 100644 --- a/yt_dlp/extractor/euscreen.py +++ b/yt_dlp/extractor/euscreen.py @@ -1,8 +1,7 @@ from .common import InfoExtractor - from ..utils import ( - parse_duration, js_to_json, + parse_duration, ) @@ -21,37 +20,37 @@ class EUScreenIE(InfoExtractor): 'series': 'JA2 DERNIERE', 'episode': '-', 'uploader': 'INA / France', - 'thumbnail': 
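# Illustrative check (not part of the patch): the relaxed Eurosport
# _VALID_URL makes the '<competition>/<season>/' path segment optional, so
# newer '/football/<slug>_vidNNN' URLs match too (the named id group is
# restored here, since this snippet is standalone):
import re

VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?P<id>vid\d+)'
old = 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml'
new = 'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml'
assert re.match(VALID_URL, old).group('id') == 'vid1694147'
assert re.match(VALID_URL, new).group('id') == 'vid1914115'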
'http://images3.noterik.com/domain/euscreenxl/user/eu_ina/video/EUS_0EBCBF356BFC4E12A014023BA41BD98C/image.jpg' + 'thumbnail': 'http://images3.noterik.com/domain/euscreenxl/user/eu_ina/video/EUS_0EBCBF356BFC4E12A014023BA41BD98C/image.jpg', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] _payload = b'-1Win32MozillaNetscape5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36true784758undefinedSat, 07 Oct 2021 08:56:50 GMT1633769810758' def _real_extract(self, url): - id = self._match_id(url) + video_id = self._match_id(url) args_for_js_request = self._download_webpage( 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem', - id, data=self._payload, query={'actionlist': 'itempage', 'id': id}) + video_id, data=self._payload, query={'actionlist': 'itempage', 'id': video_id}) info_js = self._download_webpage( 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem', - id, data=args_for_js_request.replace('screenid', 'screenId').encode()) + video_id, data=args_for_js_request.replace('screenid', 'screenId').encode()) video_json = self._parse_json( self._search_regex(r'setVideo\(({.+})\)\(\$end\$\)put', info_js, 'Video JSON'), - id, transform_source=js_to_json) + video_id, transform_source=js_to_json) meta_json = self._parse_json( self._search_regex(r'setData\(({.+})\)\(\$end\$\)', info_js, 'Metadata JSON'), - id, transform_source=js_to_json) + video_id, transform_source=js_to_json) formats = [{ 'url': source['src'], } for source in video_json.get('sources', [])] return { - 'id': id, + 'id': video_id, 'title': meta_json.get('originalTitle'), 'alt_title': meta_json.get('title'), 'duration': parse_duration(meta_json.get('duration')), - 'description': '%s\n%s' % (meta_json.get('summaryOriginal', ''), meta_json.get('summaryEnglish', '')), + 'description': '{}\n{}'.format(meta_json.get('summaryOriginal', ''), meta_json.get('summaryEnglish', '')), 'series': meta_json.get('series') or meta_json.get('seriesEnglish'), 'episode': meta_json.get('episodeNumber'), 'uploader': meta_json.get('provider'), diff --git a/yt_dlp/extractor/expotv.py b/yt_dlp/extractor/expotv.py deleted file mode 100644 index bda6e3cb29..0000000000 --- a/yt_dlp/extractor/expotv.py +++ /dev/null @@ -1,74 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) - - -class ExpoTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?expotv\.com/videos/[^?#]*/(?P[0-9]+)($|[?#])' - _TEST = { - 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', - 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', - 'info_dict': { - 'id': '667916', - 'ext': 'mp4', - 'title': 'NYX Butter Lipstick Little Susie', - 'description': 'Goes on like butter, but looks better!', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Stephanie S.', - 'upload_date': '20150520', - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - player_key = self._search_regex( - r'Plays: ([0-9]+)', webpage, 'view counts')) - uploader = self._search_regex( - r'
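# Illustrative aside (not part of the patch): the EUScreen/EuropeanTour
# renames of 'id' to 'video_id' are not cosmetic; binding 'id' locally
# shadows the built-in, so any later id() call in the same scope breaks:
def shadowing_sketch():
    id = 'EUS_0EBCBF356BFC4E12A014023BA41BD98C'  # shadows builtins.id
    return id(object())  # would raise TypeError: 'str' object is not callable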
    \s*([^Reviewed on ([0-9/.]+)', webpage, 'upload date', - fatal=False), day_first=False) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - } diff --git a/yt_dlp/extractor/expressen.py b/yt_dlp/extractor/expressen.py index 86967b631b..33b829845b 100644 --- a/yt_dlp/extractor/expressen.py +++ b/yt_dlp/extractor/expressen.py @@ -11,8 +11,8 @@ class ExpressenIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:www\.)?(?:expressen|di)\.se/ - (?:(?:tvspelare/video|videoplayer/embed)/)? - tv/(?:[^/]+/)* + (?:(?:tvspelare/video|video-?player/embed)/)? + (?:tv|nyheter)/(?:[^/?#]+/)* (?P[^/?#&]+) ''' _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1'] @@ -42,6 +42,12 @@ class ExpressenIE(InfoExtractor): }, { 'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di', 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/video-player/embed/tv/nyheter/ekero-fodda-olof-gustafsson-forvaltar-knarkbaronen-pablo-escobars-namn', + 'only_matching': True, + }, { + 'url': 'https://www.expressen.se/nyheter/efter-egna-telefonbluffen-escobar-stammer-klarna/', + 'only_matching': True, }] def _real_extract(self, url): @@ -52,7 +58,7 @@ def _real_extract(self, url): def extract_data(name): return self._parse_json( self._search_regex( - r'data-%s=(["\'])(?P(?:(?!\1).)+)\1' % name, + rf'data-{name}=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'info', group='value'), display_id, transform_source=unescapeHTML) diff --git a/yt_dlp/extractor/extremetube.py b/yt_dlp/extractor/extremetube.py deleted file mode 100644 index 2c1969899e..0000000000 --- a/yt_dlp/extractor/extremetube.py +++ /dev/null @@ -1,48 +0,0 @@ -from ..utils import str_to_int -from .keezmovies import KeezMoviesIE - - -class ExtremeTubeIE(KeezMoviesIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P[^/#?&]+)' - _TESTS = [{ - 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '92feaafa4b58e82f261e5419f39c60cb', - 'info_dict': { - 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'ext': 'mp4', - 'title': 'Music Video 14 british euro brit european cumshots swallow', - 'uploader': 'anonim', - 'view_count': int, - 'age_limit': 18, - } - }, { - 'url': 'http://www.extremetube.com/gay/video/abcde-1234', - 'only_matching': True, - }, { - 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick', - 'only_matching': True, - }, { - 'url': 'http://www.extremetube.com/video/652431', - 'only_matching': True, - }] - - def _real_extract(self, url): - webpage, info = self._extract_info(url) - - if not info['title']: - info['title'] = self._search_regex( - r']+title="([^"]+)"[^>]*>', webpage, 'title') - - uploader = self._html_search_regex( - r'Uploaded by:\s*]+>\s*]+>(.+?)', - webpage, 'uploader', fatal=False) - view_count = str_to_int(self._search_regex( - r'Views:\s*]+>\s*<[^>]+>([\d,\.]+)[0-9]+) + (?Ppfbid[A-Za-z0-9]+|\d+) ''' _EMBED_REGEX = [ r']+?src=(["\'])(?Phttps?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', @@ -73,6 +73,22 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TAHOE_TEMPLATE = 
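# Illustrative check (not part of the patch): Facebook now issues opaque
# 'pfbid' post identifiers alongside the old numeric ones; the widened id
# group in FacebookIE._VALID_URL accepts both shapes:
import re

FB_ID = r'(?P<id>pfbid[A-Za-z0-9]+|\d+)'
assert re.fullmatch(FB_ID, '10204634152394104')
assert re.fullmatch(FB_ID, 'pfbid02a4AbCdEf')  # hypothetical token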
'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ + 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', + 'info_dict': { + 'id': '3676516585958356', + 'ext': 'mp4', + 'title': 'dr Adam Przygoda', + 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', + 'uploader': 'RADIO KICKS FM', + 'upload_date': '20230818', + 'timestamp': 1692346159, + 'thumbnail': r're:^https?://.*', + 'uploader_id': '100063551323670', + 'duration': 3132.184, + 'view_count': int, + 'concurrent_view_count': 0, + }, + }, { 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', 'info_dict': { @@ -90,16 +106,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt', - 'description': 'Asif Nawab Butt', + 'title': 'Asif', + 'description': '', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', + 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', + 'duration': 131.03, + 'concurrent_view_count': int, }, - 'expected_warnings': [ - 'title' - ] }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', @@ -151,7 +167,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '3f3798adb2b73423263e59376f1f5eb7', + 'md5': 'ca63897a90c9452efee5f8c40d080e25', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -162,6 +178,9 @@ class FacebookIE(InfoExtractor): 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', 'view_count': int, + 'uploader_id': '100059479812265', + 'concurrent_view_count': int, + 'duration': 44.478, }, }, { # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall @@ -170,18 +189,22 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео', + 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', 'description': 'Довгоочікуване відео', - 'timestamp': 1486648771, + 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': '100000948048708', + 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', + 'concurrent_view_count': int, + 'thumbnail': r're:^https?://.*', + 'view_count': int, + 'duration': 11736.446, }, 'params': { 'skip_download': True, }, }, { - # FIXME + # FIXME: Cannot parse data error 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', 'info_dict': { 'id': '1072691702860471', @@ -192,9 +215,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'La Guía Del Varón', 'thumbnail': r're:^https?://.*', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', @@ -208,9 +229,57 @@ class FacebookIE(InfoExtractor): 'uploader': 'Elisabeth Ahtn', 'uploader_id': '100013949973717', }, - 'params': { - 'skip_download': True, + 'skip': 'Requires logging in', + }, { + # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media + 'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/', + 'info_dict': { + 'id': '1569199726448814', + 'ext': 'mp4', + 'title': 'Pence MUST GO!', + 'description': 'Vickie Gentry shared a memory.', + 'timestamp': 1511548260, + 'upload_date': '20171124', + 'uploader': 'Vickie Gentry', + 'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl', + 'thumbnail': r're:^https?://.*', + 'duration': 148.435, }, + }, { + # data.node.comet_sections.content.story.attachments[].styles.attachment.media + 'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl', + 'info_dict': { + 'id': '6968553779868435', + 'ext': 'mp4', + 'description': 'md5:2f2fcf93e97ac00244fe64521bbdb0cb', + 'uploader': 'ATTN:', + 'upload_date': '20231207', + 'title': 'ATTN:', + 'duration': 132.675, + 'uploader_id': '100064451419378', + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'timestamp': 1701975646, + }, + }, { + # data.node.comet_sections.content.story.attachments[].styles.attachment.media + 'url': 'https://www.facebook.com/permalink.php?story_fbid=pfbid0fqQuVEQyXRa9Dp4RcaTR14KHU3uULHV1EK7eckNXSH63JMuoALsAvVCJ97zAGitil&id=100068861234290', + 'info_dict': { + 'id': '270103405756416', + 'ext': 'mp4', + 'title': 'Lela Evans', + 'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. 
What a proud moment as we all cheered and...', + 'thumbnail': r're:^https?://.*', + 'uploader': 'Lela Evans', + 'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl', + 'upload_date': '20231228', + 'timestamp': 1703804085, + 'duration': 394.347, + 'view_count': int, + }, + }, { + 'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552', + 'only_matching': True, }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -252,7 +321,11 @@ class FacebookIE(InfoExtractor): 'timestamp': 1527084179, 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', - 'uploader_id': '234218833769558', + 'uploader_id': '100066514874195', + 'duration': 4524.212, + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, }, 'params': { 'skip_download': True, @@ -262,8 +335,17 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 'info_dict': { 'id': '106560053808006', + 'ext': 'mp4', + 'title': 'Josef', + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, + 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', + 'timestamp': 1549275572, + 'duration': 3.413, + 'uploader': 'Josef Novak', + 'description': '', + 'upload_date': '20190204', }, - 'playlist_count': 2, }, { # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', @@ -276,6 +358,7 @@ class FacebookIE(InfoExtractor): 'id': '10157667649866271', }, 'playlist_count': 3, + 'skip': 'Requires logging in', }, { # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', @@ -312,14 +395,26 @@ class FacebookIE(InfoExtractor): }, 'playlist_count': 1, 'skip': 'Requires logging in', + }, { + # data.event.cover_media_renderer.cover_video + 'url': 'https://m.facebook.com/events/1509582499515440', + 'info_dict': { + 'id': '637246984455045', + 'ext': 'mp4', + 'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"', + 'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022', + 'thumbnail': r're:^https?://.*', + 'uploader': 'Comitato Liberi Pensatori', + 'uploader_id': '100065709540881', + }, }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' _api_config = { - 'graphURI': '/api/graphql/' + 'graphURI': '/api/graphql/', } def _perform_login(self, username, password): - login_page_req = sanitized_Request(self._LOGIN_URL) + login_page_req = Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', @@ -340,8 +435,8 @@ def _perform_login(self, username, password): 'timezone': '-60', 'trynum': '1', } - request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(self._LOGIN_URL, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' try: login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') @@ -350,7 +445,7 @@ def _perform_login(self, username, password): r'(?s)]+class=(["\']).*?login_error_box.*?\1[^>]*>]*>.*?
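# Illustrative sketch (not part of the patch): sanitized_Request is replaced
# by the networking framework's Request, whose headers behave as a plain
# mutable mapping (surface assumed here, trimmed to what the login flow
# actually touches):
from yt_dlp.networking import Request

req = Request('https://www.facebook.com/login.php', data=b'email=...&pass=...')
req.headers['Content-Type'] = 'application/x-www-form-urlencoded'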
    ]*>(?P.+?)', login_results, 'login error', default=None, group='error') if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError(f'Unable to login: {error}', expected=True) self.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.') return @@ -367,14 +462,14 @@ def _perform_login(self, username, password): 'h': h, 'name_action_selected': 'dont_save', } - check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) - check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_req = Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) + check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded' check_response = self._download_webpage(check_req, None, note='Confirming login') if re.search(r'id="checkpointSubmitButton"', check_response) is not None: self.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.') except network_exceptions as err: - self.report_warning('unable to log in: %s' % error_to_compat_str(err)) + self.report_warning(f'unable to log in: {err}') return def _extract_from_url(self, url, video_id): @@ -383,28 +478,31 @@ def _extract_from_url(self, url, video_id): def extract_metadata(webpage): post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( - r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] + r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) - uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} - page_title = title or self._html_search_regex(( r']*class="uiHeaderTitle"[^>]*>(?P[^<]*)', r'(?s)(?P.*?)', - self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'(?P<content>.+?)' + self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'(?P<content>.+?)', ), webpage, 'title', default=None, group='content') description = description or self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) + uploader_data = ( + get_first(media, ('owner', {dict})) + or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name'])) + or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) + or get_first(post, ('node', 'actors', ..., {dict})) + or get_first(post, ('event', 'event_creator', {dict})) or {}) uploader = uploader_data.get('name') or ( clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) or self._search_regex( (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False)) - timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) @@ -415,16 +513,17 @@ def extract_metadata(webpage): # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ if 
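# Illustrative sketch (not part of the patch): post metadata now ships in
# JSON blobs inside <script data-sjs> tags, nested under two levels of
# '__bbox' containers. The traversal path below is verbatim from the diff;
# the toy blob is hypothetical and heavily trimmed:
from yt_dlp.utils import traverse_obj

blob = {
    'require': [['ScheduledServerJS', 'handle', None, [
        {'__bbox': {'require': [['RelayPrefetchedStreamCache', None, None, [
            {'__bbox': {'result': {'data': {'id': '274175099429670'}}}},
        ]]]}},
    ]]],
}
post = traverse_obj([blob], (
    ..., 'require', ..., ..., ..., '__bbox', 'require',
    ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict)
assert post == [{'id': '274175099429670'}]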
thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): thumbnail = None - view_count = parse_count(self._search_regex( - r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', - default=None)) info_dict = { 'description': description, 'uploader': uploader, 'uploader_id': uploader_data.get('id'), 'timestamp': timestamp, 'thumbnail': thumbnail, - 'view_count': view_count, + 'view_count': parse_count(self._search_regex( + (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'), + webpage, 'view count', default=None)), + 'concurrent_view_count': get_first(post, ( + ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -456,47 +555,65 @@ def extract_from_jsmods_instances(js_data): js_data, lambda x: x['jsmods']['instances'], list) or []) def extract_dash_manifest(video, formats): - dash_manifest = video.get('dash_manifest') + dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str) if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), + mpd_url=video.get('dash_manifest_url'))) def process_formats(info): # Downloads with browser's User-Agent are rate limited. Working around # with non-browser User-Agent. for f in info['formats']: + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - info['_format_sort_fields'] = ('res', 'quality') + # Formats larger than ~500MB will return error 403 unless chunk size is regulated + f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20 + + def yield_all_relay_data(_filter): + for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage): + yield self._parse_json(relay_data, video_id, fatal=False) or {} def extract_relay_data(_filter): - return self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, - webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + return next(filter(None, yield_all_relay_data(_filter)), {}) - def extract_relay_prefetched_data(_filter): - replay_data = extract_relay_data(_filter) - for require in (replay_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + def extract_relay_prefetched_data(_filter, target_keys=None): + path = 'data' + if target_keys is not None: + path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys)) + return traverse_obj(yield_all_relay_data(_filter), ( + ..., 'require', (None, (..., ..., ..., '__bbox', 'require')), + lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v), + ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, - r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX + rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);', ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) if not video_data: 
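# Illustrative arithmetic (not part of the patch): the new downloader hint
# caps each HTTP request at 250 << 20 bytes, i.e. 250 MiB per chunk, which
# sidesteps the 403s Facebook returns on large unchunked downloads:
assert 250 << 20 == 250 * 2 ** 20 == 262_144_000

f = {}
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
assert f['downloader_options']['http_chunk_size'] == 262_144_000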
data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)', + target_keys=('video', 'event', 'nodes', 'node', 'mediaset')) if data: entries = [] def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + reel_info = traverse_obj( + video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) + if reel_info: + video = video['creation_story'] + video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) + video.update(reel_info) formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', '')): + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): playable_url = video.get(key) if not playable_url: continue @@ -505,19 +622,48 @@ def parse_graphql_video(video): else: formats.append({ 'format_id': format_id, - 'quality': q(format_id), + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, 'url': playable_url, }) extract_dash_manifest(video, formats) - v_id = video.get('videoId') or video.get('id') or video_id + if not formats: + # Do not append false positive entry w/o any formats + return + + automatic_captions, subtitles = {}, {} + is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool})) + for caption in traverse_obj(video, ( + 'video_available_captions_locales', + {lambda x: sorted(x, key=lambda c: c['locale'])}, + lambda _, v: url_or_none(v['captions_url']), + )): + lang = caption.get('localized_language') or 'und' + subs = { + 'url': caption['captions_url'], + 'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang), + } + if caption.get('localized_creation_method') or is_broadcast: + automatic_captions.setdefault(caption['locale'], []).append(subs) + else: + subtitles.setdefault(caption['locale'], []).append(subs) + captions_url = traverse_obj(video, ('captions_url', {url_or_none})) + if captions_url and not automatic_captions and not subtitles: + locale = self._html_search_meta( + ['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US') + (automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}] + info = { 'id': v_id, 'formats': formats, 'thumbnail': traverse_obj( video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), + 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), + 'automatic_captions': automatic_captions, + 'subtitles': subtitles, } process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) @@ -528,7 +674,7 @@ def parse_graphql_video(video): 'description': description, }) else: - info['title'] = description or 'Facebook video #%s' % v_id + info['title'] = description or f'Facebook video #{v_id}' entries.append(info) def parse_attachment(attachment, key='media'): @@ -539,9 +685,11 @@ def parse_attachment(attachment, key='media'): nodes = variadic(traverse_obj(data, 'nodes', 'node') 
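# Illustrative sketch (not part of the patch): yt_dlp.utils.qualities turns
# a worst-to-best list into a ranking function; parse_graphql_video then
# subtracts 3 so resolution-less sd/hd progressive URLs always sort below
# the DASH formats extracted alongside them:
from yt_dlp.utils import qualities

q = qualities(['sd', 'hd'])
assert (q('sd'), q('hd'), q('unknown')) == (0, 1, -1)
assert q('hd') - 3 == -2 and q('sd') - 3 == -3  # both now rank below 0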
or []) attachments = traverse_obj(nodes, ( ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', - ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or [] + ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')), + 'attachment', {dict})) for attachment in attachments: - ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}), + ('target', 'attachments', ..., 'styles', 'attachment', {dict})) for n in ns: parse_attachment(n) parse_attachment(attachment) @@ -550,11 +698,12 @@ def parse_attachment(attachment, key='media'): for edge in edges: parse_attachment(edge, key='node') - video = data.get('video') or {} + video = traverse_obj(data, ( + 'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {} if video: attachments = try_get(video, [ lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'] + lambda x: x['creation_story']['attachments'], ], list) or [] for attachment in attachments: parse_attachment(attachment) @@ -564,18 +713,21 @@ def parse_attachment(attachment, key='media'): if len(entries) > 1: return self.playlist_result(entries, video_id) - video_info = entries[0] + video_info = entries[0] if entries else {'id': video_id} webpage_info = extract_metadata(webpage) # honor precise duration in video info if video_info.get('duration'): webpage_info['duration'] = video_info['duration'] + # preserve preferred_thumbnail in video info + if video_info.get('thumbnail'): + webpage_info['thumbnail'] = video_info['thumbnail'] return merge_dicts(webpage_info, video_info) if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
<div>(.*?)</div>
    ', webpage) if m_msg is not None: raise ExtractorError( - 'The video is not available, Facebook said: "%s"' % m_msg.group(1), + f'The video is not available, Facebook said: "{m_msg.group(1)}"', expected=True) elif any(p in webpage for p in ( '>You must log in to continue', @@ -612,7 +764,7 @@ def parse_attachment(attachment, key='media'): v_id = video.get('id') if not v_id: continue - v_id = compat_str(v_id) + v_id = str(v_id) entries.append(self.url_result( self._VIDEO_PAGE_TEMPLATE % v_id, self.ie_key(), v_id, video.get('name'))) @@ -670,16 +822,18 @@ def parse_attachment(attachment, key='media'): continue for quality in ('sd', 'hd'): for src_type in ('src', 'src_no_ratelimit'): - src = f[0].get('%s_%s' % (quality, src_type)) + src = f[0].get(f'{quality}_{src_type}') if src: - preference = -10 if format_id == 'progressive' else -1 + # sd, hd formats w/o resolution info should be deprioritized below DASH + # TODO: investigate if progressive or src formats still exist + preference = -10 if format_id == 'progressive' else -3 if quality == 'hd': - preference += 5 + preference += 1 formats.append({ - 'format_id': '%s_%s_%s' % (format_id, quality, src_type), + 'format_id': f'{format_id}_{quality}_{src_type}', 'url': src, 'quality': preference, - 'height': 720 if quality == 'hd' else None + 'height': 720 if quality == 'hd' else None, }) extract_dash_manifest(f[0], formats) subtitles_src = f[0].get('subtitles_src') @@ -729,7 +883,7 @@ class FacebookPluginsVideoIE(InfoExtractor): def _real_extract(self, url): return self.url_result( - compat_urllib_parse_unquote(self._match_id(url)), + urllib.parse.unquote(self._match_id(url)), FacebookIE.ie_key()) @@ -778,22 +932,158 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'md5': 'f13dd37f2633595982db5ed8765474d3', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', - 'description': 'md5:24ea7ef062215d295bdde64e778f5474', - 'uploader': 'Beast Camp Training', - 'uploader_id': '1738535909799870', - 'duration': 9.536, - 'thumbnail': r're:^https?://.*', + 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', + 'description': 'md5:22f03309b216ac84720183961441d8db', + 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'uploader_id': '100040874179269', + 'duration': 9.579, + 'timestamp': 1637502609, 'upload_date': '20211121', - 'timestamp': 1637502604, - } + 'thumbnail': r're:^https?://.*', + }, }] def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) + + +class FacebookAdsIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P\d+)' + IE_NAME = 'facebook:ads' + + _TESTS = [{ + 'url': 'https://www.facebook.com/ads/library/?id=899206155126718', + 'info_dict': { + 'id': '899206155126718', + 'ext': 'mp4', + 'title': 'video by Kandao', + 'description': 'md5:0822724069e3aca97cbed5dabbab282e', + 'uploader': 'Kandao', + 'uploader_id': '774114102743284', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1702548330, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20231214', + 'like_count': int, + }, + }, { + # key 'watermarked_video_sd_url' missing + 'url': 'https://www.facebook.com/ads/library/?id=501152689226254', + 'info_dict': { + 'id': '501152689226254', + 'ext': 'mp4', + 'title': 'video by mat.nawrocki', + 'description': 
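# Illustrative arithmetic (not part of the patch): the legacy sd/hd source
# handling follows the same idea; progressive sources land at -10/-9 and the
# rest at -3/-2, always below DASH:
def legacy_quality(format_id, quality):
    preference = -10 if format_id == 'progressive' else -3
    if quality == 'hd':
        preference += 1  # hd now only narrowly outranks sd (was +5)
    return preference

assert legacy_quality('progressive', 'hd') == -9
assert legacy_quality('tahoe', 'sd') == -3  # 'tahoe' is a hypothetical id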
'md5:02a446ace7ff8c3c37a2892922492490', + 'uploader': 'mat.nawrocki', + 'uploader_id': '148586968341456', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1723452305, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20240812', + 'like_count': int, + }, + }, { + 'url': 'https://www.facebook.com/ads/library/?id=893637265423481', + 'info_dict': { + 'id': '893637265423481', + 'title': 'Jusqu\u2019\u00e0 -25% sur une s\u00e9lection de vins p\u00e9tillants italiens ', + 'uploader': 'Eataly Paris Marais', + 'uploader_id': '2086668958314152', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1703571529, + 'upload_date': '20231226', + 'like_count': int, + }, + 'playlist_count': 3, + }, { + 'url': 'https://es-la.facebook.com/ads/library/?id=901230958115569', + 'only_matching': True, + }, { + 'url': 'https://m.facebook.com/ads/library/?id=901230958115569', + 'only_matching': True, + }] + + _FORMATS_MAP = { + 'watermarked_video_sd_url': ('sd-wmk', 'SD, watermarked'), + 'video_sd_url': ('sd', None), + 'watermarked_video_hd_url': ('hd-wmk', 'HD, watermarked'), + 'video_hd_url': ('hd', None), + } + + def _extract_formats(self, video_dict): + formats = [] + for format_key, format_url in traverse_obj(video_dict, ( + {dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1]), + )): + formats.append({ + 'format_id': self._FORMATS_MAP[format_key][0], + 'format_note': self._FORMATS_MAP[format_key][1], + 'url': format_url, + 'ext': 'mp4', + 'quality': qualities(tuple(self._FORMATS_MAP))(format_key), + }) + return formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + post_data = traverse_obj( + re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})) + data = get_first(post_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., + 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) + if not data: + raise ExtractorError('Unable to extract ad data') + + title = data.get('title') + if not title or title == '{{product.name}}': + title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data) + markup_id = traverse_obj(data, ('body', '__m', {str})) + markup = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id), + ..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any)) + + info_dict = merge_dicts({ + 'title': title, + 'description': markup or None, + }, traverse_obj(data, { + 'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}), + 'uploader': ('page_name', {str}), + 'uploader_id': ('page_id', {str_or_none}), + 'uploader_url': ('page_profile_uri', {url_or_none}), + 'timestamp': ('creation_time', {int_or_none}), + 'like_count': ('page_like_count', {int_or_none}), + })) + + entries = [] + for idx, entry in enumerate(traverse_obj( + data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1, + ): + entries.append({ + 'id': f'{video_id}_{idx}', + 'title': entry.get('title') or title, + 'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'), + 'thumbnail': url_or_none(entry.get('video_preview_image_url')), + 'formats': self._extract_formats(entry), + }) + + if len(entries) == 1: + info_dict.update(entries[0]) + + elif len(entries) > 1: + info_dict.update({ + 'title': entries[0]['title'], + 'entries': entries, + '_type': 
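# Illustrative sketch (not part of the patch): FacebookAdsIE._extract_formats
# walks the candidate dict as (key, value) pairs via the {dict.items}
# traversal step, keeping only known format keys whose value is a valid URL:
from yt_dlp.utils import traverse_obj, url_or_none

FORMATS_MAP = {'video_sd_url': ('sd', None), 'video_hd_url': ('hd', None)}
video = {  # hypothetical snapshot entry
    'video_sd_url': 'https://example.invalid/sd.mp4',
    'video_hd_url': '',  # empty -> dropped by url_or_none
    'title': 'video by Kandao',
}
pairs = traverse_obj(video, (
    {dict.items}, lambda _, v: v[0] in FORMATS_MAP and url_or_none(v[1])))
assert pairs == [('video_sd_url', 'https://example.invalid/sd.mp4')]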
'playlist', + }) + + info_dict['id'] = video_id + + return info_dict diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py index 1b5db818a1..1b1ed3956b 100644 --- a/yt_dlp/extractor/fancode.py +++ b/yt_dlp/extractor/fancode.py @@ -1,15 +1,9 @@ from .common import InfoExtractor - -from ..compat import compat_str -from ..utils import ( - parse_iso8601, - ExtractorError, - try_get, - mimetype2ext -) +from ..utils import ExtractorError, mimetype2ext, parse_iso8601, try_get class FancodeVodIE(InfoExtractor): + _WORKING = False IE_NAME = 'fancode:vod' _VALID_URL = r'https?://(?:www\.)?fancode\.com/video/(?P[0-9]+)\b' @@ -24,12 +18,12 @@ class FancodeVodIE(InfoExtractor): 'ext': 'mp4', 'title': 'Match Preview: PBKS vs MI', 'thumbnail': r're:^https?://.*\.jpg$', - "timestamp": 1619081590, + 'timestamp': 1619081590, 'view_count': int, 'like_count': int, 'upload_date': '20210422', - 'uploader_id': '6008340455001' - } + 'uploader_id': '6008340455001', + }, }, { 'url': 'https://fancode.com/video/15043', 'only_matching': True, @@ -58,14 +52,14 @@ def _perform_login(self, username, password): "refreshToken":"%s" }, "operationName":"RefreshToken" - }''' % password + }''' % password # noqa: UP031 - token_json = self.download_gql('refresh token', data, "Getting the Access token") + token_json = self.download_gql('refresh token', data, 'Getting the Access token') self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken']) if self._ACCESS_TOKEN is None: self.report_warning('Failed to get Access token') else: - self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN}) + self.headers.update({'Authorization': f'Bearer {self._ACCESS_TOKEN}'}) def _check_login_required(self, is_available, is_premium): msg = None @@ -97,12 +91,12 @@ def _real_extract(self, url): } }, "operationName":"Video" - }''' % video_id + }''' % video_id # noqa: UP031 metadata_json = self.download_gql(video_id, data, note='Downloading metadata') media = try_get(metadata_json, lambda x: x['data']['media'], dict) or {} - brightcove_video_id = try_get(media, lambda x: x['mediaSource']['brightcove'], compat_str) + brightcove_video_id = try_get(media, lambda x: x['mediaSource']['brightcove'], str) if brightcove_video_id is None: raise ExtractorError('Unable to extract brightcove Video ID') @@ -126,6 +120,7 @@ def _real_extract(self, url): class FancodeLiveIE(FancodeVodIE): # XXX: Do not subclass from concrete IE + _WORKING = False IE_NAME = 'fancode:live' _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P[0-9]+).+' @@ -136,11 +131,11 @@ class FancodeLiveIE(FancodeVodIE): # XXX: Do not subclass from concrete IE 'id': '35328', 'ext': 'mp4', 'title': 'BUB vs BLB', - "timestamp": 1624863600, + 'timestamp': 1624863600, 'is_live': True, 'upload_date': '20210628', }, - 'skip': 'Ended' + 'skip': 'Ended', }, { 'url': 'https://fancode.com/match/35328/', 'only_matching': True, @@ -151,7 +146,7 @@ class FancodeLiveIE(FancodeVodIE): # XXX: Do not subclass from concrete IE def _real_extract(self, url): - id = self._match_id(url) + video_id = self._match_id(url) data = '''{ "query":"query MatchResponse($id: Int\\u0021, $isLoggedIn: Boolean\\u0021) { match: matchWithScores(id: $id) { id matchDesc mediaId videoStreamId videoStreamUrl { ...VideoSource } liveStreams { videoStreamId videoStreamUrl { ...VideoSource } contentId } name startTime streamingStatus isPremium isUserEntitled @include(if: $isLoggedIn) status metaTags bgImage { src } sport { name slug } tour { id name } squads { name shortName 
} liveStreams { contentId } mediaId }}fragment VideoSource on VideoSource { title description posterUrl url deliveryType playerType}", "variables":{ @@ -159,21 +154,21 @@ def _real_extract(self, url): "isLoggedIn":true }, "operationName":"MatchResponse" - }''' % id + }''' % video_id # noqa: UP031 - info_json = self.download_gql(id, data, "Info json") + info_json = self.download_gql(video_id, data, 'Info json') match_info = try_get(info_json, lambda x: x['data']['match']) - if match_info.get('streamingStatus') != "STARTED": + if match_info.get('streamingStatus') != 'STARTED': raise ExtractorError('The stream can\'t be accessed', expected=True) self._check_login_required(match_info.get('isUserEntitled'), True) # all live streams are premium only return { - 'id': id, + 'id': video_id, 'title': match_info.get('name'), - 'formats': self._extract_akamai_formats(try_get(match_info, lambda x: x['videoStreamUrl']['url']), id), + 'formats': self._extract_akamai_formats(try_get(match_info, lambda x: x['videoStreamUrl']['url']), video_id), 'ext': mimetype2ext(try_get(match_info, lambda x: x['videoStreamUrl']['deliveryType'])), 'is_live': True, - 'release_timestamp': parse_iso8601(match_info.get('startTime')) + 'release_timestamp': parse_iso8601(match_info.get('startTime')), } diff --git a/yt_dlp/extractor/fathom.py b/yt_dlp/extractor/fathom.py new file mode 100644 index 0000000000..1df7d96fe8 --- /dev/null +++ b/yt_dlp/extractor/fathom.py @@ -0,0 +1,54 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + float_or_none, + get_element_html_by_id, + parse_iso8601, +) +from ..utils.traversal import traverse_obj + + +class FathomIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fathom\.video/share/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://fathom.video/share/G9mkjkspnohVVZ_L5nrsoPycyWcB8y7s', + 'md5': '0decd5343b8f30ae268625e79a02b60f', + 'info_dict': { + 'id': '47200596', + 'ext': 'mp4', + 'title': 'eCom Inucbator - Coaching Session', + 'duration': 8125.380507, + 'timestamp': 1699048914, + 'upload_date': '20231103', + }, + }, { + 'url': 'https://fathom.video/share/mEws3bybftHL2QLymxYEDeE21vtLxGVm', + 'md5': '4f5cb382126c22d1aba8a939f9c49690', + 'info_dict': { + 'id': '46812957', + 'ext': 'mp4', + 'title': 'Jon, Lawrence, Neman chat about practice', + 'duration': 3571.517847, + 'timestamp': 1698933600, + 'upload_date': '20231102', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + props = traverse_obj( + get_element_html_by_id('app', webpage), ({extract_attributes}, 'data-page', {json.loads}, 'props')) + video_id = str(props['call']['id']) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(props['call']['video_url'], video_id, 'mp4'), + **traverse_obj(props, { + 'title': ('head', 'title', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('call', 'started_at', {parse_iso8601}), + }), + } diff --git a/yt_dlp/extractor/faz.py b/yt_dlp/extractor/faz.py index bca62add9f..796bac3c31 100644 --- a/yt_dlp/extractor/faz.py +++ b/yt_dlp/extractor/faz.py @@ -3,9 +3,9 @@ from .common import InfoExtractor from ..compat import compat_etree_fromstring from ..utils import ( + int_or_none, xpath_element, xpath_text, - int_or_none, ) diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py index dd5e088fc1..eac70f6a96 100644 --- a/yt_dlp/extractor/fc2.py +++ b/yt_dlp/extractor/fc2.py @@ -1,13 +1,11 @@ import re +import urllib.parse from .common 
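# Illustrative sketch (not part of the patch): Fathom serves an Inertia-style
# page whose state rides in the root element's data-page attribute; a single
# traversal chain peels tag -> attribute -> JSON -> props (toy markup below):
import json

from yt_dlp.utils import extract_attributes, traverse_obj

element = '<div id="app" data-page=\'{"props": {"call": {"id": 47200596}}}\'></div>'
props = traverse_obj(element, ({extract_attributes}, 'data-page', {json.loads}, 'props'))
assert props == {'call': {'id': 47200596}}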
import InfoExtractor -from ..compat import compat_parse_qs -from ..dependencies import websockets +from ..networking import Request from ..utils import ( ExtractorError, - WebSocketsWrapper, js_to_json, - sanitized_Request, traverse_obj, update_url_query, urlencode_postdata, @@ -57,7 +55,7 @@ def _login(self): } login_data = urlencode_postdata(login_form_strs) - request = sanitized_Request( + request = Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in') @@ -66,7 +64,7 @@ def _login(self): return False # this is also needed - login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done') + login_redir = Request('http://id.fc2.com/?mode=redirect&login=done') self._download_webpage( login_redir, None, note='Login redirect', errnote='Login redirect failed') @@ -94,7 +92,7 @@ def _real_extract(self, url): description = self._og_search_description(webpage, default=None) vidplaylist = self._download_json( - 'https://video.fc2.com/api/v3/videoplaylist/%s?sh=1&fs=0' % video_id, video_id, + f'https://video.fc2.com/api/v3/videoplaylist/{video_id}?sh=1&fs=0', video_id, note='Downloading info page') vid_url = traverse_obj(vidplaylist, ('playlist', 'nq')) if not vid_url: @@ -129,22 +127,22 @@ class FC2EmbedIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) - query = compat_parse_qs(mobj.group('query')) + query = urllib.parse.parse_qs(mobj.group('query')) video_id = query['i'][-1] - title = query.get('tl', ['FC2 video %s' % video_id])[0] + title = query.get('tl', [f'FC2 video {video_id}'])[0] sj = query.get('sj', [None])[0] thumbnail = None if sj: # See thumbnailImagePath() in ServerConst.as of flv2.swf - thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( + thumbnail = 'http://video{}-thumbnail.fc2.com/up/pic/{}.jpg'.format( sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) return { '_type': 'url_transparent', 'ie_key': FC2IE.ie_key(), - 'url': 'fc2:%s' % video_id, + 'url': f'fc2:{video_id}', 'title': title, 'thumbnail': thumbnail, } @@ -167,10 +165,8 @@ class FC2LiveIE(InfoExtractor): }] def _real_extract(self, url): - if not websockets: - raise ExtractorError('websockets library is not available. 
Please install it.', expected=True) video_id = self._match_id(url) - webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id) + webpage = self._download_webpage(f'https://live.fc2.com/{video_id}/', video_id) self._set_cookie('live.fc2.com', 'js-player_size', '1') @@ -179,7 +175,7 @@ def _real_extract(self, url): 'channel': '1', 'profile': '1', 'user': '1', - 'streamid': video_id + 'streamid': video_id, }), note='Requesting member info') control_server = self._download_json( @@ -199,13 +195,9 @@ def _real_extract(self, url): ws_url = update_url_query(control_server['url'], {'control_token': control_server['control_token']}) playlist_data = None - self.to_screen('%s: Fetching HLS playlist info via WebSocket' % video_id) - ws = WebSocketsWrapper(ws_url, { - 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], + ws = self._request_webpage(Request(ws_url, headers={ 'Origin': 'https://live.fc2.com', - 'Accept': '*/*', - 'User-Agent': self.get_param('http_headers')['User-Agent'], - }) + }), video_id, note='Fetching HLS playlist info via WebSocket') self.write_debug('Sending HLS server request') @@ -232,7 +224,7 @@ def _real_extract(self, url): self.write_debug('Goodbye') playlist_data = data break - self.write_debug('Server said: %s%s' % (recv[:100], '...' if len(recv) > 100 else '')) + self.write_debug('Server said: {}{}'.format(recv[:100], '...' if len(recv) > 100 else '')) if not playlist_data: raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') diff --git a/yt_dlp/extractor/fczenit.py b/yt_dlp/extractor/fczenit.py index 8175b6b0f7..b2dbb92d5e 100644 --- a/yt_dlp/extractor/fczenit.py +++ b/yt_dlp/extractor/fczenit.py @@ -1,7 +1,7 @@ from .common import InfoExtractor from ..utils import ( - int_or_none, float_or_none, + int_or_none, ) diff --git a/yt_dlp/extractor/fifa.py b/yt_dlp/extractor/fifa.py index 8b4db3a8ae..ae837f6a02 100644 --- a/yt_dlp/extractor/fifa.py +++ b/yt_dlp/extractor/fifa.py @@ -1,5 +1,4 @@ from .common import InfoExtractor - from ..utils import ( int_or_none, traverse_obj, @@ -8,7 +7,7 @@ class FifaIE(InfoExtractor): - _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' + _VALID_URL = r'https?://www\.fifa\.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' _TESTS = [{ 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', 'info_dict': { diff --git a/yt_dlp/extractor/filmmodu.py b/yt_dlp/extractor/filmmodu.py deleted file mode 100644 index 9eb550eed5..0000000000 --- a/yt_dlp/extractor/filmmodu.py +++ /dev/null @@ -1,69 +0,0 @@ -from .common import InfoExtractor -from ..utils import int_or_none - - -class FilmmoduIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?filmmodu.org/(?P[^/]+-(?:turkce-dublaj-izle|altyazili-izle))' - _TESTS = [{ - 'url': 'https://www.filmmodu.org/f9-altyazili-izle', - 'md5': 'aeefd955c2a508a5bdaa3bcec8eeb0d4', - 'info_dict': { - 'id': '10804', - 'ext': 'mp4', - 'title': 'F9', - 'description': 'md5:2713f584a4d65afa2611e2948d0b953c', - 'subtitles': { - 'tr': [{ - 'ext': 'vtt', - }], - }, - 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/10804/xXHZeb1yhJvnSHPzZDqee0zfMb6.jpg', - }, - }, { - 'url': 'https://www.filmmodu.org/the-godfather-turkce-dublaj-izle', - 'md5': '109f2fcb9c941330eed133971c035c00', - 'info_dict': { - 'id': '3646', - 'ext': 'mp4', - 'title': 'Baba', - 'description': 'md5:d43fd651937cd75cc650883ebd8d8461', - 'thumbnail': r're:https://s[0-9]+.filmmodu.org/uploads/movie/cover/3646/6xKCYgH16UuwEGAyroLU6p8HLIn.jpg', - }, 
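# Illustrative check (not part of the patch): the FifaIE pattern previously
# left its dots unescaped, so the host part could match look-alike strings;
# escaping pins it to the literal domain:
import re

loose = r'https?://www.fifa.com/'
strict = r'https?://www\.fifa\.com/'
assert re.match(loose, 'https://wwwXfifaYcom/')  # unintended match
assert not re.match(strict, 'https://wwwXfifaYcom/')
assert re.match(strict, 'https://www.fifa.com/')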
- }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage, fatal=True) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - real_video_id = self._search_regex(r'var\s*videoId\s*=\s*\'([0-9]+)\'', webpage, 'video_id') - video_type = self._search_regex(r'var\s*videoType\s*=\s*\'([a-z]+)\'', webpage, 'video_type') - data = self._download_json('https://www.filmmodu.org/get-source', real_video_id, query={ - 'movie_id': real_video_id, - 'type': video_type, - }) - formats = [{ - 'url': source['src'], - 'ext': 'mp4', - 'format_id': source['label'], - 'height': int_or_none(source.get('res')), - 'protocol': 'm3u8_native', - } for source in data['sources']] - - subtitles = {} - - if data.get('subtitle'): - subtitles['tr'] = [{ - 'url': data['subtitle'], - }] - - return { - 'id': real_video_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'subtitles': subtitles, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/filmon.py b/yt_dlp/extractor/filmon.py index 9a93cb9840..af1de7ac83 100644 --- a/yt_dlp/extractor/filmon.py +++ b/yt_dlp/extractor/filmon.py @@ -1,13 +1,10 @@ from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, + int_or_none, qualities, strip_or_none, - int_or_none, - ExtractorError, ) @@ -37,12 +34,12 @@ def _real_extract(self, url): try: response = self._download_json( - 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, + f'https://www.filmon.com/api/vod/movie?id={video_id}', video_id)['response'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] - raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['reason'] + raise ExtractorError(f'{self.IE_NAME} said: {errmsg}', expected=True) raise title = response['title'] @@ -124,12 +121,12 @@ def _real_extract(self, url): channel_data = self._download_json( 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] - raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), channel_id)['message'] + raise ExtractorError(f'{self.IE_NAME} said: {errmsg}', expected=True) raise - channel_id = compat_str(channel_data['id']) + channel_id = str(channel_data['id']) is_live = not channel_data.get('is_vod') and not channel_data.get('is_vox') title = channel_data['title'] @@ -157,7 +154,7 @@ def _real_extract(self, url): for name, width, height in self._THUMBNAIL_RES: thumbnails.append({ 'id': name, - 'url': 'http://static.filmon.com/assets/channels/%s/%s.png' % (channel_id, name), + 'url': f'http://static.filmon.com/assets/channels/{channel_id}/{name}.png', 'width': width, 'height': height, }) diff --git a/yt_dlp/extractor/filmweb.py b/yt_dlp/extractor/filmweb.py index cfea1f2fb6..6dde6c3f00 100644 --- a/yt_dlp/extractor/filmweb.py +++ b/yt_dlp/extractor/filmweb.py @@ -14,7 +14,7 @@ class FilmwebIE(InfoExtractor): 
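# Illustrative sketch (not part of the patch): under the networking
# framework the error body hangs off e.cause.response rather than e.cause
# itself, which is the shape the FilmOn handlers now rely on:
from yt_dlp.networking.exceptions import HTTPError
from yt_dlp.utils import ExtractorError

def api_error_reason(e):
    # return the server-supplied reason, or None for non-HTTP causes
    if isinstance(e.cause, HTTPError):
        return e.cause.response.read().decode()
    return None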
'timestamp': 1458140101, 'uploader_id': '12639966', 'uploader': 'Live Roaldset', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/firsttv.py b/yt_dlp/extractor/firsttv.py index f74bd132f5..2d47ee561f 100644 --- a/yt_dlp/extractor/firsttv.py +++ b/yt_dlp/extractor/firsttv.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( int_or_none, qualities, @@ -60,12 +58,12 @@ def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - playlist_url = compat_urlparse.urljoin(url, self._search_regex( + playlist_url = urllib.parse.urljoin(url, self._search_regex( r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'playlist url', group='url')) - parsed_url = compat_urlparse.urlparse(playlist_url) - qs = compat_urlparse.parse_qs(parsed_url.query) + parsed_url = urllib.parse.urlparse(playlist_url) + qs = urllib.parse.parse_qs(parsed_url.query) item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') items = self._download_json(playlist_url, display_id) @@ -73,12 +71,12 @@ def _real_extract(self, url): if item_ids: items = [ item for item in items - if item.get('uid') and compat_str(item['uid']) in item_ids] + if item.get('uid') and str(item['uid']) in item_ids] else: items = [items[0]] entries = [] - QUALITIES = ('ld', 'sd', 'hd', ) + QUALITIES = ('ld', 'sd', 'hd') for item in items: title = item['title'] @@ -116,11 +114,10 @@ def _real_extract(self, url): if len(formats) == 1: m3u8_path = ',' else: - tbrs = [compat_str(t) for t in sorted(f['tbr'] for f in formats)] - m3u8_path = '_,%s,%s' % (','.join(tbrs), '.mp4') + tbrs = [str(t) for t in sorted(f['tbr'] for f in formats)] + m3u8_path = '_,{},{}'.format(','.join(tbrs), '.mp4') formats.extend(self._extract_m3u8_formats( - 'http://balancer-vod.1tv.ru/%s%s.urlset/master.m3u8' - % (path, m3u8_path), + f'http://balancer-vod.1tv.ru/{path}{m3u8_path}.urlset/master.m3u8', display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) @@ -131,12 +128,12 @@ def _real_extract(self, url): 'ya:ovs:upload_date', webpage, 'upload date', default=None)) entries.append({ - 'id': compat_str(item.get('id') or item['uid']), + 'id': str(item.get('id') or item['uid']), 'thumbnail': thumbnail, 'title': title, 'upload_date': upload_date, 'duration': int_or_none(duration), - 'formats': formats + 'formats': formats, }) title = self._html_search_regex( diff --git a/yt_dlp/extractor/flextv.py b/yt_dlp/extractor/flextv.py new file mode 100644 index 0000000000..f3d3eff85f --- /dev/null +++ b/yt_dlp/extractor/flextv.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + UserNotLive, + parse_iso8601, + str_or_none, + traverse_obj, + url_or_none, +) + + +class FlexTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?flextv\.co\.kr/channels/(?P<id>\d+)/live' + _TESTS = [{ + 'url': 'https://www.flextv.co.kr/channels/231638/live', + 'info_dict': { + 'id': '231638', + 'ext': 'mp4', + 'title': r're:^214하나만\.\.\. 
', + 'thumbnail': r're:^https?://.+\.jpg', + 'upload_date': r're:\d{8}', + 'timestamp': int, + 'live_status': 'is_live', + 'channel': 'Hi별', + 'channel_id': '244396', + }, + 'skip': 'The channel is offline', + }, { + 'url': 'https://www.flextv.co.kr/channels/746/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + try: + stream_data = self._download_json( + f'https://api.flextv.co.kr/api/channels/{channel_id}/stream', + channel_id, query={'option': 'all'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise UserNotLive(video_id=channel_id) + raise + + playlist_url = stream_data['sources'][0]['url'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + playlist_url, channel_id, 'mp4') + + return { + 'id': channel_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(stream_data, { + 'title': ('stream', 'title', {str}), + 'timestamp': ('stream', 'createdAt', {parse_iso8601}), + 'thumbnail': ('thumbUrl', {url_or_none}), + 'channel': ('owner', 'name', {str}), + 'channel_id': ('owner', 'id', {str_or_none}), + }), + } diff --git a/yt_dlp/extractor/flickr.py b/yt_dlp/extractor/flickr.py index 89a40d7e23..507bfe9d41 100644 --- a/yt_dlp/extractor/flickr.py +++ b/yt_dlp/extractor/flickr.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) from ..utils import ( ExtractorError, format_field, @@ -31,7 +29,7 @@ class FlickrIE(InfoExtractor): 'view_count': int, 'tags': list, 'license': 'Attribution-ShareAlike', - } + }, } _API_BASE_URL = 'https://api.flickr.com/services/rest?' # https://help.yahoo.com/kb/flickr/SLN25525.html @@ -52,14 +50,14 @@ class FlickrIE(InfoExtractor): def _call_api(self, method, video_id, api_key, note, secret=None): query = { 'photo_id': video_id, - 'method': 'flickr.%s' % method, + 'method': f'flickr.{method}', 'api_key': api_key, 'format': 'json', 'nojsoncallback': 1, } if secret: query['secret'] = secret - data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note) + data = self._download_json(self._API_BASE_URL + urllib.parse.urlencode(query), video_id, note) if data['stat'] != 'ok': raise ExtractorError(data['message']) return data @@ -83,7 +81,7 @@ def _real_extract(self, url): formats = [] for stream in streams['stream']: - stream_type = compat_str(stream.get('type')) + stream_type = str(stream.get('type')) formats.append({ 'format_id': stream_type, 'url': stream['_content'], diff --git a/yt_dlp/extractor/floatplane.py b/yt_dlp/extractor/floatplane.py new file mode 100644 index 0000000000..b7ee160a44 --- /dev/null +++ b/yt_dlp/extractor/floatplane.py @@ -0,0 +1,333 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + clean_html, + determine_ext, + format_field, + int_or_none, + join_nonempty, + parse_codecs, + parse_iso8601, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class FloatplaneIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|beta)\.)?floatplane\.com/post/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.floatplane.com/post/2Yf3UedF7C', + 'info_dict': { + 'id': 'yuleLogLTT', + 'ext': 'mp4', + 'display_id': '2Yf3UedF7C', + 'title': '8K Yule Log Fireplace with Crackling Fire Sounds - 10 Hours', + 'description': 'md5:adf2970e0de1c5e3df447818bb0309f6', + 'thumbnail': 
r're:^https?://.*\.jpe?g$', + 'duration': 36035, + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'release_date': '20191206', + 'release_timestamp': 1575657000, + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': 'Linus Tech Tips', + 'channel_id': '63fe42c309e691e4e36de93d', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/main', + 'availability': 'subscriber_only', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.floatplane.com/post/j2jqG3JmgJ', + 'info_dict': { + 'id': 'j2jqG3JmgJ', + 'title': 'TJM: Does Anyone Care About Avatar: The Way of Water?', + 'description': 'md5:00bf17dc5733e4031e99b7fd6489f274', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'release_timestamp': 1671915900, + 'release_date': '20221224', + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': "They're Just Movies", + 'channel_id': '64135f82fc76ab7f9fbdc876', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/tajm', + 'availability': 'subscriber_only', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.floatplane.com/post/3tK2tInhoN', + 'info_dict': { + 'id': '3tK2tInhoN', + 'title': 'Extras - How Linus Communicates with Editors (Compensator 4)', + 'description': 'md5:83cd40aae1ce124df33769600c80ca5b', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'release_timestamp': 1700529120, + 'release_date': '20231121', + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': 'FP Exclusives', + 'channel_id': '6413623f5b12cca228a28e78', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/fpexclusive', + 'availability': 'subscriber_only', + }, + 'playlist_count': 2, + }, { + 'url': 'https://beta.floatplane.com/post/d870PEFXS1', + 'info_dict': { + 'id': 'bg9SuYKEww', + 'ext': 'mp4', + 'display_id': 'd870PEFXS1', + 'title': 'LCS Drama, TLOU 2 Remaster, Destiny 2 Player Count Drops, + More!', + 'description': 'md5:80d612dcabf41b17487afcbe303ec57d', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'release_timestamp': 1700622000, + 'release_date': '20231122', + 'duration': 513, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'uploader': 'LinusTechTips', + 'uploader_id': '59f94c0bdd241b70349eb72b', + 'uploader_url': 'https://www.floatplane.com/channel/linustechtips/home', + 'channel': 'GameLinked', + 'channel_id': '649dbade3540dbc3945eeda7', + 'channel_url': 'https://www.floatplane.com/channel/linustechtips/home/gamelinked', + 'availability': 'subscriber_only', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.floatplane.com/post/65B5PNoBtf', + 'info_dict': { + 'id': '65B5PNoBtf', + 'description': 'I recorded the inbuilt demo mode for your 90\'s enjoyment, thanks for being Floaties!', + 'display_id': '65B5PNoBtf', + 'like_count': int, + 'release_timestamp': 1701249480, + 'uploader': 'The Trash Network', + 'availability': 'subscriber_only', + 'uploader_id': '61bc20c9a131fb692bf2a513', + 'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home', + 'channel_url': 
'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing', + 'comment_count': int, + 'title': 'The $50 electronic drum kit.', + 'channel_id': '64424fe73cd58cbcf8d8e131', + 'thumbnail': 'https://pbs.floatplane.com/blogPost_thumbnails/65B5PNoBtf/725555379422705_1701247052743.jpeg', + 'dislike_count': int, + 'channel': 'The Drum Thing', + 'release_date': '20231129', + }, + 'playlist_count': 2, + 'playlist': [{ + 'info_dict': { + 'id': 'ISPJjexylS', + 'ext': 'mp4', + 'release_date': '20231129', + 'release_timestamp': 1701249480, + 'title': 'The $50 electronic drum kit. .mov', + 'channel_id': '64424fe73cd58cbcf8d8e131', + 'thumbnail': 'https://pbs.floatplane.com/video_thumbnails/ISPJjexylS/335202812134041_1701249383392.jpeg', + 'availability': 'subscriber_only', + 'uploader': 'The Trash Network', + 'duration': 622, + 'channel': 'The Drum Thing', + 'uploader_id': '61bc20c9a131fb692bf2a513', + 'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing', + 'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home', + }, + }, { + 'info_dict': { + 'id': 'qKfxu6fEpu', + 'ext': 'aac', + 'release_date': '20231129', + 'release_timestamp': 1701249480, + 'title': 'Roland TD-7 Demo.m4a', + 'channel_id': '64424fe73cd58cbcf8d8e131', + 'availability': 'subscriber_only', + 'uploader': 'The Trash Network', + 'duration': 114, + 'channel': 'The Drum Thing', + 'uploader_id': '61bc20c9a131fb692bf2a513', + 'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing', + 'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home', + }, + }], + 'skip': 'requires subscription: "The Trash Network"', + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_initialize(self): + if not self._get_cookies('https://www.floatplane.com').get('sails.sid'): + self.raise_login_required() + + def _real_extract(self, url): + post_id = self._match_id(url) + + post_data = self._download_json( + 'https://www.floatplane.com/api/v3/content/post', post_id, query={'id': post_id}, + note='Downloading post data', errnote='Unable to download post data') + + if not any(traverse_obj(post_data, ('metadata', ('hasVideo', 'hasAudio')))): + raise ExtractorError('Post does not contain a video or audio track', expected=True) + + uploader_url = format_field( + post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None + + common_info = { + 'uploader_url': uploader_url, + 'channel_url': urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname'))), + 'availability': self._availability(needs_subscription=True), + **traverse_obj(post_data, { + 'uploader': ('creator', 'title', {str}), + 'uploader_id': ('creator', 'id', {str}), + 'channel': ('channel', 'title', {str}), + 'channel_id': ('channel', 'id', {str}), + 'release_timestamp': ('releaseDate', {parse_iso8601}), + }), + } + + items = [] + for media in traverse_obj(post_data, (('videoAttachments', 'audioAttachments'), ...)): + media_id = media['id'] + media_typ = media.get('type') or 'video' + + metadata = self._download_json( + f'https://www.floatplane.com/api/v3/content/{media_typ}', media_id, query={'id': media_id}, + note=f'Downloading {media_typ} metadata') + + stream = self._download_json( + 'https://www.floatplane.com/api/v2/cdn/delivery', media_id, query={ + 'type': 'vod' if media_typ == 'video' else 'aod', + 'guid': metadata['guid'], + }, note=f'Downloading {media_typ} stream data') + + path_template = traverse_obj(stream, ('resource', 'uri', {str})) + + def 
format_path(params): + path = path_template + for i, val in (params or {}).items(): + path = path.replace(f'{{qualityLevelParams.{i}}}', val) + return path + + formats = [] + for quality in traverse_obj(stream, ('resource', 'data', 'qualityLevels', ...)): + url = urljoin(stream['cdn'], format_path(traverse_obj( + stream, ('resource', 'data', 'qualityLevelParams', quality['name'], {dict})))) + formats.append({ + **traverse_obj(quality, { + 'format_id': ('name', {str}), + 'format_note': ('label', {str}), + 'width': ('width', {int}), + 'height': ('height', {int}), + }), + **parse_codecs(quality.get('codecs')), + 'url': url, + 'ext': determine_ext(url.partition('/chunk.m3u8')[0], 'mp4'), + }) + + items.append({ + **common_info, + 'id': media_id, + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('thumbnail', 'path', {url_or_none}), + }), + 'formats': formats, + }) + + post_info = { + **common_info, + 'id': post_id, + 'display_id': post_id, + **traverse_obj(post_data, { + 'title': ('title', {str}), + 'description': ('text', {clean_html}), + 'like_count': ('likes', {int_or_none}), + 'dislike_count': ('dislikes', {int_or_none}), + 'comment_count': ('comments', {int_or_none}), + 'thumbnail': ('thumbnail', 'path', {url_or_none}), + }), + } + + if len(items) > 1: + return self.playlist_result(items, **post_info) + + post_info.update(items[0]) + return post_info + + +class FloatplaneChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|beta)\.)?floatplane\.com/channel/(?P<id>[\w-]+)/home(?:/(?P<channel>[\w-]+))?' + _PAGE_SIZE = 20 + _TESTS = [{ + 'url': 'https://www.floatplane.com/channel/linustechtips/home/ltxexpo', + 'info_dict': { + 'id': 'linustechtips/ltxexpo', + 'title': 'LTX Expo', + 'description': 'md5:9819002f9ebe7fd7c75a3a1d38a59149', + }, + 'playlist_mincount': 51, + }, { + 'url': 'https://www.floatplane.com/channel/ShankMods/home', + 'info_dict': { + 'id': 'ShankMods', + 'title': 'Shank Mods', + 'description': 'md5:6dff1bb07cad8e5448e04daad9be1b30', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://beta.floatplane.com/channel/bitwit_ultra/home', + 'info_dict': { + 'id': 'bitwit_ultra', + 'title': 'Bitwit Ultra', + 'description': 'md5:1452f280bb45962976d4789200f676dd', + }, + 'playlist_mincount': 200, + }] + + def _fetch_page(self, display_id, creator_id, channel_id, page): + query = { + 'id': creator_id, + 'limit': self._PAGE_SIZE, + 'fetchAfter': page * self._PAGE_SIZE, + } + if channel_id: + query['channel'] = channel_id + page_data = self._download_json( + 'https://www.floatplane.com/api/v3/content/creator', display_id, + query=query, note=f'Downloading page {page + 1}') + for post in page_data or []: + yield self.url_result( + f'https://www.floatplane.com/post/{post["id"]}', + FloatplaneIE, id=post['id'], title=post.get('title'), + release_timestamp=parse_iso8601(post.get('releaseDate'))) + + def _real_extract(self, url): + creator, channel = self._match_valid_url(url).group('id', 'channel') + display_id = join_nonempty(creator, channel, delim='/') + + creator_data = self._download_json( + 'https://www.floatplane.com/api/v3/creator/named', + display_id, query={'creatorURL[0]': creator})[0] + + channel_data = traverse_obj( + creator_data, ('channels', lambda _, v: v['urlname'] == channel), get_all=False) or {} + + return self.playlist_result(OnDemandPagedList(functools.partial( + self._fetch_page, display_id, creator_data['id'], channel_data.get('id')), self._PAGE_SIZE), + display_id, title=channel_data.get('title') or 
creator_data.get('title'), + description=channel_data.get('about') or creator_data.get('about')) diff --git a/yt_dlp/extractor/folketinget.py b/yt_dlp/extractor/folketinget.py index 55a11e591b..95095701bc 100644 --- a/yt_dlp/extractor/folketinget.py +++ b/yt_dlp/extractor/folketinget.py @@ -1,5 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import compat_parse_qs from ..utils import ( int_or_none, parse_duration, @@ -42,7 +43,7 @@ def _real_extract(self, url): r'(?s)
<div class="video-item-agenda"[^>]*>(.*?)<', webpage, 'description', fatal=False) - player_params = compat_parse_qs(self._search_regex( r'(?P<value>(?:(?!\1).)+)\1', webpage, 'data', group='value'), video_id, - transform_source=lambda x: compat_urllib_parse_unquote( - compat_b64decode(x).decode('utf-8')))['page']['video'] + transform_source=lambda x: urllib.parse.unquote( + base64.b64decode(x).decode('utf-8')))['page']['video'] title = video['title'] media_id = video['mediaId'] - sources = [compat_str(e['height']) + sources = [str(e['height']) for e in video['encodings'] if e.get('height')] formats = self._extract_formats(url, video_id, media_id, sources) thumbnail = url_or_none(video.get('masterThumb')) - uploader = try_get(video, lambda x: x['user']['username'], compat_str) + uploader = try_get(video, lambda x: x['user']['username'], str) uploader_id = str_or_none(try_get( video, lambda x: x['user']['id'], int)) - channel = try_get(video, lambda x: x['channel']['name'], compat_str) + channel = try_get(video, lambda x: x['channel']['name'], str) channel_id = str_or_none(try_get( video, lambda x: x['channel']['id'], int)) like_count = int_or_none(video.get('likes')) diff --git a/yt_dlp/extractor/fourzerostudio.py b/yt_dlp/extractor/fourzerostudio.py deleted file mode 100644 index c388a3a072..0000000000 --- a/yt_dlp/extractor/fourzerostudio.py +++ /dev/null @@ -1,106 +0,0 @@ -from .common import InfoExtractor -from ..utils import traverse_obj, unified_timestamp - - -class FourZeroStudioArchiveIE(InfoExtractor): - _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive' - IE_NAME = '0000studio:archive' - _TESTS = [{ - 'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive', - 'info_dict': { - 'id': '1290f433-fce0-4909-a24a-5f7df09665dc', - 'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)', - 'timestamp': 1653802534, - 'release_timestamp': 1653796604, - 'thumbnails': 'count:1', - 'comments': 'count:7', - 'uploader': '『中崎雄心』の執務室。', - 'uploader_id': 'mumeijiten', - } - }] - - def _real_extract(self, url): - video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') - webpage = self._download_webpage(url, video_id) - nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) - - pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) - uploader_internal_id = traverse_obj(nuxt_data, ( - 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False) - - formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4') - - return { - 'id': video_id, - 'title': pcb.get('title'), - 'age_limit': 18 if pcb.get('isAdult') else None, - 'timestamp': unified_timestamp(pcb.get('finishTime')), - 'release_timestamp': unified_timestamp(pcb.get('createdAt')), - 'thumbnails': [{ - 'url': pcb['thumbnailUrl'], - 'ext': 'png', - }] if pcb.get('thumbnailUrl') else None, - 'formats': formats, - 'subtitles': subs, - 'comments': [{ - 'author': c.get('username'), - 'author_id': c.get('postedUserId'), - 'author_thumbnail': c.get('userThumbnailUrl'), - 'id': c.get('id'), - 'text': c.get('body'), - 'timestamp': unified_timestamp(c.get('createdAt')), - 'like_count': c.get('likeCount'), - 'is_favorited': c.get('isLikedByOwner'), - 'author_is_uploader': c.get('postedUserId') == uploader_internal_id, - } for c in traverse_obj(nuxt_data, ( - 'ssrRefs', ..., lambda _, v: v['__typename'] == 
'PublicCreatorBroadcastComment')) or []], - 'uploader_id': uploader_id, - 'uploader': traverse_obj(nuxt_data, ( - 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), - } - - -class FourZeroStudioClipIE(InfoExtractor): - _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)' - IE_NAME = '0000studio:clip' - _TESTS = [{ - 'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f', - 'info_dict': { - 'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f', - 'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!', - 'timestamp': 1652109105, - 'like_count': 1, - 'uploader': 'ソエジマケイタ', - 'uploader_id': 'soeji', - } - }] - - def _real_extract(self, url): - video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') - webpage = self._download_webpage(url, video_id) - nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) - - clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) - - info = next(( - m for m in self._parse_html5_media_entries(url, webpage, video_id) - if 'mp4' in traverse_obj(m, ('formats', ..., 'ext')) - ), None) - if not info: - self.report_warning('Failed to find a desired media element. Falling back to using NUXT data.') - info = { - 'formats': [{ - 'ext': 'mp4', - 'url': url, - } for url in clip_info.get('mediaFiles') or [] if url], - } - return { - **info, - 'id': video_id, - 'title': clip_info.get('clipComment'), - 'timestamp': unified_timestamp(clip_info.get('createdAt')), - 'like_count': clip_info.get('likeCount'), - 'uploader_id': uploader_id, - 'uploader': traverse_obj(nuxt_data, ( - 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), - } diff --git a/yt_dlp/extractor/fox.py b/yt_dlp/extractor/fox.py index 15c0c48c17..e3cf22d4d6 100644 --- a/yt_dlp/extractor/fox.py +++ b/yt_dlp/extractor/fox.py @@ -1,12 +1,9 @@ import json +import urllib.parse import uuid from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_parse_unquote, -) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -20,7 +17,7 @@ class FOXIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P<id>[\da-fA-F]+)' _TESTS = [{ # clip 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', @@ -50,12 +47,16 @@ class FOXIE(InfoExtractor): # sports event, geo-restricted 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/', 'only_matching': True, + }, { + # fox sports replay, geo-restricted + 'url': 'https://www.foxsports.com/replay/561f3e071347a24e5e877abc56b22e89', + 'only_matching': True, }] _GEO_BYPASS = False _HOME_PAGE_URL = 'https://www.fox.com/' _API_KEY = '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9' _access_token = None - _device_id = compat_str(uuid.uuid4()) + _device_id = str(uuid.uuid4()) def _call_api(self, path, video_id, data=None): headers = { @@ -68,9 +69,9 @@ def _call_api(self, path, video_id, data=None): 'https://api3.fox.com/v2.0/' + path, video_id, data=data, headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: entitlement_issues = self._parse_json( - e.cause.read().decode(), video_id)['entitlementIssues'] + e.cause.response.read().decode(), 
video_id)['entitlementIssues'] for e in entitlement_issues: if e.get('errorCode') == 1005: raise ExtractorError( @@ -84,7 +85,7 @@ def _real_initialize(self): if not self._access_token: mvpd_auth = self._get_cookies(self._HOME_PAGE_URL).get('mvpd-auth') if mvpd_auth: - self._access_token = (self._parse_json(compat_urllib_parse_unquote( + self._access_token = (self._parse_json(urllib.parse.unquote( mvpd_auth.value), None, fatal=False) or {}).get('accessToken') if not self._access_token: self._access_token = self._call_api( @@ -96,7 +97,7 @@ def _real_extract(self, url): video_id = self._match_id(url) self._access_token = self._call_api( - 'previewpassmvpd?device_id=%s&mvpd_id=TempPass_fbcfox_60min' % self._device_id, + f'previewpassmvpd?device_id={self._device_id}&mvpd_id=TempPass_fbcfox_60min', video_id)['accessToken'] video = self._call_api('watch', video_id, data=json.dumps({ @@ -109,13 +110,13 @@ def _real_extract(self, url): 'provider': { 'freewheel': {'did': self._device_id}, 'vdms': {'rays': ''}, - 'dmp': {'kuid': '', 'seg': ''} + 'dmp': {'kuid': '', 'seg': ''}, }, 'playlist': '', 'privacy': {'us': '1---'}, 'siteSection': '', 'streamType': 'vod', - 'streamId': video_id}).encode('utf-8')) + 'streamId': video_id}).encode()) title = video['name'] release_url = video['url'] @@ -123,8 +124,8 @@ def _real_extract(self, url): try: m3u8_url = self._download_json(release_url, video_id)['playURL'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read().decode(), video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + error = self._parse_json(e.cause.response.read().decode(), video_id) if error.get('exception') == 'GeoLocationBlocked': self.raise_geo_restricted(countries=['US']) raise ExtractorError(error['description'], expected=True) diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py deleted file mode 100644 index f4f29c65d1..0000000000 --- a/yt_dlp/extractor/foxgay.py +++ /dev/null @@ -1,58 +0,0 @@ -import itertools - -from .common import InfoExtractor -from ..utils import ( - get_element_by_id, - int_or_none, - remove_end, -) - - -class FoxgayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml' - _TEST = { - 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', - 'md5': '344558ccfea74d33b7adbce22e577f54', - 'info_dict': { - 'id': '2582', - 'ext': 'mp4', - 'title': 'Fuck Turkish-style', - 'description': 'md5:6ae2d9486921891efe89231ace13ffdf', - 'age_limit': 18, - 'thumbnail': r're:https?://.*\.jpg$', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com') - description = get_element_by_id('inf_tit', webpage) - - # The default user-agent with foxgay cookies leads to pages without videos - self.cookiejar.clear('.foxgay.com') - # Find the URL for the iFrame which contains the actual video. 
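NOTE: In the deleted FoxgayIE body continued below, formats are built by zipping `sources` against a `resolutions` list that may be absent; `itertools.repeat(None)` pads the missing side so every source still yields a format. A standalone illustration of that idiom (sample data is made up):

    import itertools

    video_data = {'sources': ['240.mp4', '480.mp4']}  # no 'resolutions' key here
    pairs = list(zip(video_data['sources'],
                     video_data.get('resolutions', itertools.repeat(None))))
    assert pairs == [('240.mp4', None), ('480.mp4', None)]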
- iframe_url = self._html_search_regex( - r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, - 'video frame', group='url') - iframe = self._download_webpage( - iframe_url, video_id, headers={'User-Agent': 'curl/7.50.1'}, - note='Downloading video frame') - video_data = self._parse_json(self._search_regex( - r'video_data\s*=\s*([^;]+);', iframe, 'video data'), video_id) - - formats = [{ - 'url': source, - 'height': int_or_none(resolution), - } for source, resolution in zip( - video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'thumbnail': video_data.get('act_vid', {}).get('thumb'), - 'age_limit': 18, - } diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py index 52172aacef..6aa63614ef 100644 --- a/yt_dlp/extractor/foxnews.py +++ b/yt_dlp/extractor/foxnews.py @@ -7,8 +7,37 @@ class FoxNewsIE(AMPIE): IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' + _VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ + { + 'url': 'https://video.foxnews.com/v/6320653836112', + 'info_dict': { + 'id': '6320653836112', + 'ext': 'mp4', + 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 404, + 'upload_date': '20230217', + 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02', + 'timestamp': 1676611344.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'info_dict': { + 'id': '5099377331001', + 'ext': 'mp4', + 'title': '82416_censoring', + 'description': '82416_censoring', + 'upload_date': '20160826', + 'timestamp': 1472169708.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 521, + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', @@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20110503', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'skip': '404 page', }, { 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', @@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20141204', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': 'm3u8 HTTP error 400 in web browser', }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', @@ -49,11 +76,6 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 
'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - }, ] @classmethod @@ -67,10 +89,10 @@ def _extract_embed_urls(cls, url, webpage): yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() + video_id = self._match_id(url) info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}') info['id'] = video_id return info @@ -78,6 +100,19 @@ def _real_extract(self, url): class FoxNewsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)' _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6328632286112', + 'info_dict': { + 'id': '6328632286112', + 'ext': 'mp4', + 'title': 'Review: 2023 Toyota Prius Prime', + 'duration': 155, + 'thumbnail': r're:^https://.+\.jpg$', + 'timestamp': 1685720177.0, + 'upload_date': '20230602', + 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.foxnews.com/video/6313058664112', 'info_dict': { 'id': '6313058664112', @@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor): 'title': 'Gutfeld! - Thursday, September 29', 'timestamp': 1664527538, }, - 'expected_warnings': ['Ignoring subtitle tracks'], - 'params': {'skip_download': 'm3u8'}, + 'skip': '404 page', }] def _real_extract(self, url): @@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor): _TESTS = [{ # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': 'd2dd6ce809cedeefa96460e964821437', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', + 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum', 'timestamp': 1473301045, 'upload_date': '20160908', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 426, }, + 'params': {'skip_download': 'm3u8'}, }, { # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', 'info_dict': { 'id': '5748266721001', 'ext': 'flv', @@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor): 'timestamp': 1520594670, 'upload_date': '20180309', }, - 'params': { - 'skip_download': True, - }, + 'skip': '404 page', }, { 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', 'only_matching': True, diff --git a/yt_dlp/extractor/foxsports.py b/yt_dlp/extractor/foxsports.py index f906a1718d..8e89ccf841 100644 --- a/yt_dlp/extractor/foxsports.py +++ 
b/yt_dlp/extractor/foxsports.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from .uplynk import UplynkPreplayIE -from ..utils import HEADRequest, float_or_none, make_archive_id, smuggle_url +from ..networking import HEADRequest +from ..utils import float_or_none, make_archive_id, smuggle_url class FoxSportsIE(InfoExtractor): @@ -35,7 +36,7 @@ def _real_extract(self, url): 'x-api-key': 'cf289e299efdfa39fb6316f259d1de93', }) preplay_url = self._request_webpage( - HEADRequest(data['url']), video_id, 'Fetching preplay URL').geturl() + HEADRequest(data['url']), video_id, 'Fetching preplay URL').url return { '_type': 'url_transparent', diff --git a/yt_dlp/extractor/fptplay.py b/yt_dlp/extractor/fptplay.py index 85613bafe5..db9b2e1535 100644 --- a/yt_dlp/extractor/fptplay.py +++ b/yt_dlp/extractor/fptplay.py @@ -84,7 +84,7 @@ def convert(e): a = [0, 0, 0, 0] s = len(e) c = 0 - for z in range(s, 0, -1): + for _ in range(s, 0, -1): if n <= 3: i[n] = e[c] n += 1 diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 052317204e..ab08f1c6bf 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -1,58 +1,64 @@ +import re +import urllib.parse + from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - format_field, - parse_iso8601, - parse_qs, -) from .dailymotion import DailymotionIE +from ..networking import HEADRequest +from ..utils import ( + clean_html, + determine_ext, + filter_dict, + format_field, + int_or_none, + join_nonempty, + parse_iso8601, + smuggle_url, + unsmuggle_url, + url_or_none, +) +from ..utils.traversal import traverse_obj class FranceTVBaseInfoExtractor(InfoExtractor): - def _make_url_result(self, video_or_full_id, catalog=None): - full_id = 'francetv:%s' % video_or_full_id - if '@' not in video_or_full_id and catalog: - full_id += '@%s' % catalog - return self.url_result( - full_id, ie=FranceTVIE.ie_key(), - video_id=video_or_full_id.split('@')[0]) + def _make_url_result(self, video_id, url=None): + video_id = video_id.split('@')[0] # for compat with old @catalog IDs + full_id = f'francetv:{video_id}' + if url: + full_id = smuggle_url(full_id, {'hostname': urllib.parse.urlparse(url).hostname}) + return self.url_result(full_id, FranceTVIE, video_id) class FranceTVIE(InfoExtractor): - _VALID_URL = r'''(?x) - (?: - https?:// - sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\? - .*?\bidDiffusion=[^&]+| - (?: - https?://videos\.francetv\.fr/video/| - francetv: - ) - (?P<id>[^@]+)(?:@(?P<catalog>.+))? - ) - ''' - _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1'] + _VALID_URL = r'francetv:(?P<id>[^@#]+)' + _GEO_COUNTRIES = ['FR'] + _GEO_BYPASS = False _TESTS = [{ - # without catalog - 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', - 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', + # tokenized url is in dinfo['video']['token'] + 'url': 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1', 'info_dict': { - 'id': '162311093', + 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', 'ext': 'mp4', 'title': '13h15, le dimanche... 
- Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', 'timestamp': 1502623500, + 'duration': 2580, + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20170813', }, + 'params': {'skip_download': 'm3u8'}, }, { - # with catalog - 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', - 'only_matching': True, - }, { - 'url': 'http://videos.francetv.fr/video/NI_657393@Regions', - 'only_matching': True, + # tokenized url is in dinfo['video']['token']['akamai'] + 'url': 'francetv:c5bda21d-2c6f-4470-8849-3d8327adb2ba', + 'info_dict': { + 'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'timestamp': 1514118300, + 'duration': 2880, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20171224', + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'francetv:162311093', 'only_matching': True, @@ -74,32 +80,33 @@ class FranceTVIE(InfoExtractor): 'only_matching': True, }] - def _extract_video(self, video_id, catalogue=None): - # Videos are identified by idDiffusion so catalogue part is optional. - # However when provided, some extra formats may be returned so we pass - # it if available. + def _extract_video(self, video_id, hostname=None): is_live = None videos = [] + drm_formats = False title = None subtitle = None + episode_number = None + season_number = None image = None duration = None timestamp = None spritesheets = None - for device_type in ('desktop', 'mobile'): + # desktop+chrome returns dash; mobile+safari returns hls + for device_type, browser in [('desktop', 'chrome'), ('mobile', 'safari')]: dinfo = self._download_json( - 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, - video_id, 'Downloading %s video JSON' % device_type, query={ + f'https://k7.ftven.fr/videos/{video_id}', video_id, + f'Downloading {device_type} {browser} video JSON', query=filter_dict({ 'device_type': device_type, - 'browser': 'chrome', - }, fatal=False) + 'browser': browser, + 'domain': hostname, + }), fatal=False, expected_status=422) # 422 json gives detailed error code/message if not dinfo: continue - video = dinfo.get('video') - if video: + if video := traverse_obj(dinfo, ('video', {dict})): videos.append(video) if duration is None: duration = video.get('duration') @@ -107,12 +114,24 @@ def _extract_video(self, video_id, catalogue=None): is_live = video.get('is_live') if spritesheets is None: spritesheets = video.get('spritesheets') + elif code := traverse_obj(dinfo, ('code', {int})): + if code == 2009: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif code in (2015, 2017): + # 2015: L'accès à cette vidéo est impossible. (DRM-only) + # 2017: Cette vidéo n'est pas disponible depuis le site web mobile (b/c DRM) + drm_formats = True + continue + self.report_warning( + f'{self.IE_NAME} said: {code} "{clean_html(dinfo.get("message"))}"') + continue - meta = dinfo.get('meta') - if meta: + if meta := traverse_obj(dinfo, ('meta', {dict})): if title is None: title = meta.get('title') - # XXX: what is meta['pre_title']? 
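NOTE: The rewritten `_extract_video` above probes the k7.ftven.fr endpoint once per (device_type, browser) pair because the two clients return different manifests (DASH vs HLS), and it keeps 422 responses because their JSON carries the geo/DRM error codes handled next. `filter_dict` keeps the query clean when no hostname was smuggled in; a minimal check, assuming its documented drop-None semantics:

    from yt_dlp.utils import filter_dict

    query = filter_dict({'device_type': 'mobile', 'browser': 'safari', 'domain': None})
    assert query == {'device_type': 'mobile', 'browser': 'safari'}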
+ # meta['pre_title'] contains season and episode number for series in format "S<season> E<episode>" + season_number, episode_number = self._search_regex( + r'S(\d+)\s*E(\d+)', meta.get('pre_title'), 'episode info', group=(1, 2), default=(None, None)) if subtitle is None: subtitle = meta.get('additional_title') if image is None: @@ -120,43 +139,49 @@ def _extract_video(self, video_id, catalogue=None): if timestamp is None: timestamp = parse_iso8601(meta.get('broadcasted_at')) - formats = [] - subtitles = {} - for video in videos: + if not videos and drm_formats: + self.report_drm(video_id) + + formats, subtitles, video_url = [], {}, None + for video in traverse_obj(videos, lambda _, v: url_or_none(v['url'])): + video_url = video['url'] format_id = video.get('format') - video_url = None - if video.get('workflow') == 'token-akamai': - token_url = video.get('token') - if token_url: - token_json = self._download_json( - token_url, video_id, - 'Downloading signed %s manifest URL' % format_id) - if token_json: - video_url = token_json.get('url') - if not video_url: - video_url = video.get('url') + if token_url := traverse_obj(video, ('token', (None, 'akamai'), {url_or_none}, any)): + tokenized_url = traverse_obj(self._download_json( + token_url, video_id, f'Downloading signed {format_id} manifest URL', + fatal=False, query={ + 'format': 'json', + 'url': video_url, + }), ('url', {url_or_none})) + if tokenized_url: + video_url = tokenized_url ext = determine_ext(video_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) + video_url, video_id, f4m_id=format_id or ext, fatal=False)) elif ext == 'm3u8': + format_id = format_id or 'hls' fmts, subs = self._extract_m3u8_formats_and_subtitles( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False) + video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + for f in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none' and v.get('tbr') is None): + if mobj := re.match(rf'{format_id}-[Aa]udio-\w+-(?P<bitrate>\d+)', f['format_id']): + f.update({ + 'tbr': int_or_none(mobj.group('bitrate')), + 'acodec': 'mp4a', + }) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( - video_url, video_id, mpd_id=format_id, fatal=False) + video_url, video_id, mpd_id=format_id or 'dash', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, - 'format_id': 'rtmp-%s' % format_id, + 'format_id': join_nonempty('rtmp', format_id), 'ext': 'flv', }) else: @@ -168,10 +193,17 @@ def _extract_video(self, video_id, catalogue=None): # XXX: what is video['captions']? 
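NOTE: The m3u8 branch above back-fills `tbr` and `acodec` for audio-only HLS renditions by parsing the bitrate out of the generated format id. A toy version of that fix-up (the sample format id is assumed, not taken from a real manifest):

    import re

    def audio_bitrate(format_id, m3u8_id='hls'):
        # Mirrors the walrus-match in the diff: returns None when the id does
        # not look like m3u8_id + '-Audio-' + language + '-' + bitrate.
        if mobj := re.match(rf'{m3u8_id}-[Aa]udio-\w+-(?P<bitrate>\d+)', format_id):
            return int(mobj.group('bitrate'))
        return None

    assert audio_bitrate('hls-Audio-Francais-128') == 128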
+ if not formats and video_url: + urlh = self._request_webpage( + HEADRequest(video_url), video_id, 'Checking for geo-restriction', + fatal=False, expected_status=403) + if urlh and urlh.headers.get('x-errortype') == 'geo': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + for f in formats: if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'): f['language_preference'] = -10 - f['format_note'] = 'audio description%s' % format_field(f, 'format_note', ', %s') + f['format_note'] = 'audio description{}'.format(format_field(f, 'format_note', ', %s')) if spritesheets: formats.append({ @@ -185,40 +217,34 @@ def _extract_video(self, video_id, catalogue=None): 'fragments': [{ 'url': sheet, # XXX: not entirely accurate; each spritesheet seems to be - # a 10×10 grid of thumbnails corresponding to approximately + # a 10x10 grid of thumbnails corresponding to approximately # 2 seconds of the video; the last spritesheet may be shorter 'duration': 200, - } for sheet in spritesheets] + } for sheet in traverse_obj(spritesheets, (..., {url_or_none}))], }) - if subtitle: - title += ' - %s' % subtitle - title = title.strip() - return { 'id': video_id, - 'title': title, + 'title': join_nonempty(title, subtitle, delim=' - ').strip(), 'thumbnail': image, 'duration': duration, 'timestamp': timestamp, 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, + 'episode': subtitle if episode_number else None, + 'series': title if episode_number else None, + 'episode_number': int_or_none(episode_number), + 'season_number': int_or_none(season_number), + '_format_sort_fields': ('res', 'tbr', 'proto'), # prioritize m3u8 over dash } def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - catalog = mobj.group('catalog') + url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + hostname = smuggled_data.get('hostname') or 'www.france.tv' - if not video_id: - qs = parse_qs(url) - video_id = qs.get('idDiffusion', [None])[0] - catalog = qs.get('catalogue', [None])[0] - if not video_id: - raise ExtractorError('Invalid URL', expected=True) - - return self._extract_video(video_id, catalog) + return self._extract_video(video_id, hostname=hostname) class FranceTVSiteIE(FranceTVBaseInfoExtractor): @@ -227,17 +253,55 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'info_dict': { - 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', + 'id': 'c5bda21d-2c6f-4470-8849-3d8327adb2ba', 'ext': 'mp4', 'title': '13h15, le dimanche... 
- Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1502623500, - 'upload_date': '20170813', + 'timestamp': 1514118300, + 'duration': 2880, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20171224', }, 'params': { 'skip_download': True, }, 'add_ie': [FranceTVIE.ie_key()], }, { + # geo-restricted + 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', + 'info_dict': { + 'id': 'a9050959-eedd-4b4a-9b0d-de6eeaa73e44', + 'ext': 'mp4', + 'title': 'Foot2Rue - Duel au vieux port', + 'episode': 'Duel au vieux port', + 'series': 'Foot2Rue', + 'episode_number': 1, + 'season_number': 1, + 'timestamp': 1642761360, + 'upload_date': '20220121', + 'season': 'Season 1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1441, + }, + }, { + # geo-restricted livestream (workflow == 'token-akamai') + 'url': 'https://www.france.tv/france-4/direct.html', + 'info_dict': { + 'id': '9a6a7670-dde9-4264-adbc-55b89558594b', + 'ext': 'mp4', + 'title': r're:France 4 en direct .+', + 'live_status': 'is_live', + }, + 'skip': 'geo-restricted livestream', + }, { + # livestream (workflow == 'dai') + 'url': 'https://www.france.tv/france-2/direct.html', + 'info_dict': { + 'id': '006194ea-117d-4bcf-94a9-153d999c59ae', + 'ext': 'mp4', + 'title': r're:France 2 en direct .+', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'livestream'}, }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', @@ -254,10 +318,6 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): # franceo 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', 'only_matching': True, - }, { - # france2 live - 'url': 'https://www.france.tv/france-2/direct.html', - 'only_matching': True, }, { 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', 'only_matching': True, @@ -281,17 +341,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) - catalogue = None video_id = self._search_regex( r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', default=None, group='id') if not video_id: - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') + video_id = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@"]+@[^"]+)"', + webpage, 'video ID') - return self._make_url_result(video_id, catalogue) + return self._make_url_result(video_id, url=url) class FranceTVInfoIE(FranceTVBaseInfoExtractor): @@ -305,8 +364,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': '20190822', - 'timestamp': 1566510900, - 'description': 'md5:72d167097237701d6e8452ff03b83c00', + 'timestamp': 1566510730, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'duration': 1637, 'subtitles': { 'fr': 'mincount:2', }, @@ -321,8 +381,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'info_dict': { 'id': '7d204c9e-a2d3-11eb-9e4c-000d3a23d482', 'ext': 'mp4', - 'title': 'Covid-19 : une situation catastrophique à New Dehli', - 'thumbnail': str, + 'title': 'Covid-19 : une situation catastrophique à New Dehli - Édition du mercredi 21 avril 2021', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'duration': 76, 'timestamp': 1619028518, 'upload_date': '20210421', 
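NOTE: `FranceTVSiteIE._real_extract` above hands the page's hostname to `FranceTVIE` by smuggling it into the `francetv:` URL via `_make_url_result`; the other side recovers it with `unsmuggle_url`. A minimal round trip, assuming the `smuggle_url`/`unsmuggle_url` helpers from `yt_dlp.utils`:

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    smuggled = smuggle_url('francetv:ec217ecc-0733-48cf-ac06-af1347b849d1',
                           {'hostname': 'www.france.tv'})
    url, data = unsmuggle_url(smuggled, {})
    assert url == 'francetv:ec217ecc-0733-48cf-ac06-af1347b849d1'
    assert data == {'hostname': 'www.france.tv'}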
@@ -348,11 +408,17 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'id': 'x4iiko0', 'ext': 'mp4', 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', - 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', + 'description': 'md5:fdcb582c370756293a65cdfbc6ecd90e', 'timestamp': 1467011958, - 'upload_date': '20160627', 'uploader': 'France Inter', 'uploader_id': 'x2q2ez', + 'upload_date': '20160627', + 'view_count': int, + 'tags': ['Politique', 'France Inter', '27 juin 2016', 'Linvité de 8h20', 'Cécile Duflot', 'Patrick Cohen'], + 'age_limit': 0, + 'duration': 640, + 'like_count': int, + 'thumbnail': r're:https://[^/?#]+/v/[^/?#]+/x1080', }, 'add_ie': ['Dailymotion'], }, { @@ -382,4 +448,4 @@ def _real_extract(self, url): r'(?:data-id|]*>(?P[^<]+)<', response, 'error message', default=None, group='error') if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError(f'Unable to login: {error}', expected=True) raise ExtractorError('Unable to log in') class FrontendMastersPageBaseIE(FrontendMastersBaseIE): def _download_course(self, course_name, url): return self._download_json( - '%s/courses/%s' % (self._API_BASE, course_name), course_name, + f'{self._API_BASE}/courses/{course_name}', course_name, 'Downloading course JSON', headers={'Referer': url}) @staticmethod @@ -92,7 +89,7 @@ def _extract_lesson(chapters, lesson_id, lesson): duration = None timestamp = lesson.get('timestamp') - if isinstance(timestamp, compat_str): + if isinstance(timestamp, str): mobj = re.search( r'(?P\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P\s*\d{1,2}:\d{1,2}:\d{1,2})', timestamp) @@ -102,7 +99,7 @@ def _extract_lesson(chapters, lesson_id, lesson): return { '_type': 'url_transparent', - 'url': 'frontendmasters:%s' % lesson_id, + 'url': f'frontendmasters:{lesson_id}', 'ie_key': FrontendMastersIE.ie_key(), 'id': lesson_id, 'display_id': display_id, @@ -134,16 +131,16 @@ class FrontendMastersIE(FrontendMastersBaseIE): def _real_extract(self, url): lesson_id = self._match_id(url) - source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id) + source_url = f'{self._API_BASE}/video/{lesson_id}/source' formats = [] for ext in ('webm', 'mp4'): for quality in ('low', 'mid', 'high'): resolution = self._QUALITIES[quality].copy() - format_id = '%s-%s' % (ext, quality) + format_id = f'{ext}-{quality}' format_url = self._download_json( source_url, lesson_id, - 'Downloading %s source JSON' % format_id, query={ + f'Downloading {format_id} source JSON', query={ 'f': ext, 'r': resolution['height'], }, headers={ @@ -163,15 +160,15 @@ def _real_extract(self, url): subtitles = { 'en': [{ - 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id), - }] + 'url': f'{self._API_BASE}/transcripts/{lesson_id}.vtt', + }], } return { 'id': lesson_id, 'title': lesson_id, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py index 668bb2743c..a2d1a828b4 100644 --- a/yt_dlp/extractor/fujitv.py +++ b/yt_dlp/extractor/fujitv.py @@ -1,5 +1,5 @@ -from ..utils import HEADRequest from .common import InfoExtractor +from ..networking import HEADRequest class FujiTVFODPlus7IE(InfoExtractor): @@ -34,7 +34,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'series': 'ちびまる子ちゃん', 'series_id': '5d40', 'thumbnail': 
'https://i.fod.fujitv.co.jp/img/program/5d40/episode/5d40810083_a.jpg'}, - 'skip': 'Video available only in one week' + 'skip': 'Video available only in one week', }] def _real_extract(self, url): @@ -43,7 +43,9 @@ def _real_extract(self, url): json_info = {} token = self._get_cookies(url).get('CT') if token: - json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False) + json_info = self._download_json( + f'https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id={video_id}&is_premium=false', + video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False) else: self.report_warning(f'The token cookie is needed to extract video metadata. {self._login_hint("cookies")}') formats, subtitles = [], {} @@ -67,5 +69,5 @@ def _real_extract(self, url): 'formats': formats, 'subtitles': subtitles, 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg', - '_format_sort_fields': ('tbr', ) + '_format_sort_fields': ('tbr', ), } diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 47c316664a..d3e61c84f8 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -3,7 +3,7 @@ import string from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -46,8 +46,8 @@ def _perform_login(self, username, password): })) FunimationBaseIE._TOKEN = data['token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['error'] + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), None)['error'] raise ExtractorError(error, expected=True) raise @@ -96,7 +96,7 @@ def _real_extract(self, url): f'{show}_{episode}', query={ 'deviceType': 'web', 'region': self._REGION, - 'locale': locale or 'en' + 'locale': locale or 'en', }), ('videoList', ..., 'id'), get_all=False) return self.url_result(f'https://www.funimation.com/player/{video_id}', FunimationIE.ie_key(), video_id) @@ -157,7 +157,7 @@ def _get_experiences(episode): yield lang, version.title(), f def _get_episode(self, webpage, experience_id=None, episode_id=None, fatal=True): - ''' Extract the episode, season and show objects given either episode/experience id ''' + """ Extract the episode, season and show objects given either episode/experience id """ show = self._parse_json( self._search_regex( r'show\s*=\s*({.+?})\s*;', webpage, 'show data', fatal=fatal), @@ -199,16 +199,16 @@ def _real_extract(self, url): continue thumbnails.append({'url': fmt.get('poster')}) duration = max(duration, fmt.get('duration', 0)) - format_name = '%s %s (%s)' % (version, lang, experience_id) + format_name = f'{version} {lang} ({experience_id})' self.extract_subtitles( subtitles, experience_id, display_id=display_id, format_name=format_name, episode=episode if experience_id == initial_experience_id else episode_id) headers = {} if self._TOKEN: - headers['Authorization'] = 'Token %s' % self._TOKEN + headers['Authorization'] = f'Token {self._TOKEN}' page = self._download_json( - 'https://www.funimation.com/api/showexperience/%s/' % experience_id, + f'https://www.funimation.com/api/showexperience/{experience_id}/', display_id, headers=headers, expected_status=403, query={ 
'pinst_id': ''.join(random.choices(string.digits + string.ascii_letters, k=8)), }, note=f'Downloading {format_name} JSON') @@ -216,7 +216,7 @@ def _real_extract(self, url): if not sources: error = try_get(page, lambda x: x['errors'][0], dict) if error: - self.report_warning('%s said: Error %s - %s' % ( + self.report_warning('{} said: Error {} - {}'.format( self.IE_NAME, error.get('code'), error.get('detail') or error.get('title'))) else: self.report_warning('No sources found for format') @@ -227,11 +227,11 @@ def _real_extract(self, url): source_type = source.get('videoType') or determine_ext(source_url) if source_type == 'm3u8': current_formats.extend(self._extract_m3u8_formats( - source_url, display_id, 'mp4', m3u8_id='%s-%s' % (experience_id, 'hls'), fatal=False, + source_url, display_id, 'mp4', m3u8_id='{}-{}'.format(experience_id, 'hls'), fatal=False, note=f'Downloading {format_name} m3u8 information')) else: current_formats.append({ - 'format_id': '%s-%s' % (experience_id, source_type), + 'format_id': f'{experience_id}-{source_type}', 'url': source_url, }) for f in current_formats: @@ -284,7 +284,7 @@ def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_n sub_type = sub_type if sub_type != 'FULL' else None current_sub = { 'url': text_track['src'], - 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' ') + 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' '), } lang = join_nonempty(text_track.get('language', 'und'), version if version != 'Simulcast' else None, @@ -301,8 +301,8 @@ class FunimationShowIE(FunimationBaseIE): _TESTS = [{ 'url': 'https://www.funimation.com/en/shows/sk8-the-infinity', 'info_dict': { - 'id': 1315000, - 'title': 'SK8 the Infinity' + 'id': '1315000', + 'title': 'SK8 the Infinity', }, 'playlist_count': 13, 'params': { @@ -312,8 +312,8 @@ class FunimationShowIE(FunimationBaseIE): # without lang code 'url': 'https://www.funimation.com/shows/ouran-high-school-host-club/', 'info_dict': { - 'id': 39643, - 'title': 'Ouran High School Host Club' + 'id': '39643', + 'title': 'Ouran High School Host Club', }, 'playlist_count': 26, 'params': { @@ -329,21 +329,21 @@ def _real_extract(self, url): base_url, locale, display_id = self._match_valid_url(url).groups() show_info = self._download_json( - 'https://title-api.prd.funimationsvc.com/v2/shows/%s?region=%s&deviceType=web&locale=%s' - % (display_id, self._REGION, locale or 'en'), display_id) + 'https://title-api.prd.funimationsvc.com/v2/shows/{}?region={}&deviceType=web&locale={}'.format( + display_id, self._REGION, locale or 'en'), display_id) items_info = self._download_json( - 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s' - % show_info.get('id'), display_id) + 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id={}'.format( + show_info.get('id')), display_id) vod_items = traverse_obj(items_info, ('items', ..., lambda k, _: re.match(r'(?i)mostRecent[AS]vod', k), 'item')) return { '_type': 'playlist', - 'id': show_info['id'], + 'id': str_or_none(show_info['id']), 'title': show_info['name'], 'entries': orderedSet( self.url_result( - '%s/%s' % (base_url, vod_item.get('episodeSlug')), FunimationPageIE.ie_key(), + '{}/{}'.format(base_url, vod_item.get('episodeSlug')), FunimationPageIE.ie_key(), vod_item.get('episodeId'), vod_item.get('episodeName')) for vod_item in sorted(vod_items, key=lambda x: x.get('episodeOrder', -1))), } diff --git a/yt_dlp/extractor/funk.py 
b/yt_dlp/extractor/funk.py index 539d719c5b..8bdea3fce7 100644 --- a/yt_dlp/extractor/funk.py +++ b/yt_dlp/extractor/funk.py @@ -1,25 +1,29 @@ from .common import InfoExtractor from .nexx import NexxIE -from ..utils import ( - int_or_none, - str_or_none, -) class FunkIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821', - 'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81', + 'md5': '8610449476156f338761a75391b0017d', 'info_dict': { 'id': '1155821', 'ext': 'mp4', 'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2', - 'description': 'md5:a691d0413ef4835588c5b03ded670c1f', + 'description': 'md5:2a03b67596eda0d1b5125c299f45e953', 'timestamp': 1514507395, 'upload_date': '20171229', + 'duration': 426.0, + 'cast': ['United Creators PMB GmbH'], + 'thumbnail': 'https://assets.nexx.cloud/media/75/56/79/3YKUSJN1LACN0CRxL.jpg', + 'display_id': 'die-lustigsten-instrumente-aus-dem-internet-teil-2', + 'alt_title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet Teil 2', + 'season_number': 0, + 'season': 'Season 0', + 'episode_number': 0, + 'episode': 'Episode 0', }, - }, { 'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699', 'only_matching': True, @@ -27,18 +31,10 @@ class FunkIE(InfoExtractor): def _real_extract(self, url): display_id, nexx_id = self._match_valid_url(url).groups() - video = self._download_json( - 'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id) return { '_type': 'url_transparent', - 'url': 'nexx:741:' + nexx_id, + 'url': f'nexx:741:{nexx_id}', 'ie_key': NexxIE.ie_key(), 'id': nexx_id, - 'title': video.get('title'), - 'description': video.get('description'), - 'duration': int_or_none(video.get('duration')), - 'channel_id': str_or_none(video.get('channelId')), 'display_id': display_id, - 'tags': video.get('tags'), - 'thumbnail': video.get('imageUrlLandscape'), } diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py new file mode 100644 index 0000000000..5d59e9c983 --- /dev/null +++ b/yt_dlp/extractor/funker530.py @@ -0,0 +1,80 @@ +from .common import InfoExtractor +from .rumble import RumbleEmbedIE +from .youtube import YoutubeIE +from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none + + +class Funker530IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', + 'md5': '085f50fea27523a388bbc22e123e09c8', + 'info_dict': { + 'id': 'v2qbmu4', + 'ext': 'mp4', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, + 'upload_date': '20230608', + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + }, + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': 
'@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 'live_status': 'not_live', + 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info = {} + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + if rumble_url: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: + raise ExtractorError('No videos found on webpage', expected=True) + + return { + **info, + '_type': 'url_transparent', + 'description': strip_or_none(self._search_regex( + r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), + 'description', default=None)), + } diff --git a/yt_dlp/extractor/fusion.py b/yt_dlp/extractor/fusion.py deleted file mode 100644 index 689422fca4..0000000000 --- a/yt_dlp/extractor/fusion.py +++ /dev/null @@ -1,81 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - mimetype2ext, - parse_iso8601, -) - - -class FusionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fusion\.(?:net|tv)/(?:video/|show/.+?\bvideo=)(?P\d+)' - _TESTS = [{ - 'url': 'http://fusion.tv/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', - 'info_dict': { - 'id': '3145868', - 'ext': 'mp4', - 'title': 'U.S. 
and Panamanian forces work together to stop a vessel smuggling drugs', - 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', - 'duration': 140.0, - 'timestamp': 1442589635, - 'uploader': 'UNIVISON', - 'upload_date': '20150918', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Anvato'], - }, { - 'url': 'http://fusion.tv/video/201781', - 'only_matching': True, - }, { - 'url': 'https://fusion.tv/show/food-exposed-with-nelufar-hedayat/?ancla=full-episodes&video=588644', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video = self._download_json( - 'https://platform.fusion.net/wp-json/fusiondotnet/v1/video/' + video_id, video_id) - - info = { - 'id': video_id, - 'title': video['title'], - 'description': video.get('excerpt'), - 'timestamp': parse_iso8601(video.get('published')), - 'series': video.get('show'), - } - - formats = [] - src = video.get('src') or {} - for f_id, f in src.items(): - for q_id, q in f.items(): - q_url = q.get('url') - if not q_url: - continue - ext = determine_ext(q_url, mimetype2ext(q.get('type'))) - if ext == 'smil': - formats.extend(self._extract_smil_formats(q_url, video_id, fatal=False)) - elif f_id == 'm3u8-variant' or (ext == 'm3u8' and q_id == 'Variant'): - formats.extend(self._extract_m3u8_formats( - q_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': '-'.join([f_id, q_id]), - 'url': q_url, - 'width': int_or_none(q.get('width')), - 'height': int_or_none(q.get('height')), - 'tbr': int_or_none(self._search_regex(r'_(\d+)\.m(?:p4|3u8)', q_url, 'bitrate')), - 'ext': 'mp4' if ext == 'm3u8' else ext, - 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', - }) - if formats: - info['formats'] = formats - else: - info.update({ - '_type': 'url', - 'url': 'anvato:uni:' + video['video_ids']['anvato'], - 'ie_key': 'Anvato', - }) - - return info diff --git a/yt_dlp/extractor/fuyintv.py b/yt_dlp/extractor/fuyintv.py index 197901d570..f46839bba6 100644 --- a/yt_dlp/extractor/fuyintv.py +++ b/yt_dlp/extractor/fuyintv.py @@ -11,7 +11,7 @@ class FuyinTVIE(InfoExtractor): 'ext': 'mp4', 'title': '第1集', 'description': 'md5:21a3d238dc8d49608e1308e85044b9c3', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/gab.py b/yt_dlp/extractor/gab.py index 5016e2ff9e..024628e1ca 100644 --- a/yt_dlp/extractor/gab.py +++ b/yt_dlp/extractor/gab.py @@ -7,7 +7,7 @@ parse_codecs, parse_duration, str_to_int, - unified_timestamp + unified_timestamp, ) @@ -19,33 +19,32 @@ class GabTVIE(InfoExtractor): 'id': '61217eacea5665de450d0488', 'ext': 'mp4', 'title': 'WHY WAS AMERICA IN AFGHANISTAN - AMERICA FIRST AGAINST AMERICAN OLIGARCHY', - 'description': None, 'uploader': 'Wurzelroot', 'uploader_id': '608fb0a85738fd1974984f7d', 'thumbnail': 'https://tv.gab.com/image/61217eacea5665de450d0488', - } + }, }] def _real_extract(self, url): - id = self._match_id(url).split('-')[-1] - webpage = self._download_webpage(url, id) + video_id = self._match_id(url).split('-')[-1] + webpage = self._download_webpage(url, video_id) channel_id = self._search_regex(r'data-channel-id=\"(?P[^\"]+)', webpage, 'channel_id') channel_name = self._search_regex(r'data-channel-name=\"(?P[^\"]+)', webpage, 'channel_name') title = self._search_regex(r'data-episode-title=\"(?P[^\"]+)', webpage, 'title') view_key = self._search_regex(r'data-view-key=\"(?P[^\"]+)', webpage, 'view_key') description = clean_html( self._html_search_regex(self._meta_regex('description'), webpage, 'description', 
group='content')) or None - available_resolutions = re.findall(r'[^\"]+)' % id, - webpage) + available_resolutions = re.findall( + rf'[^\"]+)', webpage) formats = [] for resolution in available_resolutions: frmt = { - 'url': f'https://tv.gab.com/media/{id}?viewKey={view_key}&r={resolution}', + 'url': f'https://tv.gab.com/media/{video_id}?viewKey={view_key}&r={resolution}', 'format_id': resolution, 'vcodec': 'h264', 'acodec': 'aac', - 'ext': 'mp4' + 'ext': 'mp4', } if 'audio-' in resolution: frmt['abr'] = str_to_int(resolution.replace('audio-', '')) @@ -56,13 +55,13 @@ def _real_extract(self, url): formats.append(frmt) return { - 'id': id, + 'id': video_id, 'title': title, 'formats': formats, 'description': description, 'uploader': channel_name, 'uploader_id': channel_id, - 'thumbnail': f'https://tv.gab.com/image/{id}', + 'thumbnail': f'https://tv.gab.com/image/{video_id}', } @@ -80,7 +79,7 @@ class GabIE(InfoExtractor): 'description': 'md5:204055fafd5e1a519f5d6db953567ca3', 'timestamp': 1635192289, 'upload_date': '20211025', - } + }, }, { 'url': 'https://gab.com/TheLonelyProud/posts/107045884469287653', 'md5': 'f9cefcfdff6418e392611a828d47839d', @@ -92,7 +91,7 @@ class GabIE(InfoExtractor): 'timestamp': 1633390571, 'upload_date': '20211004', 'uploader': 'TheLonelyProud', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/gaia.py b/yt_dlp/extractor/gaia.py index c84386f2cb..048ea517ba 100644 --- a/yt_dlp/extractor/gaia.py +++ b/yt_dlp/extractor/gaia.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) from ..utils import ( ExtractorError, int_or_none, @@ -52,7 +50,7 @@ class GaiaIE(InfoExtractor): def _real_initialize(self): auth = self._get_cookies('https://www.gaia.com/').get('auth') if auth: - auth = self._parse_json(compat_urllib_parse_unquote(auth.value), None, fatal=False) + auth = self._parse_json(urllib.parse.unquote(auth.value), None, fatal=False) self._jwt = auth.get('jwt') def _perform_login(self, username, password): @@ -62,7 +60,7 @@ def _perform_login(self, username, password): 'https://auth.gaia.com/v1/login', None, data=urlencode_postdata({ 'username': username, - 'password': password + 'password': password, })) if auth.get('success') is False: raise ExtractorError(', '.join(auth['messages']), expected=True) @@ -77,7 +75,7 @@ def _real_extract(self, url): node = self._download_json( 'https://brooklyn.gaia.com/node/%d' % node_id, node_id) vdata = node[vtype] - media_id = compat_str(vdata['nid']) + media_id = str(vdata['nid']) title = node['title'] headers = None @@ -115,7 +113,7 @@ def get_field_value(key, value_key='value'): 'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])), 'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])), 'comment_count': int_or_none(node.get('comment_count')), - 'series': try_get(node, lambda x: x['series']['title'], compat_str), + 'series': try_get(node, lambda x: x['series']['title'], str), 'season_number': int_or_none(get_field_value('season')), 'season_id': str_or_none(get_field_value('series_nid', 'nid')), 'episode_number': int_or_none(get_field_value('episode')), diff --git a/yt_dlp/extractor/gameinformer.py b/yt_dlp/extractor/gameinformer.py deleted file mode 100644 index 2664edb816..0000000000 --- a/yt_dlp/extractor/gameinformer.py +++ /dev/null @@ -1,46 +0,0 @@ -from .brightcove import BrightcoveNewIE -from .common import InfoExtractor -from ..utils import ( - 
clean_html, - get_element_by_class, - get_element_by_id, -) - - -class GameInformerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P[^.?&#]+)' - _TESTS = [{ - # normal Brightcove embed code extracted with BrightcoveNewIE._extract_url - 'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', - 'md5': '292f26da1ab4beb4c9099f1304d2b071', - 'info_dict': { - 'id': '4515472681001', - 'ext': 'mp4', - 'title': 'Replay - Animal Crossing', - 'description': 'md5:2e211891b215c85d061adc7a4dd2d930', - 'timestamp': 1443457610, - 'upload_date': '20150928', - 'uploader_id': '694940074001', - }, - }, { - # Brightcove id inside unique element with field--name-field-brightcove-video-id class - 'url': 'https://www.gameinformer.com/video-feature/new-gameplay-today/2019/07/09/new-gameplay-today-streets-of-rogue', - 'info_dict': { - 'id': '6057111913001', - 'ext': 'mp4', - 'title': 'New Gameplay Today – Streets Of Rogue', - 'timestamp': 1562699001, - 'upload_date': '20190709', - 'uploader_id': '694940074001', - - }, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage( - url, display_id, headers=self.geo_verification_headers()) - brightcove_id = clean_html(get_element_by_class('field--name-field-brightcove-video-id', webpage) or get_element_by_id('video-source-content', webpage)) - brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id if brightcove_id else BrightcoveNewIE._extract_url(self, webpage) - return self.url_result(brightcove_url, 'BrightcoveNew', brightcove_id) diff --git a/yt_dlp/extractor/gamejolt.py b/yt_dlp/extractor/gamejolt.py index 8ec046bb3e..01386c142b 100644 --- a/yt_dlp/extractor/gamejolt.py +++ b/yt_dlp/extractor/gamejolt.py @@ -1,16 +1,16 @@ import itertools import json import math +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, format_field, int_or_none, str_or_none, traverse_obj, - try_get + try_get, ) @@ -45,7 +45,7 @@ def _get_comments(self, post_num_id, post_hash_id): 'comments/Fireside_Post/%s/%s?%s=%d' % ( post_num_id, sort_by, 'scroll_id' if is_scrolled else 'page', scroll_id if is_scrolled else page), - post_hash_id, note='Downloading comments list page %d' % page) + post_hash_id, note=f'Downloading comments list page {page}') if not comments_data.get('comments'): break for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict): @@ -87,15 +87,15 @@ def _parse_post(self, post_data): 'uploader': user_data.get('display_name') or user_data.get('name'), 'uploader_id': user_data.get('username'), 'uploader_url': format_field(user_data, 'url', 'https://gamejolt.com%s'), - 'categories': [try_get(category, lambda x: '%s - %s' % (x['community']['name'], x['channel'].get('display_title') or x['channel']['title'])) - for category in post_data.get('communities' or [])], + 'categories': [try_get(category, lambda x: '{} - {}'.format(x['community']['name'], x['channel'].get('display_title') or x['channel']['title'])) + for category in post_data.get('communities') or []], 'tags': traverse_obj( lead_content, ('content', ..., 'content', ..., 'marks', ..., 'attrs', 'tag'), expected_type=str_or_none), 'like_count': int_or_none(post_data.get('like_count')), 'comment_count': int_or_none(post_data.get('comment_count'), 
default=0), 'timestamp': int_or_none(post_data.get('added_on'), scale=1000), 'release_timestamp': int_or_none(post_data.get('published_on'), scale=1000), - '__post_extractor': self.extract_comments(post_data.get('id'), post_id) + '__post_extractor': self.extract_comments(post_data.get('id'), post_id), } # TODO: Handle multiple videos/embeds? @@ -152,7 +152,7 @@ def _parse_post(self, post_data): 'height': media.get('height') if url_key == 'img_url' else None, 'filesize': media.get('filesize') if url_key == 'img_url' else None, 'acodec': 'none', - } for url_key in ('img_url', 'mediaserver_url', 'mediaserver_url_mp4', 'mediaserver_url_webm') if media.get(url_key)] + } for url_key in ('img_url', 'mediaserver_url', 'mediaserver_url_mp4', 'mediaserver_url_webm') if media.get(url_key)], }) if gif_entries: return { @@ -192,7 +192,7 @@ class GameJoltIE(GameJoltBaseIE): 'like_count': int, 'comment_count': int, 'view_count': int, - } + }, }, { # YouTube embed 'url': 'https://gamejolt.com/p/hey-hey-if-there-s-anyone-who-s-looking-to-get-into-learning-a-n6g4jzpq', @@ -220,7 +220,7 @@ class GameJoltIE(GameJoltBaseIE): 'upload_date': '20211015', 'view_count': int, 'chapters': 'count:18', - } + }, }, { # Article 'url': 'https://gamejolt.com/p/i-fuckin-broke-chaos-d56h3eue', @@ -243,7 +243,7 @@ class GameJoltIE(GameJoltBaseIE): 'like_count': int, 'comment_count': int, 'view_count': int, - } + }, }, { # Single GIF 'url': 'https://gamejolt.com/p/hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8', @@ -267,9 +267,9 @@ class GameJoltIE(GameJoltBaseIE): 'id': 'dszyjnwi', 'ext': 'webm', 'title': 'gif-presentacion-mejorado-dszyjnwi', - 'n_entries': 1, - } - }] + }, + }], + 'playlist_count': 1, }, { # Multiple GIFs 'url': 'https://gamejolt.com/p/gif-yhsqkumq', @@ -310,7 +310,7 @@ def _entries(self, endpoint, list_id, note='Downloading post list', errnote='Una endpoint, list_id, note=f'{note} page {page_num}', errnote=errnote, data=json.dumps({ 'scrollDirection': 'from', 'scrollId': scroll_id, - }).encode('utf-8')).get('items') + }).encode()).get('items') class GameJoltUserIE(GameJoltPostListBaseIE): @@ -348,7 +348,7 @@ class GameJoltGameIE(GameJoltPostListBaseIE): 'info_dict': { 'id': '655124', 'title': 'Friday Night Funkin\': Friday 4 Fun', - 'description': 'md5:576a7dd87912a2dcf33c50d2bd3966d3' + 'description': 'md5:576a7dd87912a2dcf33c50d2bd3966d3', }, 'params': { 'ignore_no_formats_error': True, @@ -374,7 +374,6 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'info_dict': { 'id': '657899', 'title': 'Friday Night Funkin\': Vs Oswald', - 'n_entries': None, }, 'playlist': [{ 'info_dict': { @@ -384,8 +383,7 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'url': r're:^https://.+vs-oswald-menu-music\.mp3$', 'release_timestamp': 1635190816, 'release_date': '20211025', - 'n_entries': 3, - } + }, }, { 'info_dict': { 'id': '184435', @@ -394,8 +392,7 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'url': r're:^https://.+rabbit-s-luck--full-version-\.mp3$', 'release_timestamp': 1635190841, 'release_date': '20211025', - 'n_entries': 3, - } + }, }, { 'info_dict': { 'id': '185228', @@ -404,9 +401,9 @@ class GameJoltGameSoundtrackIE(GameJoltBaseIE): 'url': r're:^https://.+last-straw\.mp3$', 'release_timestamp': 1635881104, 'release_date': '20211102', - 'n_entries': 3, - } - }] + }, + }], + 'playlist_count': 3, }] def _real_extract(self, url): @@ -430,7 +427,7 @@ class GameJoltCommunityIE(GameJoltPostListBaseIE): 'info_dict': { 'id': 'fnf/videos', 'title': 'Friday Night Funkin\' - Videos', - 
'description': 'md5:6d8c06f27460f7d35c1554757ffe53c8' + 'description': 'md5:6d8c06f27460f7d35c1554757ffe53c8', }, 'params': { 'playlistend': 50, @@ -443,7 +440,7 @@ class GameJoltCommunityIE(GameJoltPostListBaseIE): 'info_dict': { 'id': 'youtubers/featured', 'title': 'Youtubers - featured', - 'description': 'md5:53e5582c93dcc467ab597bfca4db17d4' + 'description': 'md5:53e5582c93dcc467ab597bfca4db17d4', }, 'params': { 'playlistend': 50, @@ -531,7 +528,7 @@ def _search_entries(self, query, filter_mode, display_query): def _real_extract(self, url): filter_mode, query = self._match_valid_url(url).group('filter', 'id') - display_query = compat_urllib_parse_unquote(query) + display_query = urllib.parse.unquote(query) return self.playlist_result( self._search_entries(query, filter_mode, display_query) if filter_mode else self._entries( f'web/posts/fetch/search/{query}', display_query, initial_items=self._call_api( diff --git a/yt_dlp/extractor/gamespot.py b/yt_dlp/extractor/gamespot.py index 8dec2522c6..cd3f9655d8 100644 --- a/yt_dlp/extractor/gamespot.py +++ b/yt_dlp/extractor/gamespot.py @@ -1,5 +1,6 @@ +import urllib.parse + from .once import OnceIE -from ..compat import compat_urllib_parse_unquote class GameSpotIE(OnceIE): @@ -40,7 +41,7 @@ def _real_extract(self, url): data_video = self._parse_json(self._html_search_regex( r'data-video=(["\'])({.*?})\1', webpage, 'video data', group=2), page_id) - title = compat_urllib_parse_unquote(data_video['title']) + title = urllib.parse.unquote(data_video['title']) streams = data_video['videoStreams'] formats = [] diff --git a/yt_dlp/extractor/gamestar.py b/yt_dlp/extractor/gamestar.py index e9966f5327..8e3b8a5da7 100644 --- a/yt_dlp/extractor/gamestar.py +++ b/yt_dlp/extractor/gamestar.py @@ -19,7 +19,7 @@ class GameStarIE(InfoExtractor): 'timestamp': 1406542380, 'upload_date': '20140728', 'duration': 17, - } + }, }, { 'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html', 'only_matching': True, @@ -42,7 +42,7 @@ def _real_extract(self, url): webpage, 'JSON-LD', group='json_ld'), video_id) info_dict = self._json_ld(json_ld, video_id) info_dict['title'] = remove_end( - info_dict['title'], ' - Game%s' % site.title()) + info_dict['title'], f' - Game{site.title()}') view_count = int_or_none(json_ld.get('interactionCount')) comment_count = int_or_none(self._html_search_regex( @@ -54,7 +54,7 @@ def _real_extract(self, url): 'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id, 'ext': 'mp4', 'view_count': view_count, - 'comment_count': comment_count + 'comment_count': comment_count, }) return info_dict diff --git a/yt_dlp/extractor/gaskrank.py b/yt_dlp/extractor/gaskrank.py index e0bbdae0a2..beb5a8a82f 100644 --- a/yt_dlp/extractor/gaskrank.py +++ b/yt_dlp/extractor/gaskrank.py @@ -1,4 +1,5 @@ import re + from .common import InfoExtractor from ..utils import ( float_or_none, @@ -21,8 +22,7 @@ class GaskrankIE(InfoExtractor): 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', 'uploader_id': 'Bikefun', 'upload_date': '20170110', - 'uploader_url': None, - } + }, }, { 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', @@ -36,7 +36,7 @@ class GaskrankIE(InfoExtractor): 'uploader_id': 'IOM', 'upload_date': '20170523', 'uploader_url': 'www.iomtt.com', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/gazeta.py 
b/yt_dlp/extractor/gazeta.py index c6868a6722..b1b6ee7714 100644 --- a/yt_dlp/extractor/gazeta.py +++ b/yt_dlp/extractor/gazeta.py @@ -2,6 +2,7 @@ class GazetaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' _TESTS = [{ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', @@ -32,7 +33,7 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') - embed_url = '%s?p=embed' % mobj.group('url') + embed_url = '{}?p=embed'.format(mobj.group('url')) embed_page = self._download_webpage( embed_url, display_id, 'Downloading embed page') @@ -40,4 +41,4 @@ def _real_extract(self, url): r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id') return self.url_result( - 'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform') + f'eagleplatform:gazeta.media.eagleplatform.com:{video_id}', 'EaglePlatform') diff --git a/yt_dlp/extractor/gbnews.py b/yt_dlp/extractor/gbnews.py new file mode 100644 index 0000000000..d652566370 --- /dev/null +++ b/yt_dlp/extractor/gbnews.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + get_elements_html_by_class, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class GBNewsIE(InfoExtractor): + IE_DESC = 'GB News clips, features and live streams' + _VALID_URL = r'https?://(?:www\.)?gbnews\.(?:uk|com)/(?:\w+/)?(?P<id>[^#?]+)' + + _PLATFORM = 'safari' + _SSMP_URL = 'https://mm-v2.simplestream.com/ssmp/api.php' + _TESTS = [{ + 'url': 'https://www.gbnews.com/news/bbc-claudine-gay-harvard-university-antisemitism-row', + 'info_dict': { + 'id': '52264136', + 'ext': 'mp4', + 'thumbnail': r're:https?://www\.gbnews\.\w+/.+\.(?:jpe?g|png|webp)', + 'display_id': 'bbc-claudine-gay-harvard-university-antisemitism-row', + 'description': 'The post was criticised by former employers of the broadcaster', + 'title': 'BBC deletes post after furious backlash over headline downplaying antisemitism', + }, + }, { + 'url': 'https://www.gbnews.com/royal/prince-harry-in-love-with-kate-meghan-markle-jealous-royal', + 'info_dict': { + 'id': '52328390', + 'ext': 'mp4', + 'thumbnail': r're:https?://www\.gbnews\.\w+/.+\.(?:jpe?g|png|webp)', + 'display_id': 'prince-harry-in-love-with-kate-meghan-markle-jealous-royal', + 'description': 'Ingrid Seward has published 17 books documenting the highs and lows of the Royal Family', + 'title': 'Royal author claims Prince Harry was \'in love\' with Kate - Meghan was \'jealous\'', + }, + }, { + 'url': 'https://www.gbnews.uk/watchlive', + 'info_dict': { + 'id': '1069', + 'ext': 'mp4', + 'thumbnail': r're:https?://www\.gbnews\.\w+/.+\.(?:jpe?g|png|webp)', + 'display_id': 'watchlive', + 'live_status': 'is_live', + 'title': r're:^GB News Live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _SS_ENDPOINTS = None + + def _get_ss_endpoint(self, data_id, data_env): + if not self._SS_ENDPOINTS: + self._SS_ENDPOINTS = {} + + if not data_id: + data_id = 'GB003' + if not data_env: + data_env = 'production' + key = data_id, data_env + result = self._SS_ENDPOINTS.get(key) + if result: + return result + + json_data = self._download_json( + self._SSMP_URL, None, 'Downloading Simplestream JSON metadata', query={ + 'id': data_id, + 'env': data_env, + }) + meta_url = traverse_obj(json_data, ('response', 'api_hostname', {url_or_none})) + if not meta_url: + raise ExtractorError('No 
API host found') + + self._SS_ENDPOINTS[key] = meta_url + return meta_url + + def _real_extract(self, url): + display_id = self._match_id(url).rpartition('/')[2] + webpage = self._download_webpage(url, display_id) + + video_data = None + elements = get_elements_html_by_class('simplestream', webpage) + for html_tag in elements: + attributes = extract_attributes(html_tag) + if 'sidebar' not in (attributes.get('class') or ''): + video_data = attributes + if not video_data: + raise ExtractorError('Could not find video element', expected=True) + + endpoint_url = self._get_ss_endpoint(video_data.get('data-id'), video_data.get('data-env')) + + uvid = video_data['data-uvid'] + video_type = video_data.get('data-type') + if not video_type or video_type == 'vod': + video_type = 'show' + stream_data = self._download_json( + f'{endpoint_url}/api/{video_type}/stream/{uvid}', + uvid, 'Downloading stream JSON', query={ + 'key': video_data.get('data-key'), + 'platform': self._PLATFORM, + }) + if traverse_obj(stream_data, 'drm'): + self.report_drm(uvid) + + return { + 'id': uvid, + 'display_id': display_id, + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + 'formats': self._extract_m3u8_formats(traverse_obj(stream_data, ( + 'response', 'stream', {url_or_none})), uvid, 'mp4'), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'is_live': video_type == 'live', + } diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py index 2878bbd88c..5d45240650 100644 --- a/yt_dlp/extractor/gdcvault.py +++ b/yt_dlp/extractor/gdcvault.py @@ -2,16 +2,12 @@ from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - HEADRequest, - remove_start, - sanitized_Request, - smuggle_url, - urlencode_postdata, -) +from ..networking import HEADRequest, Request +from ..utils import remove_start, smuggle_url, urlencode_postdata class GDCVaultIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?' 
_NETRC_MACHINE = 'gdcvault' _TESTS = [ @@ -22,8 +18,8 @@ class GDCVaultIE(InfoExtractor): 'id': '201311826596_AWNY', 'display_id': 'Doki-Doki-Universe-Sweet-Simple', 'ext': 'mp4', - 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)' - } + 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)', + }, }, { 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of', @@ -31,11 +27,11 @@ class GDCVaultIE(InfoExtractor): 'id': '201203272_1330951438328RSXR', 'display_id': 'Embracing-the-Dark-Art-of', 'ext': 'flv', - 'title': 'Embracing the Dark Art of Mathematical Modeling in AI' + 'title': 'Embracing the Dark Art of Mathematical Modeling in AI', }, 'params': { 'skip_download': True, # Requires rtmpdump - } + }, }, { 'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or', @@ -138,8 +134,8 @@ def _login(self, webpage_url, display_id): 'password': password, } - request = sanitized_Request(login_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(login_url, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' self._download_webpage(request, display_id, 'Logging in') start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') self._download_webpage(logout_url, display_id, 'Logging out') @@ -163,7 +159,7 @@ def _real_extract(self, url): video_url = 'http://www.gdcvault.com' + direct_url # resolve the url so that we can detect the correct extension video_url = self._request_webpage( - HEADRequest(video_url), video_id).geturl() + HEADRequest(video_url), video_id).url return { 'id': video_id, @@ -206,7 +202,7 @@ def _real_extract(self, url): 'display_id': display_id, }) return info - embed_url = '%s/xml/%s' % (xml_root, xml_name) + embed_url = f'{xml_root}/xml/{xml_name}' ie_key = 'DigitallySpeaking' return { diff --git a/yt_dlp/extractor/gedidigital.py b/yt_dlp/extractor/gedidigital.py index 1878d636d2..2ffa660307 100644 --- a/yt_dlp/extractor/gedidigital.py +++ b/yt_dlp/extractor/gedidigital.py @@ -109,7 +109,7 @@ def _sanitize_urls(urls): # add protocol if missing for i, e in enumerate(urls): if e.startswith('//'): - urls[i] = 'https:%s' % e + urls[i] = f'https:{e}' # clean iframes urls for i, e in enumerate(urls): urls[i] = urljoin(base_url(e), url_basename(e)) @@ -166,7 +166,7 @@ def _real_extract(self, url): 'abr': abr, 'tbr': abr, 'acodec': ext, - 'vcodec': 'none' + 'vcodec': 'none', }) else: mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 55e55d5248..04cffaa861 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -4,7 +4,7 @@ import urllib.parse import xml.etree.ElementTree -from .common import InfoExtractor # isort: split +from .common import InfoExtractor from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring @@ -14,7 +14,10 @@ ExtractorError, UnsupportedError, determine_ext, + determine_protocol, dict_get, + extract_basic_auth, + filter_dict, format_field, int_or_none, is_html, @@ -31,13 +34,16 @@ unescapeHTML, unified_timestamp, unsmuggle_url, + update_url_query, url_or_none, + urlhandle_detect_ext, urljoin, variadic, xpath_attr, xpath_text, xpath_with_ns, ) +from ..utils._utils import _UnsafeExtensionError class GenericIE(InfoExtractor): @@ -55,7 +61,9 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 
'title': 'trailer', 'upload_date': '20100513', - } + 'direct': True, + 'timestamp': 1273772943.0, + }, }, # Direct link to media delivered compressed (until Accept-Encoding is *) { @@ -68,7 +76,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20140522', }, 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' + 'URL could be a direct video link, returning it as such.', ], 'skip': 'URL invalid', }, @@ -98,10 +106,12 @@ class GenericIE(InfoExtractor): 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', + 'direct': True, + 'timestamp': 1416498816.0, }, 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] + 'URL could be a direct video link, returning it as such.', + ], }, # RSS feed { @@ -109,7 +119,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' + 'description': 're:.*groundbreaking video review series.*', }, 'playlist_mincount': 11, }, @@ -130,6 +140,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20201204', }, }], + 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { @@ -142,12 +153,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', + 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': 459, + 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, @@ -264,6 +275,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { @@ -275,6 +287,7 @@ class GenericIE(InfoExtractor): 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', + 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 @@ -315,14 +328,14 @@ class GenericIE(InfoExtractor): 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', - 'uploader_id': 'TheVerge', + 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, - } + }, }, { # redirect in Refresh HTTP header @@ -348,7 +361,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', - } + }, }, # bandcamp page with custom domain { @@ -362,46 +375,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - # ooyala video - { - 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'md5': '166dd577b433b4d4ebfee10b0824d8ff', - 'info_dict': { - 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', - 'ext': 'mp4', - 'title': '2cc213299525360.mov', # that's what we get - 'duration': 238.231, - }, - 'add_ie': ['Ooyala'], - }, - { - # ooyala video embedded with http://player.ooyala.com/iframe.js - 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', - 'info_dict': { - 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', - 'ext': 'mp4', - 'title': '"Steve Jobs: Man in the Machine" trailer', - 'description': 'The first trailer for the Alex 
Gibney documentary "Steve Jobs: Man in the Machine."', - 'duration': 135.427, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'movie expired', - }, - # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js - { - 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', - 'info_dict': { - 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', - 'ext': 'mp4', - 'title': 'Steampunk Fest Comes to Honesdale', - 'duration': 43.276, - }, - 'params': { - 'skip_download': True, - } - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -466,19 +439,19 @@ class GenericIE(InfoExtractor): 'id': '370908', 'title': 'Госзаказ. День 3', 'ext': 'mp4', - } + }, }, { 'info_dict': { 'id': '370905', 'title': 'Госзаказ. День 2', 'ext': 'mp4', - } + }, }, { 'info_dict': { 'id': '370902', 'title': 'Госзаказ. День 1', 'ext': 'mp4', - } + }, }], 'params': { # m3u8 download @@ -494,7 +467,8 @@ class GenericIE(InfoExtractor): 'title': 'Ужастики, русский трейлер (2015)', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 153, - } + }, + 'skip': 'Site dead', }, # XHamster embed { @@ -518,7 +492,7 @@ class GenericIE(InfoExtractor): 'title': 'Hidden miracles of the natural world', 'uploader': 'Louie Schwartzberg', 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', - } + }, }, # nowvideo embed hidden behind percent encoding { @@ -543,7 +517,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20140320', }, 'params': { - 'skip_download': 'Requires rtmpdump' + 'skip_download': 'Requires rtmpdump', }, 'skip': 'video gone', }, @@ -564,8 +538,8 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': [ - 'Forbidden' - ] + 'Forbidden', + ], }, # Condé Nast embed { @@ -575,7 +549,7 @@ class GenericIE(InfoExtractor): 'id': '53501be369702d3275860000', 'ext': 'mp4', 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', - } + }, }, # Dailymotion embed { @@ -622,7 +596,7 @@ class GenericIE(InfoExtractor): 'add_ie': ['Youtube'], 'params': { 'skip_download': True, - } + }, }, # MTVServices embed { @@ -651,7 +625,7 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, # Flowplayer { @@ -663,7 +637,7 @@ class GenericIE(InfoExtractor): 'age_limit': 18, 'uploader': 'www.handjobhub.com', 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', - } + }, }, # MLB embed { @@ -707,7 +681,7 @@ class GenericIE(InfoExtractor): 'uploader': 'Sophos Security', 'title': 'Chet Chat 171 - Oct 29, 2014', 'upload_date': '20141029', - } + }, }, # Soundcloud multiple embeds { @@ -741,7 +715,7 @@ class GenericIE(InfoExtractor): 'ext': 'flv', 'upload_date': '20141112', 'title': 'Rosetta #CometLanding webcast HL 10', - } + }, }, # Another Livestream embed, without 'new.' 
in URL { @@ -766,15 +740,17 @@ class GenericIE(InfoExtractor): 'playlist_mincount': 1, 'add_ie': ['Youtube'], }, - # Cinchcast embed + # Libsyn embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', 'info_dict': { - 'id': '7141703', + 'id': '3793998', 'ext': 'mp3', 'upload_date': '20141126', - 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', - } + 'title': 'Underground Wellness Radio - Jack Tips: 5 Steps to Permanent Gut Healing', + 'thumbnail': 'https://assets.libsyn.com/secure/item/3793998/?height=90&width=90', + 'duration': 3989.0, + }, }, # Cinerama player { @@ -784,7 +760,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'uploader': 'www.abc.net.au', 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', - } + }, }, # embedded viddler video { @@ -865,7 +841,7 @@ class GenericIE(InfoExtractor): }, }, { - # Video.js embed, multiple formats + # Youtube embed, formerly: Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', 'info_dict': { 'id': 'yygqldloqIk', @@ -892,6 +868,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # rtl.nl embed { @@ -900,7 +877,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'aanslagen-kopenhagen', 'title': 'Aanslagen Kopenhagen', - } + }, }, # Zapiks embed { @@ -909,7 +886,7 @@ class GenericIE(InfoExtractor): 'id': '118046', 'ext': 'mp4', 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', - } + }, }, # Kaltura embed (different embed code) { @@ -948,11 +925,11 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], 'expected_warnings': [ - 'Could not send HEAD request' + 'Could not send HEAD request', ], 'params': { 'skip_download': True, - } + }, }, { # Kaltura embedded, some fileExt broken (#11480) @@ -1079,7 +1056,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '8RUoRhRi', 'ext': 'mp4', - 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!", + 'title': 'Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!', 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', 'timestamp': 1428207000, 'upload_date': '20150405', @@ -1155,7 +1132,7 @@ class GenericIE(InfoExtractor): 'uploader': 'clickhole', 'upload_date': '20150527', 'timestamp': 1432744860, - } + }, }, # SnagFilms embed { @@ -1164,7 +1141,7 @@ class GenericIE(InfoExtractor): 'id': '74849a00-85a9-11e1-9660-123139220831', 'ext': 'mp4', 'title': '#whilewewatch', - } + }, }, # AdobeTVVideo embed { @@ -1460,7 +1437,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20211217', 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/tsiodras-mitsotakis-1024x545.jpg', }, - }] + }], }, { 'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/', @@ -1554,16 +1531,6 @@ class GenericIE(InfoExtractor): 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', }, }, - { - # vzaar embed - 'url': 'http://help.vzaar.com/article/165-embedding-video', - 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', - 'info_dict': { - 'id': '8707641', - 'ext': 'mp4', - 'title': 'Building A Business Online: Principal Chairs Q & A', - }, - }, { # multiple HTML5 videos on one page 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', @@ -1581,7 +1548,7 @@ class GenericIE(InfoExtractor): 'id': '0f64ce6', 'title': 'vl14062007715967', 'ext': 'mp4', - } + }, }, { 'url': 
'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', @@ -1593,7 +1560,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', 'timestamp': 1474354800, 'upload_date': '20160920', - } + }, }, { 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', @@ -1685,7 +1652,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '83645793', 'title': 'Lock up and get excited', - 'ext': 'mp4' + 'ext': 'mp4', }, 'skip': 'TODO: fix nested playlists processing in tests', }, @@ -1761,7 +1728,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20220110', 'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/70bc39fa-895b-4918-a364-c39d2135fc6d.jpg', - } + }, }, { # blogger embed @@ -1931,8 +1898,8 @@ class GenericIE(InfoExtractor): 'timestamp': 1501941939.0, 'title': 'That small heart attack.', 'upload_date': '20170805', - 'uploader': 'Antw87' - } + 'uploader': 'Antw87', + }, }, { # 1080p Reddit-hosted video that will redirect and be processed by RedditIE @@ -1944,8 +1911,8 @@ class GenericIE(InfoExtractor): 'title': "The game Didn't want me to Knife that Guy I guess", 'uploader': 'paraf1ve', 'timestamp': 1636788683.0, - 'upload_date': '20211113' - } + 'upload_date': '20211113', + }, }, { # MainStreaming player @@ -1957,15 +1924,15 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'live_status': 'not_live', 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster', - 'duration': 1512 - } + 'duration': 1512, + }, }, { # Multiple gfycat iframe embeds 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422', 'info_dict': { 'title': '재이, 윤, 세은 황금 드레스를 입고 빛난다', - 'id': 'board' + 'id': 'board', }, 'playlist_count': 8, }, @@ -1974,18 +1941,18 @@ class GenericIE(InfoExtractor): 'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=612199', 'info_dict': { 'title': '옳게 된 크롭 니트 스테이씨 아이사', - 'id': 'board' + 'id': 'board', }, - 'playlist_count': 6 + 'playlist_count': 6, }, { # Multiple gfycat embeds, with uppercase "IFR" in urls 'url': 'https://kkzz.kr/?vid=2295', 'info_dict': { 'title': '지방시 앰버서더 에스파 카리나 움짤', - 'id': '?vid=2295' + 'id': '?vid=2295', }, - 'playlist_count': 9 + 'playlist_count': 9, }, { # Panopto embeds @@ -2018,9 +1985,9 @@ class GenericIE(InfoExtractor): 'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html', 'info_dict': { 'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä', - 'id': 'art-2000008762560' + 'id': 'art-2000008762560', }, - 'playlist_count': 3 + 'playlist_count': 3, }, { # Ruutu embed in hs.fi with a single video @@ -2049,7 +2016,7 @@ class GenericIE(InfoExtractor): 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', 'timestamp': 1652833414, 'age_limit': 0, - } + }, }, { 'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details', @@ -2089,7 +2056,7 @@ class GenericIE(InfoExtractor): 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', 'duration': 5688.0, 'upload_date': '20210111', - } + }, }, { 'note': 'JSON LD with multiple @type', @@ -2105,7 +2072,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20200411', 'age_limit': 0, 'duration': 111.0, - } + }, }, { 'note': 'JSON LD with unexpected data type', @@ -2120,7 +2087,7 @@ class GenericIE(InfoExtractor): 'thumbnail': 
r're:^https://media.autoweek.nl/m/.+\.jpg$', 'age_limit': 0, 'direct': True, - } + }, }, { 'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.', @@ -2138,22 +2105,6 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - { - 'note': 'JW Player embed with unicode-escape sequences in URL', - 'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics', - 'info_dict': { - 'id': 'm', - 'ext': 'mp4', - 'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi', - 'description': 'Mahler\'s ', - 'uploader': 'www.medici.tv', - 'age_limit': 0, - 'thumbnail': r're:^https?://.+\.jpg', - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', 'md5': 'e2f0a4c329f7986280b7328e24036d60', @@ -2167,11 +2118,38 @@ class GenericIE(InfoExtractor): 'age_limit': 18, }, }, + { + 'note': 'Live HLS direct link', + 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8', + 'info_dict': { + 'id': 'index', + 'title': r're:index', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + 'note': 'Video.js VOD HLS', + 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html', + 'info_dict': { + 'id': 'videojs_hls_test', + 'title': 'video', + 'ext': 'mp4', + 'age_limit': 0, + 'duration': 1800, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def report_following_redirect(self, new_url): """Report information extraction.""" - self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) + self._downloader.to_screen(f'[redirect] Following redirect to {new_url}') def report_detected(self, name, num=1, note=None): if num > 1: @@ -2183,12 +2161,49 @@ def report_detected(self, name, num=1, note=None): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') - def _fragment_query(self, url): - if self._configuration_arg('fragment_query'): - query_string = urllib.parse.urlparse(url).query - if query_string: - return {'extra_param_to_segment_url': query_string} - return {} + def _extra_manifest_info(self, info, manifest_url): + fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0] + if fragment_query is not None: + info['extra_param_to_segment_url'] = ( + urllib.parse.urlparse(fragment_query).query or fragment_query + or urllib.parse.urlparse(manifest_url).query or None) + + key_query = self._configuration_arg('key_query', [None], casesense=True)[0] + if key_query is not None: + info['extra_param_to_key_url'] = ( + urllib.parse.urlparse(key_query).query or key_query + or urllib.parse.urlparse(manifest_url).query or None) + + def hex_or_none(value): + return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None + + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), { + 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), + }) or None + + variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0] + if variant_query is not None: + query = urllib.parse.parse_qs( + urllib.parse.urlparse(variant_query).query or variant_query + or urllib.parse.urlparse(manifest_url).query) + for fmt in self._downloader._get_formats(info): + fmt['url'] = update_url_query(fmt['url'], 
query) + + # Attempt to detect live HLS or set VOD duration + m3u8_format = next((f for f in self._downloader._get_formats(info) + if determine_protocol(f) == 'm3u8_native'), None) + if m3u8_format: + is_live = self._configuration_arg('is_live', [None])[0] + if is_live is not None: + info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' + return + headers = m3u8_format.get('http_headers') or info.get('http_headers') + duration = self._extract_m3u8_vod_duration( + m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', + errnote='Failed to download m3u8 media playlist', headers=headers) + if not duration: + info['live_status'] = 'is_live' + info['duration'] = info.get('duration') or duration def _extract_rss(self, url, video_id, doc): NS_MAP = { @@ -2238,29 +2253,29 @@ def _kvs_get_real_url(cls, video_url, license_code): return video_url # not obfuscated parsed = urllib.parse.urlparse(video_url[len('function/0/'):]) - license = cls._kvs_get_license_token(license_code) + license_token = cls._kvs_get_license_token(license_code) urlparts = parsed.path.split('/') HASH_LENGTH = 32 - hash = urlparts[3][:HASH_LENGTH] + hash_ = urlparts[3][:HASH_LENGTH] indices = list(range(HASH_LENGTH)) # Swap indices of hash according to the destination calculated from the license token accum = 0 for src in reversed(range(HASH_LENGTH)): - accum += license[src] + accum += license_token[src] dest = (src + accum) % HASH_LENGTH indices[src], indices[dest] = indices[dest], indices[src] - urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:] + urlparts[3] = ''.join(hash_[index] for index in indices) + urlparts[3][HASH_LENGTH:] return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts))) @staticmethod - def _kvs_get_license_token(license): - license = license.replace('$', '') - license_values = [int(char) for char in license] + def _kvs_get_license_token(license_code): + license_code = license_code.replace('$', '') + license_values = [int(char) for char in license_code] - modlicense = license.replace('0', '1') + modlicense = license_code.replace('0', '1') center = len(modlicense) // 2 fronthalf = int(modlicense[:center + 1]) backhalf = int(modlicense[center:]) @@ -2310,7 +2325,7 @@ def _extract_kvs(self, url, webpage, video_id): 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': urljoin(url, thumbnail), 'formats': formats, } @@ -2332,18 +2347,17 @@ def _real_extract(self, url): if default_search == 'auto_warning': if re.match(r'^(?:url|URL)$', url): raise ExtractorError( - 'Invalid URL: %r . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, + f'Invalid URL: {url!r} . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ', expected=True) else: self.report_warning( - 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) + f'Falling back to youtube search for {url} . Set --default-search "auto" to suppress this warning.') return self.url_result('ytsearch:' + url) if default_search in ('error', 'fixup_error'): raise ExtractorError( - '%r is not a valid URL. ' - 'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:%s" ) to search YouTube' - % (url, url), expected=True) + f'{url!r} is not a valid URL. 
' + f'Set --default-search "ytsearch" (or run yt-dlp "ytsearch:{url}" ) to search YouTube', expected=True) else: if ':' not in default_search: default_search += ':' @@ -2367,14 +2381,12 @@ def _real_extract(self, url): # to accept raw bytes and being able to download only a chunk. # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. - full_response = self._request_webpage(url, video_id, headers={ + full_response = self._request_webpage(url, video_id, headers=filter_dict({ 'Accept-Encoding': 'identity', - **smuggled_data.get('http_headers', {}) - }) - new_url = full_response.geturl() - if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl(): - url = new_url - elif url != new_url: + 'Referer': smuggled_data.get('referer'), + })) + new_url = full_response.url + if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) if force_videoid: new_url = smuggle_url(new_url, {'force_videoid': force_videoid}) @@ -2383,7 +2395,7 @@ def _real_extract(self, url): info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')), } # Check for direct link to a video @@ -2391,22 +2403,22 @@ def _real_extract(self, url): m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: self.report_detected('direct video link') - headers = smuggled_data.get('http_headers', {}) + headers = filter_dict({'Referer': smuggled_data.get('referer')}) format_id = str(m.group('format_id')) + ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response) subtitles = {} - if format_id.endswith('mpegurl'): + if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) - info_dict.update(self._fragment_query(url)) - elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + elif format_id.endswith(('mpd', 'dash+xml')) or ext == 'mpd': formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) - info_dict.update(self._fragment_query(url)) - elif format_id == 'f4m': + elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: formats = [{ 'format_id': format_id, 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None + 'ext': ext, + 'vcodec': 'none' if m.group('type') == 'audio' else None, }] info_dict['direct'] = True info_dict.update({ @@ -2414,6 +2426,7 @@ def _real_extract(self, url): 'subtitles': subtitles, 'http_headers': headers or None, }) + self._extra_manifest_info(info_dict, url) return info_dict if not self.get_param('test', False) and not is_intentional: @@ -2426,7 +2439,7 @@ def _real_extract(self, url): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) return info_dict # Maybe it's a direct link to a video? 
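The KVS helper renames a few hunks up (`license` → `license_code`, `hash` → `hash_`) keep the deobfuscation algorithm itself unchanged: the digits of the license token drive a running accumulator, and each position of the 32-character hash segment is swapped with a destination index derived from it. A minimal self-contained sketch of that swap loop, assuming the token has already been expanded to a list of at least 32 digits (which is what `_kvs_get_license_token` returns); the helper name here is illustrative:

```python
HASH_LENGTH = 32

def kvs_unscramble(hash_segment: str, license_token: list[int]) -> str:
    """Recover the real hash from an obfuscated KVS 'function/0/' URL path.

    Mirrors the swap loop in GenericIE._kvs_get_real_url: walk the first 32
    characters from last to first, swapping each position with a destination
    computed from a running sum of license-token digits.
    """
    indices = list(range(HASH_LENGTH))
    accum = 0
    for src in reversed(range(HASH_LENGTH)):
        accum += license_token[src]
        dest = (src + accum) % HASH_LENGTH
        indices[src], indices[dest] = indices[dest], indices[src]
    # Anything beyond the 32-character hash (e.g. a trailing suffix) passes through
    return ''.join(hash_segment[i] for i in indices) + hash_segment[HASH_LENGTH:]
```

As in `_kvs_get_real_url`, the unscrambled value replaces the third path component of the parsed URL, and the rest of the URL is reassembled unchanged.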
@@ -2434,9 +2447,13 @@ def _real_extract(self, url): if not is_html(first_bytes): self.report_warning( 'URL could be a direct video link, returning it as such.') + ext = determine_ext(url) + if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS: + ext = 'unknown_video' info_dict.update({ 'direct': True, 'url': url, + 'ext': ext, }) return info_dict @@ -2453,7 +2470,7 @@ def _real_extract(self, url): try: doc = compat_etree_fromstring(webpage) except xml.etree.ElementTree.ParseError: - doc = compat_etree_fromstring(webpage.encode('utf-8')) + doc = compat_etree_fromstring(webpage.encode()) if doc.tag == 'rss': self.report_detected('RSS feed') return self._extract_rss(url, video_id, doc) @@ -2470,14 +2487,14 @@ def _real_extract(self, url): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.geturl()), + xspf_base_url=full_response.url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=full_response.url.rpartition('/')[0], mpd_url=url) - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2503,7 +2520,7 @@ def _real_extract(self, url): self._downloader.write_debug('Looking for embeds') embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) if len(embeds) == 1: - return {**info_dict, **embeds[0]} + return merge_dicts(embeds[0], info_dict) elif embeds: return self.playlist_result(embeds, **info_dict) raise UnsupportedError(url) @@ -2513,7 +2530,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) url, smuggled_data = unsmuggle_url(url, {}) - actual_url = urlh.geturl() if urlh else url + actual_url = urlh.url if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. 
https://github.com/ytdl-org/youtube-dl/issues/2448) @@ -2566,8 +2583,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): varname = mobj.group(1) sources = variadic(self._parse_json( mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) - formats = [] - subtitles = {} + formats, subtitles, src = [], {}, None for source in sources: src = source.get('src') if not src or not isinstance(src, str): @@ -2590,8 +2606,6 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - for fmt in formats: - fmt.update(self._fragment_query(src)) if not formats: formats.append({ @@ -2604,14 +2618,14 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): }) # https://docs.videojs.com/player#addRemoteTextTrack # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement - for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): + for sub_match in re.finditer(rf'(?s){re.escape(varname)}' + r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): sub = self._parse_json( sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} - src = str_or_none(sub.get('src')) - if not src: + sub_src = str_or_none(sub.get('src')) + if not sub_src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': urllib.parse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, sub_src), 'name': sub.get('label'), 'http_headers': { 'Referer': actual_url, @@ -2619,7 +2633,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): }) if formats or subtitles: self.report_detected('video.js embed') - return [{'formats': formats, 'subtitles': subtitles}] + info_dict = {'formats': formats, 'subtitles': subtitles} + if formats: + self._extra_manifest_info(info_dict, src) + return [info_dict] # Look for generic KVS player (before json-ld bc of some urls that break otherwise) found = self._search_regex(( @@ -2642,7 +2659,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): 'url': smuggle_url(json_ld['url'], { 'force_videoid': video_id, 'to_generic': True, - 'http_headers': {'Referer': url}, + 'referer': url, }), }, json_ld)] @@ -2723,7 +2740,7 @@ def filter_video(urls): REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( r'(?i)[^?/#]+)-lyrics[?/#]?' 
+    _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<display_id>[^?/#]+)-lyrics(?:[?/#]|$)'
     _TESTS = [{
         'url': 'https://genius.com/Lil-baby-heyy-lyrics',
         'playlist_mincount': 2,
diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py
new file mode 100644
index 0000000000..53b881011c
--- /dev/null
+++ b/yt_dlp/extractor/getcourseru.py
@@ -0,0 +1,178 @@
+import re
+import time
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, int_or_none, url_or_none, urlencode_postdata
+from ..utils.traversal import traverse_obj
+
+
+class GetCourseRuPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+'
+    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL}[^\'"]*)']
+    _TESTS = [{
+        'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag',
+        'info_dict': {
+            'id': '513573381',
+            'title': '190bdf93f1b29735309853a7a19e24b3',
+            'ext': 'mp4',
+            'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
+            'duration': 1693,
+        },
+        'skip': 'JWT expired',
+    }]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, None, 'Downloading player page')
+        window_configs = self._search_json(
+            r'window\.configs\s*=', webpage, 'config', None)
+        video_id = str(window_configs['gcFileId'])
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            window_configs['masterPlaylistUrl'], video_id)
+
+        return {
+            **traverse_obj(window_configs, {
+                'title': ('videoHash', {str}),
+                'thumbnail': ('previewUrl', {url_or_none}),
+                'duration': ('videoDuration', {int_or_none}),
+            }),
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class GetCourseRuIE(InfoExtractor):
+    _NETRC_MACHINE = 'getcourseru'
+    _DOMAINS = [
+        'academymel.online',
+        'marafon.mani-beauty.com',
+        'on.psbook.ru',
+    ]
+    _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})'
+    _VALID_URL = [
+        rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P<id>[^?#]+)',
+        rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)',
+    ]
+    _TESTS = [{
+        'url': 'http://academymel.online/3video_1',
+        'info_dict': {
+            'id': '3059742',
+            'display_id': '3video_1',
+            'title': 'Промоуроки Академии МЕЛ',
+        },
+        'playlist_count': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': '513573381',
+                'ext': 'mp4',
+                'title': 'Промоуроки Академии МЕЛ',
+                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
+                'duration': 1693,
+            },
+        }],
+    }, {
+        'url': 'https://academymel.getcourse.ru/3video_1',
+        'info_dict': {
+            'id': '3059742',
+            'display_id': '3video_1',
+            'title': 'Промоуроки Академии МЕЛ',
+        },
+        'playlist_count': 1,
+        'playlist': [{
+            'info_dict': {
+                'id': '513573381',
+                'ext': 'mp4',
+                'title': 'Промоуроки Академии МЕЛ',
+                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
+                'duration': 1693,
+            },
+        }],
+    }, {
+        'url': 
'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0', + 'info_dict': { + 'id': '319141781', + 'title': '1. Разминка у стены', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '4919601', + 'ext': 'mp4', + 'title': '1. Разминка у стены', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81', + 'duration': 704, + }, + }], + 'skip': 'paid lesson', + }, { + 'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894', + 'info_dict': { + 'id': '272499894', + 'title': 'Мотивация к тренировкам', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '447479687', + 'ext': 'mp4', + 'title': 'Мотивация к тренировкам', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71', + 'duration': 30, + }, + }], + 'skip': 'paid lesson', + }, { + 'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT', + 'only_matching': True, + }] + + _LOGIN_URL_PATH = '/cms/system/login' + + def _login(self, hostname, username, password): + if self._get_cookies(f'https://{hostname}').get('PHPSESSID5'): + return + login_url = f'https://{hostname}{self._LOGIN_URL_PATH}' + webpage = self._download_webpage(login_url, None) + + self._request_webpage( + login_url, None, 'Logging in', 'Failed to log in', + data=urlencode_postdata({ + 'action': 'processXdget', + 'xdgetId': self._html_search_regex( + r']+\bclass="[^"]*\bstate-login[^"]*"[^>]+\bdata-xdget-id="([^"]+)"', + webpage, 'xdgetId'), + 'params[action]': 'login', + 'params[url]': login_url, + 'params[object_type]': 'cms_page', + 'params[object_id]': -1, + 'params[email]': username, + 'params[password]': password, + 'requestTime': int(time.time()), + 'requestSimpleSign': self._html_search_regex( + r'window.requestSimpleSign\s*=\s*"([\da-f]+)"', webpage, 'simple sign'), + })) + + def _real_extract(self, url): + hostname = urllib.parse.urlparse(url).hostname + username, password = self._get_login_info(netrc_machine=hostname) + if username: + self._login(hostname, username, password) + + display_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, display_id) + if self._LOGIN_URL_PATH in urlh.url: + raise ExtractorError( + f'This video is only available for registered users. 
{self._login_hint("any", netrc=hostname)}', + expected=True) + + playlist_id = self._search_regex( + r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) + title = self._og_search_title(webpage) or self._html_extract_title(webpage) + + return self.playlist_from_matches( + re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), + playlist_id, title, display_id=display_id, ie=GetCourseRuPlayerIE, video_kwargs={ + 'url_transparent': True, + 'title': title, + }) diff --git a/yt_dlp/extractor/gettr.py b/yt_dlp/extractor/gettr.py index 7795dc56f7..2a9d5e7e98 100644 --- a/yt_dlp/extractor/gettr.py +++ b/yt_dlp/extractor/gettr.py @@ -1,7 +1,7 @@ from .common import InfoExtractor from ..utils import ( - bool_or_none, ExtractorError, + bool_or_none, dict_get, float_or_none, int_or_none, @@ -38,7 +38,7 @@ class GettrIE(GettrBaseIE): 'timestamp': 1632782451.058, 'duration': 58.5585, 'tags': ['hornofafrica', 'explorations'], - } + }, }, { 'url': 'https://gettr.com/post/p4iahp', 'info_dict': { @@ -53,7 +53,7 @@ class GettrIE(GettrBaseIE): 'timestamp': 1626594455.017, 'duration': 23, 'tags': 'count:12', - } + }, }, { # quote post 'url': 'https://gettr.com/post/pxn5b743a9', @@ -76,7 +76,7 @@ class GettrIE(GettrBaseIE): def _real_extract(self, url): post_id = self._match_id(url) webpage = self._download_webpage(url, post_id) - api_data = self._call_api('post/%s?incl="poststats|userinfo"' % post_id, post_id) + api_data = self._call_api(f'post/{post_id}?incl="poststats|userinfo"', post_id) post_data = api_data.get('data') user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']], dict) or {} @@ -106,7 +106,7 @@ def _real_extract(self, url): or self._search_regex(r'^(.+?) on GETTR', self._og_search_title(webpage, default=''), 'uploader', fatal=False)) if uploader: - title = '%s - %s' % (uploader, title) + title = f'{uploader} - {title}' formats, subtitles = self._extract_m3u8_formats_and_subtitles( urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4', @@ -157,7 +157,7 @@ class GettrStreamingIE(GettrBaseIE): 'title': 'Day 1: Opening Session of the Grand Jury Proceeding', 'timestamp': 1644080997.164, 'upload_date': '20220205', - } + }, }, { 'url': 'https://gettr.com/streaming/psfmeefcc1', 'info_dict': { @@ -172,12 +172,12 @@ class GettrStreamingIE(GettrBaseIE): 'duration': 21872.507, 'timestamp': 1643976662.858, 'upload_date': '20220204', - } + }, }] def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._call_api('live/join/%s' % video_id, video_id, data={}) + video_info = self._call_api(f'live/join/{video_id}', video_id, data={}) live_info = video_info['broadcast'] live_url = url_or_none(live_info.get('url')) diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py deleted file mode 100644 index edc2e56e44..0000000000 --- a/yt_dlp/extractor/gfycat.py +++ /dev/null @@ -1,145 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - float_or_none, - qualities, - ExtractorError, -) - - -class GfycatIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P[^-/?#\."\']+)' - _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P{_VALID_URL})'] - _TESTS = [{ - 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', - 'info_dict': { - 'id': 'DeadlyDecisiveGermanpinscher', - 'ext': 'mp4', - 'title': 'Ghost in the Shell', - 'timestamp': 1410656006, - 'upload_date': '20140914', - 'uploader': 'anonymous', - 'duration': 10.4, - 'view_count': 
int, - 'like_count': int, - 'categories': list, - 'age_limit': 0, - 'uploader_id': 'anonymous', - 'description': '', - } - }, { - 'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa', - 'info_dict': { - 'id': 'JauntyTimelyAmazontreeboa', - 'ext': 'mp4', - 'title': 'JauntyTimelyAmazontreeboa', - 'timestamp': 1411720126, - 'upload_date': '20140926', - 'uploader': 'anonymous', - 'duration': 3.52, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 0, - 'uploader_id': 'anonymous', - 'description': '', - } - }, { - 'url': 'https://gfycat.com/alienatedsolidgreathornedowl', - 'info_dict': { - 'id': 'alienatedsolidgreathornedowl', - 'ext': 'mp4', - 'upload_date': '20211226', - 'uploader_id': 'reactions', - 'timestamp': 1640536930, - 'like_count': int, - 'description': '', - 'title': 'Ingrid Michaelson, Zooey Deschanel - Merry Christmas Happy New Year', - 'categories': list, - 'age_limit': 0, - 'duration': 2.9583333333333335, - 'uploader': 'Reaction GIFs', - 'view_count': int, - } - }, { - 'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish', - 'only_matching': True - }, { - 'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull', - 'only_matching': True - }, { - 'url': 'https://gfycat.com/acceptablehappygoluckyharborporpoise-baseball', - 'only_matching': True - }, { - 'url': 'https://thumbs.gfycat.com/acceptablehappygoluckyharborporpoise-size_restricted.gif', - 'only_matching': True - }, { - 'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4', - 'only_matching': True - }, { - 'url': 'http://gfycat.com/IFR/JauntyTimelyAmazontreeboa', - 'only_matching': True - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - gfy = self._download_json( - 'https://api.gfycat.com/v1/gfycats/%s' % video_id, - video_id, 'Downloading video info') - if 'error' in gfy: - raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True) - gfy = gfy['gfyItem'] - - title = gfy.get('title') or gfy['gfyName'] - description = gfy.get('description') - timestamp = int_or_none(gfy.get('createDate')) - uploader = gfy.get('userName') or gfy.get('username') - view_count = int_or_none(gfy.get('views')) - like_count = int_or_none(gfy.get('likes')) - dislike_count = int_or_none(gfy.get('dislikes')) - age_limit = 18 if gfy.get('nsfw') == '1' else 0 - - width = int_or_none(gfy.get('width')) - height = int_or_none(gfy.get('height')) - fps = int_or_none(gfy.get('frameRate')) - num_frames = int_or_none(gfy.get('numFrames')) - - duration = float_or_none(num_frames, fps) if num_frames and fps else None - - categories = gfy.get('tags') or gfy.get('extraLemmas') or [] - - FORMATS = ('gif', 'webm', 'mp4') - quality = qualities(FORMATS) - - formats = [] - for format_id in FORMATS: - video_url = gfy.get('%sUrl' % format_id) - if not video_url: - continue - filesize = int_or_none(gfy.get('%sSize' % format_id)) - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'width': width, - 'height': height, - 'fps': fps, - 'filesize': filesize, - 'quality': quality(format_id), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'uploader': gfy.get('userDisplayName') or uploader, - 'uploader_id': uploader, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'categories': categories, - 'age_limit': age_limit, - 'formats': formats, - } diff --git a/yt_dlp/extractor/giantbomb.py b/yt_dlp/extractor/giantbomb.py index 
112572366b..259d39095e 100644 --- a/yt_dlp/extractor/giantbomb.py +++ b/yt_dlp/extractor/giantbomb.py @@ -22,7 +22,7 @@ class GiantBombIE(InfoExtractor): 'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24', 'duration': 2399, 'thumbnail': r're:^https?://.*\.jpg$', - } + }, }, { 'url': 'https://www.giantbomb.com/shows/ben-stranding/2970-20212', 'only_matching': True, diff --git a/yt_dlp/extractor/giga.py b/yt_dlp/extractor/giga.py deleted file mode 100644 index b59c129abf..0000000000 --- a/yt_dlp/extractor/giga.py +++ /dev/null @@ -1,93 +0,0 @@ -import itertools - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import parse_duration, parse_iso8601, qualities, str_to_int - - -class GigaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P[^/]+)' - _TESTS = [{ - 'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/', - 'md5': '6bc5535e945e724640664632055a584f', - 'info_dict': { - 'id': '2622086', - 'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss', - 'ext': 'mp4', - 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss', - 'description': 'md5:afdf5862241aded4718a30dff6a57baf', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 578, - 'timestamp': 1414749706, - 'upload_date': '20141031', - 'uploader': 'Robin Schweiger', - 'view_count': int, - }, - }, { - 'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/', - 'only_matching': True, - }, { - 'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/', - 'only_matching': True, - }, { - 'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'], - webpage, 'video id') - - playlist = self._download_json( - 'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/' - % video_id, video_id)[0] - - quality = qualities(['normal', 'hd720']) - - formats = [] - for format_id in itertools.count(0): - fmt = playlist.get(compat_str(format_id)) - if not fmt: - break - formats.append({ - 'url': fmt['src'], - 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), - 'quality': quality(fmt['quality']), - }) - - title = self._html_search_meta( - 'title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - - duration = parse_duration(self._search_regex( - r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?([^<]+)'.format(video_id), - webpage, 'duration', fatal=False)) - - timestamp = parse_iso8601(self._search_regex( - r'datetime="([^"]+)"', webpage, 'upload date', fatal=False)) - uploader = self._search_regex( - r'class="author">([^<]+)', webpage, 'uploader', fatal=False) - - view_count = str_to_int(self._search_regex( - r'([\d.,]+)', - webpage, 'view count', fatal=False)) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': 
uploader, - 'view_count': view_count, - 'formats': formats, - } diff --git a/yt_dlp/extractor/gigya.py b/yt_dlp/extractor/gigya.py index c5bc86bb4a..cc18ee67c2 100644 --- a/yt_dlp/extractor/gigya.py +++ b/yt_dlp/extractor/gigya.py @@ -1,5 +1,4 @@ from .common import InfoExtractor - from ..utils import ( ExtractorError, urlencode_postdata, @@ -16,5 +15,5 @@ def _gigya_login(self, auth_data): error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') if error_message: raise ExtractorError( - 'Unable to login: %s' % error_message, expected=True) + f'Unable to login: {error_message}', expected=True) return auth_info diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py index d114f3494c..b4c8681083 100644 --- a/yt_dlp/extractor/glide.py +++ b/yt_dlp/extractor/glide.py @@ -12,7 +12,7 @@ class GlideIE(InfoExtractor): 'ext': 'mp4', 'title': "Damon's Glide message", 'thumbnail': r're:^https?://.*?\.cloudfront\.net/.*\.jpg$', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/globalplayer.py b/yt_dlp/extractor/globalplayer.py new file mode 100644 index 0000000000..3d4a9304ca --- /dev/null +++ b/yt_dlp/extractor/globalplayer.py @@ -0,0 +1,254 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + join_nonempty, + parse_duration, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + urlhandle_detect_ext, +) + + +class GlobalPlayerBaseIE(InfoExtractor): + def _get_page_props(self, url, video_id): + webpage = self._download_webpage(url, video_id) + return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + + def _request_ext(self, url, video_id): + return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests + url, video_id, note='Determining source extension')) + + def _extract_audio(self, episode, series): + return { + 'vcodec': 'none', + **traverse_obj(series, { + 'series': 'title', + 'series_id': 'id', + 'thumbnail': 'imageUrl', + 'uploader': 'itunesAuthor', # podcasts only + }), + **traverse_obj(episode, { + 'id': 'id', + 'description': ('description', {clean_html}), + 'duration': ('duration', {parse_duration}), + 'thumbnail': 'imageUrl', + 'url': 'streamUrl', + 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}), + 'title': 'title', + }, get_all=False), + } + + +class GlobalPlayerLiveIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P\w+)/\w+' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/live/smoothchill/uk/', + 'info_dict': { + 'id': '2mx1E', + 'ext': 'aac', + 'display_id': 'smoothchill-uk', + 'title': 're:^Smooth Chill.+$', + 'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png', + 'description': 'Music To Chill To', + 'live_status': 'is_live', + }, + }, { + # national station + 'url': 'https://www.globalplayer.com/live/heart/uk/', + 'info_dict': { + 'id': '2mwx4', + 'ext': 'aac', + 'description': 'turn up the feel good!', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'live_status': 'is_live', + 'title': 're:^Heart UK.+$', + 'display_id': 'heart-uk', + }, + }, { + # regional variation + 'url': 'https://www.globalplayer.com/live/heart/london/', + 'info_dict': { + 'id': 'AMqg', + 'ext': 'aac', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'title': 're:^Heart London.+$', + 'live_status': 'is_live', + 'display_id': 'heart-london', + 'description': 'turn up the feel good!', + }, + }] + + 
def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['station'] + stream_url = station['streamUrl'] + + return { + 'id': station['id'], + 'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'), + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': (('name', 'brandName'), {str_or_none}), + 'description': 'tagline', + 'thumbnail': 'brandLogo', + }, get_all=False), + } + + +class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P\w+)' + _TESTS = [{ + # "live playlist" + 'url': 'https://www.globalplayer.com/playlists/8bLk/', + 'info_dict': { + 'id': '8bLk', + 'ext': 'aac', + 'live_status': 'is_live', + 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', + 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', + 'title': 're:^Classic FM Hall of Fame.+$', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['playlistData'] + stream_url = station['streamUrl'] + + return { + 'id': video_id, + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': 'title', + 'description': 'description', + 'thumbnail': 'image', + }), + } + + +class GlobalPlayerAudioIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?Ppodcasts)/|catchup/\w+/\w+/)(?P\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/42KuaM/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '42KuaM', + 'title': 'Filthy Ritual', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'categories': ['Society & Culture', 'True Crime'], + 'uploader': 'Global', + 'description': 'md5:da5b918eac9ae319454a10a563afacf9', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/', + 'playlist_mincount': 3, + 'info_dict': { + 'id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + series = props['podcastInfo'] if podcast else props['catchupInfo'] + + return { + '_type': 'playlist', + 'id': video_id, + 'entries': [self._extract_audio(ep, series) for ep in traverse_obj( + series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], + 'categories': traverse_obj(series, ('categories', ..., 'name')) or None, + **traverse_obj(series, { + 'description': 'description', + 'thumbnail': 'imageUrl', + 'title': 'title', + 'uploader': 'itunesAuthor', # podcasts only + }), + } + + +class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?Ppodcasts)|catchup/\w+/\w+)/episodes/(?P\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/', + 'info_dict': { + 'id': '7DrfNnE', + 'ext': 'mp3', + 'title': 'Filthy Ritual - Trailer', + 'description': 'md5:1f1562fd0f01b4773b590984f94223e0', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'duration': 225.0, + 'timestamp': 1681254900, + 'series': 'Filthy 
Ritual', + 'series_id': '42KuaM', + 'upload_date': '20230411', + 'uploader': 'Global', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/', + 'info_dict': { + 'id': '2zGq26Vcv1fCWhddC4JAwETXWe', + 'ext': 'm4a', + 'timestamp': 1682056800, + 'series': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + 'upload_date': '20230421', + 'series_id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'duration': 10800.0, + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + episode = props['podcastEpisode'] if podcast else props['catchupEpisode'] + + return self._extract_audio( + episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}) + + +class GlobalPlayerVideoIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P\w+)' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/', + 'info_dict': { + 'id': '2JsSZ7Gm2uP', + 'ext': 'mp4', + 'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd', + 'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550', + 'upload_date': '20230420', + 'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._get_page_props(url, video_id)['videoData'] + + return { + 'id': video_id, + **traverse_obj(meta, { + 'url': 'url', + 'thumbnail': ('image', 'url'), + 'title': 'title', + 'upload_date': ('publish_date', {unified_strdate}), + 'description': 'description', + }), + } diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py index a7be2cb766..d72296be6e 100644 --- a/yt_dlp/extractor/globo.py +++ b/yt_dlp/extractor/globo.py @@ -5,11 +5,8 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_str, -) +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, float_or_none, orderedSet, @@ -88,7 +85,7 @@ def _real_extract(self, url): video_id, 'Getting cookies') video = self._download_json( - 'http://api.globovideos.com/videos/%s/playlist' % video_id, + f'http://api.globovideos.com/videos/{video_id}/playlist', video_id)['videos'][0] if not self.get_param('allow_unplayable_formats') and video.get('encrypted') is True: self.report_drm(video_id) @@ -97,14 +94,14 @@ def _real_extract(self, url): formats = [] security = self._download_json( - 'https://playback.video.globo.com/v2/video-session', video_id, 'Downloading security hash for %s' % video_id, + 'https://playback.video.globo.com/v2/video-session', video_id, f'Downloading security hash for {video_id}', headers={'content-type': 'application/json'}, data=json.dumps({ - "player_type": "desktop", - "video_id": video_id, - "quality": "max", - "content_protection": "widevine", - "vsid": "581b986b-4c40-71f0-5a58-803e579d5fa2", - "tz": "-3.0:00" + 'player_type': 'desktop', + 'video_id': video_id, + 'quality': 'max', + 'content_protection': 'widevine', + 'vsid': '581b986b-4c40-71f0-5a58-803e579d5fa2', + 'tz': '-3.0:00', }).encode()) self._request_webpage(HEADRequest(security['sources'][0]['url_template']), video_id, 'Getting locksession cookie') @@ -114,7 +111,7 @@ def _real_extract(self, url): message = security.get('message') if message: raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, message), expected=True) + 
f'{self.IE_NAME} returned error: {message}', expected=True) hash_code = security_hash[:2] padding = '%010d' % random.randint(1, 10000000000) @@ -128,13 +125,13 @@ def _real_extract(self, url): padding += '1' hash_prefix = '05' + security_hash[:22] - padded_sign_time = compat_str(int(received_time) + 86400) + padding + padded_sign_time = str(int(received_time) + 86400) + padding md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') signed_hash = hash_prefix + padded_sign_time + signed_md5 source = security['sources'][0]['url_parts'] resource_url = source['scheme'] + '://' + source['domain'] + source['path'] - signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') + signed_url = '{}?h={}&k=html5&a={}'.format(resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') fmts, subtitles = self._extract_m3u8_formats_and_subtitles( signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) @@ -230,7 +227,7 @@ class GloboArticleIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) + return False if GloboIE.suitable(url) else super().suitable(url) def _real_extract(self, url): display_id = self._match_id(url) @@ -239,7 +236,7 @@ def _real_extract(self, url): for video_regex in self._VIDEOID_REGEXES: video_ids.extend(re.findall(video_regex, webpage)) entries = [ - self.url_result('globo:%s' % video_id, GloboIE.ie_key()) + self.url_result(f'globo:{video_id}', GloboIE.ie_key()) for video_id in orderedSet(video_ids)] title = self._og_search_title(webpage).strip() description = self._html_search_meta('description', webpage) diff --git a/yt_dlp/extractor/glomex.py b/yt_dlp/extractor/glomex.py index 22aac0db90..35ffad56c2 100644 --- a/yt_dlp/extractor/glomex.py +++ b/yt_dlp/extractor/glomex.py @@ -3,9 +3,9 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, determine_ext, extract_attributes, - ExtractorError, int_or_none, parse_qs, smuggle_url, @@ -49,15 +49,15 @@ def _download_api_data(self, video_id, integration, current_url=None): video_id_type = self._get_videoid_type(video_id) return self._download_json( self._API_URL, - video_id, 'Downloading %s JSON' % video_id_type, - 'Unable to download %s JSON' % video_id_type, + video_id, f'Downloading {video_id_type} JSON', + f'Unable to download {video_id_type} JSON', query=query) def _download_and_extract_api_data(self, video_id, integration, current_url): api_data = self._download_api_data(video_id, integration, current_url) videos = api_data['videos'] if not videos: - raise ExtractorError('no videos found for %s' % video_id) + raise ExtractorError(f'no videos found for {video_id}') videos = [self._extract_api_data(video, video_id) for video in videos] return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id) diff --git a/yt_dlp/extractor/gmanetwork.py b/yt_dlp/extractor/gmanetwork.py new file mode 100644 index 0000000000..ecef1e16ae --- /dev/null +++ b/yt_dlp/extractor/gmanetwork.py @@ -0,0 +1,83 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE +from .youtube import YoutubeIE + + +class GMANetworkVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P\d+)/(?P[\w-]+)/video' + _TESTS = [{ + 'url': 
'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home', + 'info_dict': { + 'id': '28BqW0AXPe0', + 'ext': 'mp4', + 'upload_date': '20220919', + 'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'like_count': int, + 'view_count': int, + 'uploader': 'YoüLOL', + 'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'duration': 5313, + 'comment_count': int, + 'tags': 'count:22', + 'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)', + 'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg', + 'release_timestamp': 1663594212, + 'age_limit': 0, + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'description': 'md5:811bdcea74f9c48051824e494756e926', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'YoüLOL', + 'availability': 'public', + 'release_date': '20220919', + }, + }, { + 'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home', + 'info_dict': { + 'id': 'yiDOExw2aSA', + 'ext': 'mp4', + 'live_status': 'not_live', + 'channel': 'GMANetwork', + 'like_count': int, + 'channel_follower_count': int, + 'description': 'md5:6d00cd658394fa1a5071200d3ed4be05', + 'duration': 1419, + 'age_limit': 0, + 'comment_count': int, + 'upload_date': '20181003', + 'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng', + 'title': 'More Than Words: Full Episode 80 (Finale)', + 'uploader_id': 'GMANETWORK', + 'categories': ['Entertainment'], + 'uploader': 'GMANetwork', + 'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng', + 'tags': 'count:29', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/user/GMANETWORK', + }, + }] + + def _real_extract(self, url): + content_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + # webpage route + youtube_id = self._search_regex( + r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P[\w-]+)', webpage, 'youtube_id', fatal=False) + if youtube_id: + return self.url_result(youtube_id, YoutubeIE, youtube_id) + + # api call route + # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11 + network_url = self._search_regex( + r'NETWORK_URL\s*=\s*[\'"](?P[^\'"]+)', webpage, 'network_url') + json_data = self._download_json(f'{network_url}api/data/content/video/{content_id}', display_id) + if json_data.get('video_file'): + return self.url_result(json_data['video_file'], YoutubeIE, json_data['video_file']) + else: + return self.url_result(json_data['dailymotion_file'], DailymotionIE, json_data['dailymotion_file']) diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py index b075a02e0f..83c1979db8 100644 --- a/yt_dlp/extractor/go.py +++ b/yt_dlp/extractor/go.py @@ -1,18 +1,18 @@ import re from .adobepass import AdobePassIE -from ..compat import compat_str from ..utils import ( - int_or_none, - determine_ext, - parse_age_limit, - remove_start, - remove_end, - try_get, - urlencode_postdata, ExtractorError, - unified_timestamp, + determine_ext, + int_or_none, + join_nonempty, + parse_age_limit, + remove_end, + remove_start, traverse_obj, + try_get, + 
unified_timestamp, + urlencode_postdata, ) @@ -50,14 +50,14 @@ class GoIE(AdobePassIE): _VALID_URL = r'''(?x) https?:// (?P - (?:%s\.)?go|fxnow\.fxnetworks| + (?:{}\.)?go|fxnow\.fxnetworks| (?:www\.)?(?:abc|freeform|disneynow) )\.com/ (?: (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| (?:[^/]+/)*(?P[^/?\#]+) ) - ''' % r'\.|'.join(list(_SITE_INFO.keys())) + '''.format(r'\.|'.join(list(_SITE_INFO.keys()))) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { @@ -94,7 +94,7 @@ class GoIE(AdobePassIE): 'series': 'Shadowhunters', 'episode_number': 1, 'timestamp': 1483387200, - 'ext': 'mp4' + 'ext': 'mp4', }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -168,7 +168,7 @@ class GoIE(AdobePassIE): def _extract_videos(self, brand, video_id='-1', show_id='-1'): display_id = video_id if video_id != '-1' else show_id return self._download_json( - 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), + f'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/{brand}/001/-1/{show_id}/-1/{video_id}/-1/-1.json', display_id)['video'] def _real_extract(self, url): @@ -191,7 +191,7 @@ def _real_extract(self, url): video_id = try_get( layout, (lambda x: x['videoid'], lambda x: x['video']['id']), - compat_str) + str) if not video_id: video_id = self._search_regex( ( @@ -201,7 +201,7 @@ def _real_extract(self, url): # page.analytics.videoIdCode r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)', ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( @@ -266,7 +266,7 @@ def _real_extract(self, url): self.raise_geo_restricted( error['message'], countries=['US']) error_message = ', '.join([error['message'] for error in errors]) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + raise ExtractorError(f'{self.IE_NAME} said: {error_message}', expected=True) asset_url += '?' 
+ entitlement['uplynkData']['sessionKey']
         fmts, subs = self._extract_m3u8_formats_and_subtitles(
             asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)
@@ -280,7 +280,7 @@ def _real_extract(self, url):
             }
             if re.search(r'(?:/mp4/source/|_source\.mp4)', asset_url):
                 f.update({
-                    'format_id': ('%s-' % format_id if format_id else '') + 'SOURCE',
+                    'format_id': (f'{format_id}-' if format_id else '') + 'SOURCE',
                     'quality': 1,
                 })
             else:
@@ -288,7 +288,7 @@ def _real_extract(self, url):
                 if mobj:
                     height = int(mobj.group(2))
                     f.update({
-                        'format_id': ('%s-' % format_id if format_id else '') + '%dP' % height,
+                        'format_id': join_nonempty(format_id, f'{height}P'),
                         'width': int(mobj.group(1)),
                         'height': height,
                     })
diff --git a/yt_dlp/extractor/godresource.py b/yt_dlp/extractor/godresource.py
new file mode 100644
index 0000000000..35cc30cd8d
--- /dev/null
+++ b/yt_dlp/extractor/godresource.py
@@ -0,0 +1,79 @@
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    str_or_none,
+    unified_timestamp,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class GodResourceIE(InfoExtractor):
+    _VALID_URL = r'https?://new\.godresource\.com/video/(?P<id>\w+)'
+    _TESTS = [{
+        # hls stream
+        'url': 'https://new.godresource.com/video/A01mTKjyf6w',
+        'info_dict': {
+            'id': 'A01mTKjyf6w',
+            'ext': 'mp4',
+            'view_count': int,
+            'timestamp': 1710978666,
+            'channel_id': '5',
+            'thumbnail': 'https://cdn-02.godresource.com/e42968ac-9e8b-4231-ab86-f4f9d775841f/thumbnail.jpg',
+            'channel': 'Stedfast Baptist Church',
+            'upload_date': '20240320',
+            'title': 'GodResource video #A01mTKjyf6w',
+        },
+    }, {
+        # mp4 link
+        'url': 'https://new.godresource.com/video/01DXmBbQv_X',
+        'md5': '0e8f72aa89a106b9d5c011ba6f8717b7',
+        'info_dict': {
+            'id': '01DXmBbQv_X',
+            'ext': 'mp4',
+            'channel_id': '12',
+            'view_count': int,
+            'timestamp': 1687996800,
+            'thumbnail': 'https://cdn-02.godresource.com/sodomitedeception/thumbnail.jpg',
+            'channel': 'Documentaries',
+            'title': 'The Sodomite Deception',
+            'upload_date': '20230629',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        api_data = self._download_json(
+            f'https://api.godresource.com/api/Streams/{display_id}', display_id)
+
+        video_url = api_data['streamUrl']
+        is_live = api_data.get('isLive') or False
+        if (ext := determine_ext(video_url)) == 'm3u8':
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                video_url, display_id, live=is_live)
+        elif ext == 'mp4':
+            formats, subtitles = [{
+                'url': video_url,
+                'ext': ext,
+            }], {}
+        else:
+            raise ExtractorError(f'Unexpected video format {ext}')
+
+        return {
+            'id': display_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'title': '',
+            'is_live': is_live,
+            **traverse_obj(api_data, {
+                'title': ('title', {str}),
+                'thumbnail': ('thumbnail', {url_or_none}),
+                'view_count': ('views', {int}),
+                'channel': ('channelName', {str}),
+                'channel_id': ('channelId', {str_or_none}),
+                'timestamp': ('streamDateCreated', {unified_timestamp}),
+                'modified_timestamp': ('streamDataModified', {unified_timestamp}),
+            }),
+        }
diff --git a/yt_dlp/extractor/godtube.py b/yt_dlp/extractor/godtube.py
index 697540155a..f4496ac5d8 100644
--- a/yt_dlp/extractor/godtube.py
+++ b/yt_dlp/extractor/godtube.py
@@ -6,6 +6,7 @@
 class GodTubeIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?godtube\.com/watch/\?v=(?P<id>[\da-zA-Z]+)'
     _TESTS = [
         {
@@ -29,7 +30,7 @@ def _real_extract(self, url):
         video_id = mobj.group('id')
 
         config = 
self._download_xml( - 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), + f'http://www.godtube.com/resource/mediaplayer/{video_id.lower()}.xml', video_id, 'Downloading player config XML') video_url = config.find('file').text @@ -39,7 +40,7 @@ def _real_extract(self, url): thumbnail = config.find('image').text media = self._download_xml( - 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') + f'http://www.godtube.com/media/xml/?v={video_id}', video_id, 'Downloading media XML') title = media.find('title').text diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py index ddbce2ee8f..a9777a5946 100644 --- a/yt_dlp/extractor/gofile.py +++ b/yt_dlp/extractor/gofile.py @@ -1,10 +1,7 @@ import hashlib from .common import InfoExtractor -from ..utils import ( - ExtractorError, - try_get -) +from ..utils import ExtractorError, try_get class GofileIE(InfoExtractor): @@ -23,8 +20,8 @@ class GofileIE(InfoExtractor): 'title': 'nuuh', 'release_timestamp': 1638338704, 'release_date': '20211201', - } - }] + }, + }], }, { 'url': 'https://gofile.io/d/is8lKr', 'info_dict': { @@ -58,21 +55,18 @@ def _real_initialize(self): return account_data = self._download_json( - 'https://api.gofile.io/createAccount', None, note='Getting a new guest account') + 'https://api.gofile.io/accounts', None, 'Getting a new guest account', data=b'{}') self._TOKEN = account_data['data']['token'] - self._set_cookie('gofile.io', 'accountToken', self._TOKEN) + self._set_cookie('.gofile.io', 'accountToken', self._TOKEN) def _entries(self, file_id): - query_params = { - 'contentId': file_id, - 'token': self._TOKEN, - 'websiteToken': 12345, - } + query_params = {'wt': '4fd6sg89d7s6'} # From https://gofile.io/dist/js/alljs.js password = self.get_param('videopassword') if password: - query_params['password'] = hashlib.sha256(password.encode('utf-8')).hexdigest() + query_params['password'] = hashlib.sha256(password.encode()).hexdigest() files = self._download_json( - 'https://api.gofile.io/getContent', file_id, note='Getting filelist', query=query_params) + f'https://api.gofile.io/contents/{file_id}', file_id, 'Getting filelist', + query=query_params, headers={'Authorization': f'Bearer {self._TOKEN}'}) status = files['status'] if status == 'error-passwordRequired': @@ -82,7 +76,7 @@ def _entries(self, file_id): raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) found_files = False - for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values(): + for file in (try_get(files, lambda x: x['data']['children'], dict) or {}).values(): file_type, file_format = file.get('mimetype').split('/', 1) if file_type not in ('video', 'audio') and file_format != 'vnd.mts': continue @@ -95,7 +89,7 @@ def _entries(self, file_id): 'title': file['name'].rsplit('.', 1)[0], 'url': file_url, 'filesize': file.get('size'), - 'release_timestamp': file.get('createTime') + 'release_timestamp': file.get('createTime'), } if not found_files: diff --git a/yt_dlp/extractor/golem.py b/yt_dlp/extractor/golem.py index c33d950191..90d2fe6c26 100644 --- a/yt_dlp/extractor/golem.py +++ b/yt_dlp/extractor/golem.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( determine_ext, ) @@ -20,7 +18,7 @@ class GolemIE(InfoExtractor): 'title': 'iPhone 6 und 6 Plus - Test', 'duration': 300.44, 'filesize': 65309548, - } + }, } _PREFIX = 'http://video.golem.de' @@ -29,7 
+27,7 @@ def _real_extract(self, url): video_id = self._match_id(url) config = self._download_xml( - 'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id) + f'https://video.golem.de/xml/{video_id}.xml', video_id) info = { 'id': video_id, @@ -44,8 +42,8 @@ def _real_extract(self, url): continue formats.append({ - 'format_id': compat_str(e.tag), - 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'format_id': str(e.tag), + 'url': urllib.parse.urljoin(self._PREFIX, url), 'height': self._int(e.get('height'), 'height'), 'width': self._int(e.get('width'), 'width'), 'filesize': self._int(e.findtext('filesize'), 'filesize'), @@ -59,7 +57,7 @@ def _real_extract(self, url): if not url: continue thumbnails.append({ - 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'url': urllib.parse.urljoin(self._PREFIX, url), 'width': self._int(e.get('width'), 'thumbnail width'), 'height': self._int(e.get('height'), 'thumbnail height'), }) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index e027ea7c4d..dfba2d3ba1 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -1,11 +1,15 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_parse_qs +from .youtube import YoutubeIE from ..utils import ( - determine_ext, ExtractorError, + bug_reports_message, + determine_ext, + extract_attributes, get_element_by_class, + get_element_html_by_id, int_or_none, lowercase_escape, try_get, @@ -17,9 +21,9 @@ class GoogleDriveIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:docs|drive)\.google\.com/ + (?:docs|drive|drive\.usercontent)\.google\.com/ (?: - (?:uc|open)\?.*?id=| + (?:uc|open|download)\?.*?id=| file/d/ )| video\.google\.com/get_player\?.*?docid= @@ -34,7 +38,19 @@ class GoogleDriveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, - } + 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', + }, + }, { + # has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922) + 'url': 'https://drive.google.com/uc?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + 'md5': '322db8d63dd19788c04050a4bba67073', + 'info_dict': { + 'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + 'ext': 'mp3', + 'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3', + 'duration': 184, + 'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + }, }, { # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) @@ -50,24 +66,13 @@ class GoogleDriveIE(InfoExtractor): }, { 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 'only_matching': True, + }, { + 'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', + 'only_matching': True, }] _FORMATS_EXT = { - '5': 'flv', - '6': 'flv', - '13': '3gp', - '17': '3gp', - '18': 'mp4', - '22': 'mp4', - '34': 'flv', - '35': 'flv', - '36': '3gp', - '37': 'mp4', - '38': 'mp4', - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - '59': 'mp4', + **{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')}, + '50': 'm4a', } _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' _CAPTIONS_ENTRY_TAG = { @@ -83,7 +88,7 @@ def _extract_embed_urls(cls, url, webpage): r']+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P[a-zA-Z0-9_-]{28,})', webpage) if mobj: - yield 
'https://drive.google.com/file/d/%s' % mobj.group('id') + yield 'https://drive.google.com/file/d/{}'.format(mobj.group('id')) def _download_subtitles_xml(self, video_id, subtitles_id, hl): if self._captions_xml: @@ -161,17 +166,15 @@ def _get_automatic_captions(self, video_id, subtitles_id, hl): def _real_extract(self, url): video_id = self._match_id(url) - video_info = compat_parse_qs(self._download_webpage( + video_info = urllib.parse.parse_qs(self._download_webpage( 'https://drive.google.com/get_video_info', - video_id, query={'docid': video_id})) + video_id, 'Downloading video webpage', query={'docid': video_id})) def get_value(key): return try_get(video_info, lambda x: x[key][0]) reason = get_value('reason') title = get_value('title') - if not title and reason: - raise ExtractorError(reason, expected=True) formats = [] fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') @@ -190,10 +193,13 @@ def get_value(key): if len(fmt_stream_split) < 2: continue format_id, format_url = fmt_stream_split[:2] + ext = self._FORMATS_EXT.get(format_id) + if not ext: + self.report_warning(f'Unknown format {format_id}{bug_reports_message()}') f = { 'url': lowercase_escape(format_url), 'format_id': format_id, - 'ext': self._FORMATS_EXT[format_id], + 'ext': ext, } resolution = resolutions.get(format_id) if resolution: @@ -204,25 +210,31 @@ def get_value(key): formats.append(f) source_url = update_url_query( - 'https://drive.google.com/uc', { + 'https://drive.usercontent.google.com/download', { 'id': video_id, 'export': 'download', + 'confirm': 't', }) - def request_source_file(source_url, kind): + def request_source_file(source_url, kind, data=None): return self._request_webpage( - source_url, video_id, note='Requesting %s file' % kind, - errnote='Unable to request %s file' % kind, fatal=False) + source_url, video_id, note=f'Requesting {kind} file', + errnote=f'Unable to request {kind} file', fatal=False, data=data) urlh = request_source_file(source_url, 'source') if urlh: def add_source_format(urlh): + nonlocal title + if not title: + title = self._search_regex( + r'\bfilename="([^"]+)"', urlh.headers.get('Content-Disposition'), + 'title', default=None) formats.append({ # Use redirect URLs as download URLs in order to calculate # correct cookies in _calc_cookies. # Using original URLs may result in redirect loop due to # google.com's cookies mistakenly used for googleusercontent.com # redirect URLs (see #23919). 
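# (aside: this file's source-download flow now starts at
#  drive.usercontent.google.com with confirm=t and, when Google returns an
#  interstitial page instead of the file, re-submits via the action of its
#  "download-form" -- see the confirmation handling in the hunks below.
#  A hedged, stdlib-only sketch of that flow; drive_source_url() is an
#  illustrative name, not the extractor's actual helper.)
import re
import urllib.parse
import urllib.request


def drive_source_url(file_id):
    query = urllib.parse.urlencode({'id': file_id, 'export': 'download', 'confirm': 't'})
    with urllib.request.urlopen(f'https://drive.usercontent.google.com/download?{query}') as resp:
        if resp.headers.get('Content-Disposition'):  # got the file directly
            return resp.url
        page = resp.read().decode('utf-8', 'replace')
    # otherwise parse the confirmation form out of the interstitial page
    mobj = re.search(r'<form[^>]+id="download-form"[^>]+action="([^"]+)"', page)
    return mobj.group(1) if mobj else None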
- 'url': urlh.geturl(), + 'url': urlh.url, 'ext': determine_ext(title, 'mp4').lower(), 'format_id': 'source', 'quality': 1, @@ -234,14 +246,10 @@ def add_source_format(urlh): urlh, url, video_id, note='Downloading confirmation page', errnote='Unable to confirm download', fatal=False) if confirmation_webpage: - confirm = self._search_regex( - r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', default=None) - if confirm: - confirmed_source_url = update_url_query(source_url, { - 'confirm': confirm, - }) - urlh = request_source_file(confirmed_source_url, 'confirmed source') + confirmed_source_url = extract_attributes( + get_element_html_by_id('download-form', confirmation_webpage) or '').get('action') + if confirmed_source_url: + urlh = request_source_file(confirmed_source_url, 'confirmed source', data=b'') if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) else: @@ -251,7 +259,10 @@ def add_source_format(urlh): or 'unable to extract confirmation code') if not formats and reason: - self.raise_no_formats(reason, expected=True) + if title: + self.raise_no_formats(reason, expected=True) + else: + raise ExtractorError(reason, expected=True) hl = get_value('hl') subtitles_id = None @@ -259,7 +270,7 @@ def add_source_format(urlh): if ttsurl: # the video Id for subtitles will be the last value in the ttsurl # query string - subtitles_id = ttsurl.encode('utf-8').decode( + subtitles_id = ttsurl.encode().decode( 'unicode_escape').split('=')[-1] self.cookiejar.clear(domain='.google.com', path='/', name='NID') @@ -283,7 +294,7 @@ class GoogleDriveFolderIE(InfoExtractor): 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', 'info_dict': { 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', - 'title': 'Forrest' + 'title': 'Forrest', }, 'playlist_count': 3, }] @@ -301,13 +312,13 @@ class GoogleDriveFolderIE(InfoExtractor): def _call_api(self, folder_id, key, data, **kwargs): response = self._download_webpage( 'https://clients6.google.com/batch/drive/v2beta', - folder_id, data=data.encode('utf-8'), + folder_id, data=data.encode(), headers={ 'Content-Type': 'text/plain;charset=UTF-8;', 'Origin': 'https://drive.google.com', }, query={ '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"', - 'key': key + 'key': key, }, **kwargs) return self._search_json('', response, 'api response', folder_id, **kwargs) or {} diff --git a/yt_dlp/extractor/googlepodcasts.py b/yt_dlp/extractor/googlepodcasts.py index 8b2351ba88..8d1cc4fa11 100644 --- a/yt_dlp/extractor/googlepodcasts.py +++ b/yt_dlp/extractor/googlepodcasts.py @@ -48,7 +48,7 @@ class GooglePodcastsIE(GooglePodcastsBaseIE): 'timestamp': 1609606800, 'duration': 2901, 'series': "Wait Wait... 
Don't Tell Me!", - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index 960d7d7bc0..dfe5afe635 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -1,6 +1,6 @@ import base64 import binascii -import datetime +import datetime as dt import hashlib import hmac import json @@ -31,7 +31,7 @@ class GoPlayIE(InfoExtractor): 'episode': 'Episode 2', 'episode_number': 2, }, - 'skip': 'This video is only available for registered users' + 'skip': 'This video is only available for registered users', }, { 'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay', 'info_dict': { @@ -39,7 +39,23 @@ class GoPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'A Family for the Holidays', }, - 'skip': 'This video is only available for registered users' + 'skip': 'This video is only available for registered users', + }, { + 'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay', + 'info_dict': { + 'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656', + 'ext': 'mp4', + 'title': 'S11 - Aflevering 1', + 'episode': 'Episode 1', + 'series': 'De Mol', + 'season_number': 11, + 'episode_number': 1, + 'season': 'Season 11', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is only available for registered users', }] _id_token = None @@ -63,7 +79,7 @@ def _real_extract(self, url): if movie: video_id = movie['videoUuid'] info_dict = { - 'title': movie.get('title') + 'title': movie.get('title'), } else: episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False) @@ -77,16 +93,39 @@ def _real_extract(self, url): api = self._download_json( f'https://api.goplay.be/web/v1/videos/long-form/{video_id}', - video_id, headers={'Authorization': 'Bearer %s' % self._id_token}) + video_id, headers={ + 'Authorization': f'Bearer {self._id_token}', + **self.geo_verification_headers(), + }) - formats, subs = self._extract_m3u8_formats_and_subtitles( - api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS') + if 'manifestUrls' in api: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS') + + else: + if 'ssai' not in api: + raise ExtractorError('expecting Google SSAI stream') + + ssai_content_source_id = api['ssai']['contentSourceID'] + ssai_video_id = api['ssai']['videoID'] + + dai = self._download_json( + f'https://dai.google.com/ondemand/dash/content/{ssai_content_source_id}/vid/{ssai_video_id}/streams', + video_id, data=b'{"api-key":"null"}', + headers={'content-type': 'application/json'}) + + periods = self._extract_mpd_periods(dai['stream_manifest'], video_id) + + # skip pre-roll and mid-roll ads + periods = [p for p in periods if '-ad-' not in p['id']] + + formats, subtitles = self._merge_mpd_periods(periods) info_dict.update({ 'id': video_id, 'formats': formats, + 'subtitles': subtitles, }) - return info_dict @@ -115,31 +154,32 @@ def __init__(self, ie, pool_id, client_id): self.ie = ie self.pool_id = pool_id - if "_" not in self.pool_id: - raise ValueError("Invalid pool_id format. Should be _.") + if '_' not in self.pool_id: + raise ValueError('Invalid pool_id format. 
@@ -115,31 +154,32 @@ def __init__(self, ie, pool_id, client_id): self.ie = ie self.pool_id = pool_id - if "_" not in self.pool_id: - raise ValueError("Invalid pool_id format. Should be <region>_<poolid>.") + if '_' not in self.pool_id: + raise ValueError('Invalid pool_id format. Should be <region>_<poolid>.') self.client_id = client_id - self.region = self.pool_id.split("_")[0] - self.url = "https://cognito-idp.%s.amazonaws.com/" % (self.region,) + self.region = self.pool_id.split('_')[0] + self.url = f'https://cognito-idp.{self.region}.amazonaws.com/' # Initialize the values # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L22 - self.n_hex = 'FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1' + \ - '29024E088A67CC74020BBEA63B139B22514A08798E3404DD' + \ - 'EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245' + \ - 'E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED' + \ - 'EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D' + \ - 'C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F' + \ - '83655D23DCA3AD961C62F356208552BB9ED529077096966D' + \ - '670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B' + \ - 'E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9' + \ - 'DE2BCBF6955817183995497CEA956AE515D2261898FA0510' + \ - '15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64' + \ - 'ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7' + \ - 'ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B' + \ - 'F12FFA06D98A0864D87602733EC86A64521F2B18177B200C' + \ - 'BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31' + \ - '43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF' + self.n_hex = ( + 'FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1' + '29024E088A67CC74020BBEA63B139B22514A08798E3404DD' + 'EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245' + 'E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED' + 'EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D' + 'C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F' + '83655D23DCA3AD961C62F356208552BB9ED529077096966D' + '670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B' + 'E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9' + 'DE2BCBF6955817183995497CEA956AE515D2261898FA0510' + '15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64' + 'ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7' + 'ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B' + 'F12FFA06D98A0864D87602733EC86A64521F2B18177B200C' + 'BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31' + '43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF') # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L49 self.g_hex = '2' @@ -155,26 +195,26 @@ def authenticate(self, username, password): """ Authenticate with a username and password.
""" # Step 1: First initiate an authentication request auth_data_dict = self.__get_authentication_request(username) - auth_data = json.dumps(auth_data_dict).encode("utf-8") + auth_data = json.dumps(auth_data_dict).encode() auth_headers = { - "X-Amz-Target": "AWSCognitoIdentityProviderService.InitiateAuth", - "Accept-Encoding": "identity", - "Content-Type": "application/x-amz-json-1.1" + 'X-Amz-Target': 'AWSCognitoIdentityProviderService.InitiateAuth', + 'Accept-Encoding': 'identity', + 'Content-Type': 'application/x-amz-json-1.1', } auth_response_json = self.ie._download_json( self.url, None, data=auth_data, headers=auth_headers, note='Authenticating username', errnote='Invalid username') - challenge_parameters = auth_response_json.get("ChallengeParameters") + challenge_parameters = auth_response_json.get('ChallengeParameters') - if auth_response_json.get("ChallengeName") != "PASSWORD_VERIFIER": - raise AuthenticationException(auth_response_json["message"]) + if auth_response_json.get('ChallengeName') != 'PASSWORD_VERIFIER': + raise AuthenticationException(auth_response_json['message']) # Step 2: Respond to the Challenge with a valid ChallengeResponse challenge_request = self.__get_challenge_response_request(challenge_parameters, password) - challenge_data = json.dumps(challenge_request).encode("utf-8") + challenge_data = json.dumps(challenge_request).encode() challenge_headers = { - "X-Amz-Target": "AWSCognitoIdentityProviderService.RespondToAuthChallenge", - "Content-Type": "application/x-amz-json-1.1" + 'X-Amz-Target': 'AWSCognitoIdentityProviderService.RespondToAuthChallenge', + 'Content-Type': 'application/x-amz-json-1.1', } auth_response_json = self.ie._download_json( self.url, None, data=challenge_data, headers=challenge_headers, @@ -184,7 +224,7 @@ def authenticate(self, username, password): raise InvalidLoginException(auth_response_json['message']) return ( auth_response_json['AuthenticationResult']['IdToken'], - auth_response_json['AuthenticationResult']['RefreshToken'] + auth_response_json['AuthenticationResult']['RefreshToken'], ) def __get_authentication_request(self, username): @@ -195,15 +235,14 @@ def __get_authentication_request(self, username): :return: A full Authorization request. :rtype: dict """ - auth_request = { - "AuthParameters": { - "USERNAME": username, - "SRP_A": self.__long_to_hex(self.large_a_value) + return { + 'AuthParameters': { + 'USERNAME': username, + 'SRP_A': self.__long_to_hex(self.large_a_value), }, - "AuthFlow": "USER_SRP_AUTH", - "ClientId": self.client_id + 'AuthFlow': 'USER_SRP_AUTH', + 'ClientId': self.client_id, } - return auth_request def __get_challenge_response_request(self, challenge_parameters, password): """ Create a Challenge Response Request object. @@ -214,11 +253,11 @@ def __get_challenge_response_request(self, challenge_parameters, password): :return: A valid and full request data object to use as a response for a challenge. 
:rtype: dict """ - user_id = challenge_parameters["USERNAME"] - user_id_for_srp = challenge_parameters["USER_ID_FOR_SRP"] - srp_b = challenge_parameters["SRP_B"] - salt = challenge_parameters["SALT"] - secret_block = challenge_parameters["SECRET_BLOCK"] + user_id = challenge_parameters['USERNAME'] + user_id_for_srp = challenge_parameters['USER_ID_FOR_SRP'] + srp_b = challenge_parameters['SRP_B'] + salt = challenge_parameters['SALT'] + secret_block = challenge_parameters['SECRET_BLOCK'] timestamp = self.__get_current_timestamp() @@ -227,7 +266,7 @@ def __get_challenge_response_request(self, challenge_parameters, password): user_id_for_srp, password, self.__hex_to_long(srp_b), - salt + salt, ) secret_block_bytes = base64.standard_b64decode(secret_block) @@ -239,17 +278,16 @@ def __get_challenge_response_request(self, challenge_parameters, password): bytearray(timestamp, 'utf-8') hmac_obj = hmac.new(hkdf, msg, digestmod=hashlib.sha256) signature_string = base64.standard_b64encode(hmac_obj.digest()).decode('utf-8') - challenge_request = { - "ChallengeResponses": { - "USERNAME": user_id, - "TIMESTAMP": timestamp, - "PASSWORD_CLAIM_SECRET_BLOCK": secret_block, - "PASSWORD_CLAIM_SIGNATURE": signature_string + return { + 'ChallengeResponses': { + 'USERNAME': user_id, + 'TIMESTAMP': timestamp, + 'PASSWORD_CLAIM_SECRET_BLOCK': secret_block, + 'PASSWORD_CLAIM_SIGNATURE': signature_string, }, - "ChallengeName": "PASSWORD_VERIFIER", - "ClientId": self.client_id + 'ChallengeName': 'PASSWORD_VERIFIER', + 'ClientId': self.client_id, } - return challenge_request def __get_hkdf_key_for_password(self, username, password, server_b_value, salt): """ Calculates the final hkdf based on computed S value, and computed U value and the key. @@ -266,18 +304,17 @@ def __get_hkdf_key_for_password(self, username, password, server_b_value, salt): u_value = self.__calculate_u(self.large_a_value, server_b_value) if u_value == 0: raise ValueError('U cannot be zero.') - username_password = '%s%s:%s' % (self.pool_id.split('_')[1], username, password) - username_password_hash = self.__hash_sha256(username_password.encode('utf-8')) + username_password = '{}{}:{}'.format(self.pool_id.split('_')[1], username, password) + username_password_hash = self.__hash_sha256(username_password.encode()) x_value = self.__hex_to_long(self.__hex_hash(self.__pad_hex(salt) + username_password_hash)) g_mod_pow_xn = pow(self.g, x_value, self.big_n) int_value2 = server_b_value - self.k * g_mod_pow_xn s_value = pow(int_value2, self.small_a_value + u_value * x_value, self.big_n) - hkdf = self.__compute_hkdf( + return self.__compute_hkdf( bytearray.fromhex(self.__pad_hex(s_value)), - bytearray.fromhex(self.__pad_hex(self.__long_to_hex(u_value))) + bytearray.fromhex(self.__pad_hex(self.__long_to_hex(u_value))), ) - return hkdf def __compute_hkdf(self, ikm, salt): """ Standard hkdf algorithm @@ -329,7 +366,7 @@ def __calculate_a(self): @staticmethod def __long_to_hex(long_num): - return '%x' % long_num + return f'{long_num:x}' @staticmethod def __hex_to_long(hex_string): @@ -360,9 +397,9 @@ def __pad_hex(long_int): else: hash_str = long_int if len(hash_str) % 2 == 1: - hash_str = '0%s' % hash_str + hash_str = f'0{hash_str}' elif hash_str[0] in '89ABCDEFabcdef': - hash_str = '00%s' % hash_str + hash_str = f'00{hash_str}' return hash_str @staticmethod @@ -383,12 +420,11 @@ def __get_current_timestamp(): months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 
'Sun'] - time_now = datetime.datetime.utcnow() - format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) - time_string = datetime.datetime.utcnow().strftime(format_string) - return time_string + time_now = dt.datetime.now(dt.timezone.utc) + format_string = f'{days[time_now.weekday()]} {months[time_now.month]} {time_now.day} %H:%M:%S UTC %Y' + return time_now.strftime(format_string) def __str__(self): - return "AWS IDP Client for:\nRegion: %s\nPoolId: %s\nAppId: %s" % ( - self.region, self.pool_id.split("_")[1], self.client_id + return 'AWS IDP Client for:\nRegion: {}\nPoolId: {}\nAppId: {}'.format( + self.region, self.pool_id.split('_')[1], self.client_id, ) diff --git a/yt_dlp/extractor/gopro.py b/yt_dlp/extractor/gopro.py index ae965374cf..9142566c35 100644 --- a/yt_dlp/extractor/gopro.py +++ b/yt_dlp/extractor/gopro.py @@ -23,7 +23,7 @@ class GoProIE(InfoExtractor): 'upload_date': '20210919', 'uploader_id': 'fireydive30018', 'duration': 396062, - } + }, }, { 'url': 'https://gopro.com/v/KRm6Vgp2peg4e', 'info_dict': { @@ -36,7 +36,7 @@ class GoProIE(InfoExtractor): 'uploader_id': 'dc9bcb8b-47d2-47c6-afbc-4c48f9a3769e', 'duration': 45187, 'track': 'The Sky Machine', - } + }, }, { 'url': 'https://gopro.com/v/kVrK9wlJvBMwn', 'info_dict': { @@ -50,19 +50,19 @@ class GoProIE(InfoExtractor): 'duration': 313075, 'track': 'Battery (Live)', 'artist': 'Metallica', - } + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - metadata = self._parse_json( - self._html_search_regex(r'window\.__reflectData\s*=\s*([^;]+)', webpage, 'metadata'), video_id) + metadata = self._search_json( + r'window\.__reflectData\s*=', webpage, 'metadata', video_id) video_info = metadata['collectionMedia'][0] media_data = self._download_json( - 'https://api.gopro.com/media/%s/download' % video_info['id'], video_id) + 'https://api.gopro.com/media/{}/download'.format(video_info['id']), video_id) formats = [] for fmt in try_get(media_data, lambda x: x['_embedded']['variations']) or []: @@ -99,7 +99,7 @@ def _real_extract(self, url): 'duration': int_or_none( video_info.get('source_duration')), 'artist': str_or_none( - video_info.get('music_track_artist')), + video_info.get('music_track_artist')) or None, 'track': str_or_none( - video_info.get('music_track_name')), + video_info.get('music_track_name')) or None, } diff --git a/yt_dlp/extractor/goshgay.py b/yt_dlp/extractor/goshgay.py index 9a1f32b7e3..7bcac9bdea 100644 --- a/yt_dlp/extractor/goshgay.py +++ b/yt_dlp/extractor/goshgay.py @@ -1,7 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, -) from ..utils import ( parse_duration, ) @@ -19,7 +18,7 @@ class GoshgayIE(InfoExtractor): 'thumbnail': r're:^http://.*\.jpg$', 'duration': 80, 'age_limit': 18, - } + }, } def _real_extract(self, url): @@ -32,7 +31,7 @@ def _real_extract(self, url): r'\s*-?\s*(.*?)', webpage, 'duration', fatal=False)) - flashvars = compat_parse_qs(self._html_search_regex( + flashvars = urllib.parse.parse_qs(self._html_search_regex( r'[a-z0-9]+)/watch' @@ -18,8 +14,8 @@ class GoToStageIE(InfoExtractor): 'ext': 'mp4', 'title': 'What is GoToStage?', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 93.924711 - } + 'duration': 93.924711, + }, }, { 'url': 'https://www.gotostage.com/channel/bacc3d3535b34bafacc3f4ef8d4df78a/recording/831e74cd3e0042be96defba627b6f676/watch?source=HOMEPAGE', 'only_matching': True, @@ -28,7 +24,7 @@ 
class GoToStageIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) metadata = self._download_json( - 'https://api.gotostage.com/contents?ids=%s' % video_id, + f'https://api.gotostage.com/contents?ids={video_id}', video_id, note='Downloading video metadata', errnote='Unable to download video metadata')[0] @@ -39,7 +35,7 @@ def _real_extract(self, url): 'productReferenceKey': metadata['productRefKey'], 'firstName': 'foo', 'lastName': 'bar', - 'email': 'foobar@example.com' + 'email': 'foobar@example.com', } registration_response = self._download_json( @@ -52,7 +48,7 @@ def _real_extract(self, url): errnote='Unable to register user') content_response = self._download_json( - 'https://api.gotostage.com/contents/%s/asset' % video_id, + f'https://api.gotostage.com/contents/{video_id}/asset', video_id, headers={'x-registrantkey': registration_response['registrationKey']}, note='Get download url', @@ -60,11 +56,11 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': try_get(metadata, lambda x: x['title'], compat_str), - 'url': try_get(content_response, lambda x: x['cdnLocation'], compat_str), + 'title': try_get(metadata, lambda x: x['title'], str), + 'url': try_get(content_response, lambda x: x['cdnLocation'], str), 'ext': 'mp4', 'thumbnail': url_or_none(try_get(metadata, lambda x: x['thumbnail']['location'])), 'duration': try_get(metadata, lambda x: x['duration'], float), - 'categories': [try_get(metadata, lambda x: x['category'], compat_str)], - 'is_live': False + 'categories': [try_get(metadata, lambda x: x['category'], str)], + 'is_live': False, } diff --git a/yt_dlp/extractor/gputechconf.py b/yt_dlp/extractor/gputechconf.py index 2d13bf4915..f31791a718 100644 --- a/yt_dlp/extractor/gputechconf.py +++ b/yt_dlp/extractor/gputechconf.py @@ -11,7 +11,7 @@ class GPUTechConfIE(InfoExtractor): 'ext': 'mp4', 'title': 'Coordinating More Than 3 Million CUDA Threads for Social Network Analysis', 'duration': 1219, - } + }, } def _real_extract(self, url): @@ -27,6 +27,6 @@ def _real_extract(self, url): return { '_type': 'url_transparent', 'id': video_id, - 'url': '%sxml/%s.xml' % (root_path, xml_file_id), + 'url': f'{root_path}xml/{xml_file_id}.xml', 'ie_key': 'DigitallySpeaking', } diff --git a/yt_dlp/extractor/graspop.py b/yt_dlp/extractor/graspop.py new file mode 100644 index 0000000000..09371f8c46 --- /dev/null +++ b/yt_dlp/extractor/graspop.py @@ -0,0 +1,32 @@ +from .common import InfoExtractor +from ..utils import update_url, url_or_none +from ..utils.traversal import traverse_obj + + +class GraspopIE(InfoExtractor): + _VALID_URL = r'https?://vod\.graspop\.be/[a-z]{2}/(?P<id>\d+)/' + _TESTS = [{ + 'url': 'https://vod.graspop.be/fr/101556/thy-art-is-murder-concert/', + 'info_dict': { + 'id': '101556', + 'ext': 'mp4', + 'title': 'Thy Art Is Murder', + 'thumbnail': r're:https://cdn-mds\.pickx\.be/festivals/v3/global/original/.+\.jpg', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json( + f'https://tv.proximus.be/MWC/videocenter/festivals/{video_id}/stream', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + # Downgrade manifest request to avoid incomplete certificate chain error + update_url(metadata['source']['assetUri'], scheme='http'), video_id, 'mp4'), + **traverse_obj(metadata, { + 'title': ('name', {str}), + 'thumbnail': ('source', 'poster', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py index b9370e36c1..1668900378
100644 --- a/yt_dlp/extractor/gronkh.py +++ b/yt_dlp/extractor/gronkh.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from ..utils import ( OnDemandPagedList, + float_or_none, traverse_obj, unified_strdate, ) @@ -19,9 +20,11 @@ class GronkhIE(InfoExtractor): 'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', - 'upload_date': '20221111' + 'upload_date': '20221111', + 'chapters': 'count:3', + 'duration': 31463, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://gronkh.tv/stream/536', 'info_dict': { @@ -30,32 +33,39 @@ class GronkhIE(InfoExtractor): 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', - 'upload_date': '20211001' + 'upload_date': '20211001', + 'duration': 32058, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://gronkh.tv/watch/stream/546', 'only_matching': True, }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://api.gronkh.tv/v1/video/info?episode={id}', id) - m3u8_url = self._download_json(f'https://api.gronkh.tv/v1/video/playlist?episode={id}', id)['playlist_url'] - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + video_id = self._match_id(url) + data_json = self._download_json(f'https://api.gronkh.tv/v1/video/info?episode={video_id}', video_id) + m3u8_url = self._download_json(f'https://api.gronkh.tv/v1/video/playlist?episode={video_id}', video_id)['playlist_url'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) if data_json.get('vtt_url'): subtitles.setdefault('en', []).append({ 'url': data_json['vtt_url'], 'ext': 'vtt', }) return { - 'id': id, + 'id': video_id, 'title': data_json.get('title'), 'view_count': data_json.get('views'), 'thumbnail': data_json.get('preview_url'), 'upload_date': unified_strdate(data_json.get('created_at')), 'formats': formats, 'subtitles': subtitles, + 'duration': float_or_none(data_json.get('source_length')), + 'chapters': traverse_obj(data_json, ( + 'chapters', lambda _, v: float_or_none(v['offset']) is not None, { + 'title': 'title', + 'start_time': ('offset', {float_or_none}), + })) or None, } diff --git a/yt_dlp/extractor/groupon.py b/yt_dlp/extractor/groupon.py index 362d3ff831..a05fab1f14 100644 --- a/yt_dlp/extractor/groupon.py +++ b/yt_dlp/extractor/groupon.py @@ -31,7 +31,6 @@ class GrouponIE(InfoExtractor): } _PROVIDERS = { - 'ooyala': ('ooyala:%s', 'Ooyala'), 'youtube': ('%s', 'Youtube'), } @@ -51,8 +50,7 @@ def _real_extract(self, url): url_pattern, ie_key = self._PROVIDERS.get(provider.lower()) if not url_pattern: self.report_warning( - '%s: Unsupported video provider %s, skipping video' % - (playlist_id, provider)) + f'{playlist_id}: Unsupported video provider {provider}, skipping video') continue entries.append(self.url_result(url_pattern % video_id, ie_key)) diff --git a/yt_dlp/extractor/harpodeon.py b/yt_dlp/extractor/harpodeon.py index 0aa47337ff..aa3b2ca34a 100644 --- a/yt_dlp/extractor/harpodeon.py +++ b/yt_dlp/extractor/harpodeon.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import int_or_none class HarpodeonIE(InfoExtractor): @@ -14,8 +14,8 @@ class HarpodeonIE(InfoExtractor): 'title': 'The Smoking Out of Bella 
Butts', 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', 'creator': 'Vitagraph Company of America', - 'release_date': '19150101' - } + 'release_year': 1915, + }, }, { 'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288', 'md5': '6dfea5412845f690c7331be703f884db', @@ -25,8 +25,8 @@ class HarpodeonIE(InfoExtractor): 'title': 'The Smoking Out of Bella Butts', 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', 'creator': 'Vitagraph Company of America', - 'release_date': '19150101' - } + 'release_year': 1915, + }, }, { 'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710', 'md5': '7979df9ca04637282cb7d172ab3a9c3b', @@ -36,8 +36,8 @@ 'title': 'Behind the Screen', 'description': 'md5:008972a3dc51fba3965ee517d2ba9155', 'creator': 'Lone Star Corporation', - 'release_date': '19160101' - } + 'release_year': 1916, + }, }] def _real_extract(self, url): @@ -66,5 +66,5 @@ def _real_extract(self, url): 'http_headers': {'Referer': url}, 'description': self._html_search_meta('description', webpage, fatal=False), 'creator': creator, - 'release_date': unified_strdate(f'{release_year}0101') + 'release_year': int_or_none(release_year), } diff --git a/yt_dlp/extractor/hbo.py b/yt_dlp/extractor/hbo.py index 530bdb7270..34cff458d8 100644 --- a/yt_dlp/extractor/hbo.py +++ b/yt_dlp/extractor/hbo.py @@ -2,11 +2,12 @@ from .common import InfoExtractor from ..utils import ( - xpath_text, - xpath_element, int_or_none, + join_nonempty, parse_duration, urljoin, + xpath_element, + xpath_text, ) @@ -56,7 +57,7 @@ def _extract_info(self, url, display_id): episode_title = title = xpath_text(video_data, 'title', fatal=True) series = xpath_text(video_data, 'program') if series: - title = '%s - %s' % (series, title) + title = f'{series} - {title}' formats = [] for source in xpath_element(video_data, 'videos', 'sources', True): @@ -69,7 +70,7 @@ def _extract_info(self, url, display_id): height = format_info.get('height') fmt = { 'url': path, - 'format_id': 'http%s' % ('-%dp' % height if height else ''), + 'format_id': join_nonempty('http', height and f'{height}p'), 'width': format_info.get('width'), 'height': height, } @@ -107,7 +108,7 @@ def _extract_info(self, url, display_id): else: format_info = self._FORMATS_INFO.get(source.tag, {}) formats.append({ - 'format_id': 'http-%s' % source.tag, + 'format_id': f'http-{source.tag}', 'url': video_url, 'width': format_info.get('width'), 'height': format_info.get('height'), @@ -133,7 +134,7 @@ def _extract_info(self, url, display_id): subtitles = { 'en': [{ 'url': caption_url, - 'ext': 'ttml' + 'ext': 'ttml', }], } diff --git a/yt_dlp/extractor/hearthisat.py b/yt_dlp/extractor/hearthisat.py index d1a400d8cc..eb0a77952e 100644 --- a/yt_dlp/extractor/hearthisat.py +++ b/yt_dlp/extractor/hearthisat.py @@ -1,19 +1,20 @@ from .common import InfoExtractor from ..utils import ( - determine_ext, KNOWN_EXTENSIONS, + determine_ext, str_to_int, ) class HearThisAtIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$' + _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/?#]+)/(?P<title>[\w.-]+)' _PLAYLIST_URL = 'https://hearthis.at/playlist.php' _TESTS = [{ 'url': 'https://hearthis.at/moofi/dr-kreep', 'md5': 'ab6ec33c8fed6556029337c7885eb4e0', 'info_dict': { 'id': '150939', + 'display_id': 'moofi - dr-kreep', 'ext': 'wav', 'title': 'Moofi - Dr.
Kreep', 'thumbnail': r're:^https?://.*\.jpg$', @@ -21,15 +22,16 @@ class HearThisAtIE(InfoExtractor): 'description': 'md5:1adb0667b01499f9d27e97ddfd53852a', 'upload_date': '20150118', 'view_count': int, - 'duration': 71, - 'genre': 'Experimental', - } + 'duration': 70, + 'genres': ['Experimental'], + }, }, { # 'download' link redirects to the original webpage 'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/', 'md5': '5980ceb7c461605d30f1f039df160c6e', 'info_dict': { 'id': '811296', + 'display_id': 'twitchsf - dj-jim-hopkins-totally-bitchin-80s-dance-mix', 'ext': 'mp3', 'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!', 'description': 'md5:ef26815ca8f483272a87b137ff175be2', @@ -38,7 +40,39 @@ class HearThisAtIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'view_count': int, 'duration': 4360, - 'genre': 'Dance', + 'genres': ['Dance'], + }, + }, { + 'url': 'https://hearthis.at/tindalos/0001-tindalos-gnrique/eQd/', + 'md5': 'cd08e51911f147f6da2d9678905b0bd9', + 'info_dict': { + 'id': '2685222', + 'ext': 'mp3', + 'duration': 86, + 'view_count': int, + 'timestamp': 1545471670, + 'display_id': 'tindalos - 0001-tindalos-gnrique', + 'thumbnail': r're:^https?://.*\.jpg$', + 'genres': ['Other'], + 'title': 'Tindalos - Tindalos - générique n°1', + 'description': '', + 'upload_date': '20181222', + }, + }, { + 'url': 'https://hearthis.at/sithi2/biochip-c-classics-set-wolle-xdp-tresor.core-special-tresor-globus-berlin-13.07.20011/', + 'md5': 'b45ac60f0c8111eef6ddc10ec232e312', + 'info_dict': { + 'id': '7145959', + 'ext': 'mp3', + 'description': 'md5:d7ae36a453d78903f6b7ed6eb2fce1f2', + 'duration': 8986, + 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'md5:62669ce5b1b67f45c6f846033f37d3b9', + 'timestamp': 1588699409, + 'display_id': 'sithi2 - biochip-c-classics-set-wolle-xdp-tresor.core-special-tresor-globus-berlin-13.07.20011', + 'view_count': int, + 'upload_date': '20200505', + 'genres': ['Other'], }, }] diff --git a/yt_dlp/extractor/heise.py b/yt_dlp/extractor/heise.py index 27d737c049..01b700b157 100644 --- a/yt_dlp/extractor/heise.py +++ b/yt_dlp/extractor/heise.py @@ -105,7 +105,7 @@ class HeiseIE(InfoExtractor): 'description': 'md5:fa164d8c8707dff124a9626d39205f5d', 'timestamp': 1414825200, 'upload_date': '20141101', - } + }, }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -156,7 +156,7 @@ def _make_kaltura_result(kaltura_url): r'entry-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'kaltura id', default=None, group='id') if kaltura_id: - return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) + return _make_kaltura_result(f'kaltura:2238431:{kaltura_id}') yt_urls = tuple(YoutubeIE._extract_embed_urls(url, webpage)) if yt_urls: @@ -191,7 +191,7 @@ def _make_kaltura_result(kaltura_url): formats.append({ 'url': video_url, 'format_note': label, - 'format_id': '%s_%s' % (ext, label), + 'format_id': f'{ext}_{label}', 'height': height, }) diff --git a/yt_dlp/extractor/helsinki.py b/yt_dlp/extractor/helsinki.py deleted file mode 100644 index e518cae1ac..0000000000 --- a/yt_dlp/extractor/helsinki.py +++ /dev/null @@ -1,38 +0,0 @@ -from .common import InfoExtractor -from ..utils import js_to_json - - -class HelsinkiIE(InfoExtractor): - IE_DESC = 'helsinki.fi' - _VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P<id>\d+)' - _TEST = { - 'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258', - 'info_dict': { - 'id': 
'20258', - 'ext': 'mp4', - 'title': 'Tietotekniikkafoorumi-iltapäivä', - 'description': 'md5:f5c904224d43c133225130fe156a5ee0', - }, - 'params': { - 'skip_download': True, # RTMP - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - params = self._parse_json(self._html_search_regex( - r'(?s)jwplayer\("player"\).setup\((\{.*?\})\);', - webpage, 'player code'), video_id, transform_source=js_to_json) - formats = [{ - 'url': s['file'], - 'ext': 'mp4', - } for s in params['sources']] - - return { - 'id': video_id, - 'title': self._og_search_title(webpage).replace('Video: ', ''), - 'description': self._og_search_description(webpage), - 'formats': formats, - } diff --git a/yt_dlp/extractor/hentaistigma.py b/yt_dlp/extractor/hentaistigma.py deleted file mode 100644 index ca5ffc2aea..0000000000 --- a/yt_dlp/extractor/hentaistigma.py +++ /dev/null @@ -1,37 +0,0 @@ -from .common import InfoExtractor - - -class HentaiStigmaIE(InfoExtractor): - _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/', - 'md5': '4e3d07422a68a4cc363d8f57c8bf0d23', - 'info_dict': { - 'id': 'inyouchuu-etsu-bonus', - 'ext': 'mp4', - 'title': 'Inyouchuu Etsu Bonus', - 'age_limit': 18, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>', - webpage, 'title') - wrap_url = self._html_search_regex( - r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url') - wrap_webpage = self._download_webpage(wrap_url, video_id) - - video_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url') - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'age_limit': 18, - } diff --git a/yt_dlp/extractor/hidive.py b/yt_dlp/extractor/hidive.py index 8a8749859c..0cbe9913cc 100644 --- a/yt_dlp/extractor/hidive.py +++ b/yt_dlp/extractor/hidive.py @@ -47,16 +47,17 @@ def _perform_login(self, username, password): login_webpage = self._download_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) # If the user has multiple profiles on their account, select one. For now pick the first profile. 
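For context on the profile-selection change below: HiDive accounts with multiple profiles get a picker after login, so the extractor reads the first button's `data-profile-id` and `data-hash` attributes and POSTs them to `/ajax/chooseprofile`; the first lookup becomes non-fatal (`default=None`) because single-profile accounts never render the picker. A standalone sketch of that step, assuming the third-party `requests` library and an already-logged-in session; the endpoint, field names, and regexes are taken from the hunk itself:

import re
import requests

def choose_first_profile(session, login_page):
    profile_id = re.search(r'<button [^>]+?data-profile-id="(\w+)"', login_page)
    if not profile_id:
        return  # single-profile account: HiDive auto-selects it
    # When the picker is present, each button also carries a hash
    # that must accompany the profile id.
    profile_hash = re.search(r'<button [^>]+?data-hash="(\w+)"', login_page)
    session.post('https://www.hidive.com/ajax/chooseprofile', data={
        'profileId': profile_id.group(1),
        'hash': profile_hash.group(1),
        'returnUrl': '/dashboard',
    })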
- profile_id = self._search_regex(r'<button [^>]+?data-profile-id="(\w+)"', login_webpage, 'profile_id') + profile_id = self._search_regex( + r'<button [^>]+?data-profile-id="(\w+)"', login_webpage, 'profile id', default=None) if profile_id is None: return # If only one profile, Hidive auto-selects it - profile_id_hash = self._search_regex(r'\<button [^>]+?data-hash="(\w+)"', login_webpage, 'profile_id_hash') self._request_webpage( 'https://www.hidive.com/ajax/chooseprofile', None, data=urlencode_postdata({ 'profileId': profile_id, - 'hash': profile_id_hash, - 'returnUrl': '/dashboard' + 'hash': self._search_regex( + r'\<button [^>]+?data-hash="(\w+)"', login_webpage, 'profile id hash'), + 'returnUrl': '/dashboard', })) def _call_api(self, video_id, title, key, data={}, **kwargs): @@ -79,7 +80,7 @@ def _real_extract(self, url): self.raise_geo_restricted() if restriction and restriction != 'None': raise ExtractorError( - '%s said: %s' % (self.IE_NAME, restriction), expected=True) + f'{self.IE_NAME} said: {restriction}', expected=True) formats, parsed_urls = [], {None} for rendition_id, rendition in settings['renditions'].items(): @@ -114,5 +115,5 @@ def _real_extract(self, url): self._search_regex(r's(\d+)', key, 'season number', default=None)), 'episode_number': int_or_none( self._search_regex(r'e(\d+)', key, 'episode number', default=None)), - 'http_headers': {'Referer': url} + 'http_headers': {'Referer': url}, } diff --git a/yt_dlp/extractor/historicfilms.py b/yt_dlp/extractor/historicfilms.py index c428feeded..714f65114a 100644 --- a/yt_dlp/extractor/historicfilms.py +++ b/yt_dlp/extractor/historicfilms.py @@ -33,7 +33,7 @@ def _real_extract(self, url): duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration')) - video_url = 'http://www.historicfilms.com/video/%s_%s_web.mov' % (tape_id, video_id) + video_url = f'http://www.historicfilms.com/video/{tape_id}_{video_id}_web.mov' return { 'id': video_id, diff --git a/yt_dlp/extractor/hitbox.py b/yt_dlp/extractor/hitbox.py deleted file mode 100644 index f0c6898836..0000000000 --- a/yt_dlp/extractor/hitbox.py +++ /dev/null @@ -1,209 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - clean_html, - determine_ext, - float_or_none, - int_or_none, - parse_iso8601, -) - - -class HitboxIE(InfoExtractor): - IE_NAME = 'hitbox' - _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.hitbox.tv/video/203213', - 'info_dict': { - 'id': '203213', - 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy', - 'alt_title': 'hitboxlive - Aug 9th #6', - 'description': '', - 'ext': 'mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 215.1666, - 'resolution': 'HD 720p', - 'uploader': 'hitboxlive', - 'view_count': int, - 'timestamp': 1407576133, - 'upload_date': '20140809', - 'categories': ['Live Show'], - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://www.smashcast.tv/hitboxlive/videos/203213', - 'only_matching': True, - }] - - def _extract_metadata(self, url, video_id): - thumb_base = 'https://edge.sf.hitbox.tv' - metadata = self._download_json( - '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON') - - date = 'media_live_since' - media_type = 'livestream' - if metadata.get('media_type') == 'video': - media_type = 'video' - date = 'media_date_added' - - video_meta = metadata.get(media_type, [])[0] - 
title = video_meta.get('media_status') - alt_title = video_meta.get('media_title') - description = clean_html( - video_meta.get('media_description') - or video_meta.get('media_description_md')) - duration = float_or_none(video_meta.get('media_duration')) - uploader = video_meta.get('media_user_name') - views = int_or_none(video_meta.get('media_views')) - timestamp = parse_iso8601(video_meta.get(date), ' ') - categories = [video_meta.get('category_name')] - thumbs = [{ - 'url': thumb_base + video_meta.get('media_thumbnail'), - 'width': 320, - 'height': 180 - }, { - 'url': thumb_base + video_meta.get('media_thumbnail_large'), - 'width': 768, - 'height': 432 - }] - - return { - 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'ext': 'mp4', - 'thumbnails': thumbs, - 'duration': duration, - 'uploader': uploader, - 'view_count': views, - 'timestamp': timestamp, - 'categories': categories, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - player_config = self._download_json( - 'https://www.smashcast.tv/api/player/config/video/%s' % video_id, - video_id, 'Downloading video JSON') - - formats = [] - for video in player_config['clip']['bitrates']: - label = video.get('label') - if label == 'Auto': - continue - video_url = video.get('url') - if not video_url: - continue - bitrate = int_or_none(video.get('bitrate')) - if determine_ext(video_url) == 'm3u8': - if not video_url.startswith('http'): - continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'tbr': bitrate, - 'format_note': label, - 'protocol': 'm3u8_native', - }) - else: - formats.append({ - 'url': video_url, - 'tbr': bitrate, - 'format_note': label, - }) - - metadata = self._extract_metadata( - 'https://www.smashcast.tv/api/media/video', video_id) - metadata['formats'] = formats - - return metadata - - -class HitboxLiveIE(HitboxIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'hitbox:live' - _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.hitbox.tv/dimak', - 'info_dict': { - 'id': 'dimak', - 'ext': 'mp4', - 'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e', - 'timestamp': int, - 'upload_date': compat_str, - 'title': compat_str, - 'uploader': 'Dimak', - }, - 'params': { - # live - 'skip_download': True, - }, - }, { - 'url': 'https://www.smashcast.tv/dimak', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - - player_config = self._download_json( - 'https://www.smashcast.tv/api/player/config/live/%s' % video_id, - video_id) - - formats = [] - cdns = player_config.get('cdns') - servers = [] - for cdn in cdns: - # Subscribe URLs are not playable - if cdn.get('rtmpSubscribe') is True: - continue - base_url = cdn.get('netConnectionUrl') - host = re.search(r'.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1) - if base_url not in servers: - servers.append(base_url) - for stream in cdn.get('bitrates'): - label = stream.get('label') - if label == 'Auto': - continue - stream_url = stream.get('url') - if not stream_url: - continue - bitrate = int_or_none(stream.get('bitrate')) - if stream.get('provider') == 'hls' or determine_ext(stream_url) == 'm3u8': - if not stream_url.startswith('http'): - continue - formats.append({ - 'url': stream_url, - 'ext': 'mp4', - 'tbr': bitrate, - 'format_note': label, - 'rtmp_live': True, - }) - else: 
- formats.append({ - 'url': '%s/%s' % (base_url, stream_url), - 'ext': 'mp4', - 'tbr': bitrate, - 'rtmp_live': True, - 'format_note': host, - 'page_url': url, - 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf', - }) - - metadata = self._extract_metadata( - 'https://www.smashcast.tv/api/media/live', video_id) - metadata['formats'] = formats - metadata['is_live'] = True - metadata['title'] = metadata.get('title') - - return metadata diff --git a/yt_dlp/extractor/hitrecord.py b/yt_dlp/extractor/hitrecord.py index 902af44fab..3c3d7f9424 100644 --- a/yt_dlp/extractor/hitrecord.py +++ b/yt_dlp/extractor/hitrecord.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, float_or_none, @@ -27,14 +26,14 @@ class HitRecordIE(InfoExtractor): 'like_count': int, 'comment_count': int, 'tags': list, - } + }, } def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'https://hitrecord.org/api/web/records/%s' % video_id, video_id) + f'https://hitrecord.org/api/web/records/{video_id}', video_id) title = video['title'] video_url = video['source_url']['mp4_url'] @@ -46,7 +45,7 @@ def _real_extract(self, url): t['text'] for t in tags_list if isinstance(t, dict) and t.get('text') - and isinstance(t['text'], compat_str)] + and isinstance(t['text'], str)] return { 'id': video_id, @@ -56,9 +55,9 @@ def _real_extract(self, url): 'duration': float_or_none(video.get('duration'), 1000), 'timestamp': int_or_none(video.get('created_at_i')), 'uploader': try_get( - video, lambda x: x['user']['username'], compat_str), + video, lambda x: x['user']['username'], str), 'uploader_id': try_get( - video, lambda x: compat_str(x['user']['id'])), + video, lambda x: str(x['user']['id'])), 'view_count': int_or_none(video.get('total_views_count')), 'like_count': int_or_none(video.get('hearts_count')), 'comment_count': int_or_none(video.get('comments_count')), diff --git a/yt_dlp/extractor/hketv.py b/yt_dlp/extractor/hketv.py index 10879564fa..3998abc121 100644 --- a/yt_dlp/extractor/hketv.py +++ b/yt_dlp/extractor/hketv.py @@ -1,8 +1,7 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, merge_dicts, parse_count, @@ -45,9 +44,6 @@ class HKETVIE(InfoExtractor): 'duration': 907, 'subtitles': {}, }, - 'params': { - 'geo_verification_proxy': '<HK proxy here>', - }, 'skip': 'Geo restricted to HK', }] @@ -126,7 +122,7 @@ def _real_extract(self, url): # If we ever wanted to provide the final resolved URL that # does not require cookies, albeit with a shorter lifespan: # urlh = self._downloader.urlopen(file_url) - # resolved_url = urlh.geturl() + # resolved_url = urlh.url label = fmt.get('label') h = self._FORMAT_HEIGHTS.get(label) w = h * width // height if h and width and height else None @@ -144,7 +140,7 @@ def _real_extract(self, url): if not isinstance(track, dict): continue track_kind = str_or_none(track.get('kind')) - if not track_kind or not isinstance(track_kind, compat_str): + if not track_kind or not isinstance(track_kind, str): continue if track_kind.lower() not in ('captions', 'subtitles'): continue diff --git a/yt_dlp/extractor/hollywoodreporter.py b/yt_dlp/extractor/hollywoodreporter.py new file mode 100644 index 0000000000..52db5e5c13 --- /dev/null +++ b/yt_dlp/extractor/hollywoodreporter.py @@ -0,0 +1,72 @@ +import functools +import re + +from .common import InfoExtractor +from .jwplatform 
import JWPlatformIE +from ..utils import ( + ExtractorError, + OnDemandPagedList, + extract_attributes, + get_element_by_class, + get_element_html_by_class, +) + + +class HollywoodReporterIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/video/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/video/chris-pine-michelle-rodriguez-dungeons-dragons-cast-directors-on-what-it-took-to-make-film-sxsw-2023/', + 'info_dict': { + 'id': 'zH4jZaR5', + 'ext': 'mp4', + 'title': 'md5:a9a1c073770a32f178955997712c4bd9', + 'description': 'The cast and directors of \'Dungeons & Dragons: Honor Among Thieves\' talk about their new film.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/zH4jZaR5/poster.jpg?width=720', + 'upload_date': '20230312', + 'timestamp': 1678586423, + 'duration': 242.0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + data = extract_attributes(get_element_html_by_class('vlanding-video-card__link', webpage) or '') + video_id = data['data-video-showcase-trigger'] + showcase_type = data['data-video-showcase-type'] + + if showcase_type == 'jwplayer': + return self.url_result(f'jwplatform:{video_id}', JWPlatformIE) + elif showcase_type == 'youtube': + return self.url_result(video_id, 'Youtube') + else: + raise ExtractorError(f'Unsupported showcase type "{showcase_type}"') + + +class HollywoodReporterPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/vcategory/(?P<slug>[\w-]+)-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/vcategory/heat-vision-breakdown-57822/', + 'playlist_mincount': 109, + 'info_dict': { + 'id': '57822', + 'title': 'heat-vision-breakdown', + }, + }] + + def _fetch_page(self, slug, pl_id, page): + page += 1 + webpage = self._download_webpage( + f'https://www.hollywoodreporter.com/vcategory/{slug}-{pl_id}/page/{page}/', + pl_id, note=f'Downloading playlist page {page}') + section = get_element_by_class('video-playlist-river', webpage) or '' + + for url in re.findall(r'<a[^>]+href="([^"]+)"[^>]+class="c-title__link', section): + yield self.url_result(url, HollywoodReporterIE) + + def _real_extract(self, url): + slug, pl_id = self._match_valid_url(url).group('slug', 'id') + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, slug, pl_id), 15), pl_id, slug) diff --git a/yt_dlp/extractor/holodex.py b/yt_dlp/extractor/holodex.py index a2b73ecc1c..00b045ee7f 100644 --- a/yt_dlp/extractor/holodex.py +++ b/yt_dlp/extractor/holodex.py @@ -46,7 +46,7 @@ class HolodexIE(InfoExtractor): 'url': 'https://holodex.net/watch/_m2mQyaofjI?foo=bar&playlist=69', 'info_dict': { 'id': '69', - 'title': '拿著金斧頭的藍髮大姊姊' + 'title': '拿著金斧頭的藍髮大姊姊', }, 'playlist_count': 3, }, { diff --git a/yt_dlp/extractor/hotnewhiphop.py b/yt_dlp/extractor/hotnewhiphop.py index f8570cb861..8573e8947d 100644 --- a/yt_dlp/extractor/hotnewhiphop.py +++ b/yt_dlp/extractor/hotnewhiphop.py @@ -1,14 +1,12 @@ +import base64 + from .common import InfoExtractor -from ..compat import compat_b64decode -from ..utils import ( - ExtractorError, - HEADRequest, - sanitized_Request, - urlencode_postdata, -) +from ..networking import HEADRequest, Request +from ..utils import ExtractorError, urlencode_postdata class HotNewHipHopIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 
'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', @@ -16,8 +14,8 @@ class HotNewHipHopIE(InfoExtractor): 'info_dict': { 'id': '1435540', 'ext': 'mp3', - 'title': 'Freddie Gibbs - Lay It Down' - } + 'title': 'Freddie Gibbs - Lay It Down', + }, } def _real_extract(self, url): @@ -36,21 +34,21 @@ def _real_extract(self, url): ('mediaType', 's'), ('mediaId', video_id), ]) - r = sanitized_Request( + r = Request( 'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata) - r.add_header('Content-Type', 'application/x-www-form-urlencoded') + r.headers['Content-Type'] = 'application/x-www-form-urlencoded' mkd = self._download_json( r, video_id, note='Requesting media key', errnote='Could not download media key') if 'mediaKey' not in mkd: raise ExtractorError('Did not get a media key') - redirect_url = compat_b64decode(video_url_base64).decode('utf-8') + redirect_url = base64.b64decode(video_url_base64).decode('utf-8') redirect_req = HEADRequest(redirect_url) req = self._request_webpage( redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL') - video_url = req.geturl() + video_url = req.url if video_url.endswith('.html'): raise ExtractorError('Redirect failed') diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index cea1812f15..e97740c90b 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -6,7 +6,7 @@ import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -31,7 +31,7 @@ def _call_api_v1(self, path, *args, **kwargs): def _call_api_impl(self, path, video_id, query, st=None, cookies=None): st = int_or_none(st) or int(time.time()) exp = st + 6000 - auth = 'st=%d~exp=%d~acl=/*' % (st, exp) + auth = f'st={st}~exp={exp}~acl=/*' auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() if cookies and cookies.get('userUP'): @@ -40,7 +40,7 @@ def _call_api_impl(self, path, video_id, query, st=None, cookies=None): token = self._download_json( f'{self._API_URL}/um/v3/users', video_id, note='Downloading token', - data=json.dumps({"device_ids": [{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'), + data=json.dumps({'device_ids': [{'id': str(uuid.uuid4()), 'type': 'device_id'}]}).encode(), headers={ 'hotstarauth': auth, 'x-hs-platform': 'PCTV', # or 'web' @@ -65,7 +65,7 @@ def _call_api_v2(self, path, video_id, st=None, cookies=None): return self._call_api_impl( f'{path}/content/{video_id}', video_id, st=st, cookies=cookies, query={ 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265', - 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()), + 'device-id': cookies.get('device_id').value if cookies.get('device_id') else str(uuid.uuid4()), 'os-name': 'Windows', 'os-version': '10', }) @@ -83,7 +83,7 @@ class HotStarIE(HotStarBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) (?: - (?P<type>movies|sports|episode|(?P<tv>tv))/ + (?P<type>movies|sports|clips|episode|(?P<tv>tv|shows))/ (?(tv)(?:[^/?#]+/){2}|[^?#]*) )? 
[^/?#]+/ @@ -114,14 +114,78 @@ class HotStarIE(HotStarBaseIE): 'upload_date': '20190501', 'duration': 1219, 'channel': 'StarPlus', - 'channel_id': 3, + 'channel_id': '3', 'series': 'Ek Bhram - Sarvagun Sampanna', 'season': 'Chapter 1', 'season_number': 1, - 'season_id': 6771, + 'season_id': '6771', 'episode': 'Janhvi Targets Suman', 'episode_number': 8, - } + }, + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843', + 'info_dict': { + 'id': '1000282843', + 'ext': 'mp4', + 'title': 'Anupama, Anuj Share a Moment', + 'season': 'Chapter 1', + 'description': 'md5:8d74ed2248423b8b06d5c8add4d7a0c0', + 'timestamp': 1678149000, + 'channel': 'StarPlus', + 'series': 'Anupama', + 'season_number': 1, + 'season_id': '7399', + 'upload_date': '20230307', + 'episode': 'Anupama, Anuj Share a Moment', + 'episode_number': 853, + 'duration': 1272, + 'channel_id': '3', + }, + 'skip': 'HTTP Error 504: Gateway Time-out', # XXX: Investigate 504 errors on some episodes + }, { + 'url': 'https://www.hotstar.com/in/shows/kana-kaanum-kaalangal/1260097087/back-to-school/1260097320', + 'info_dict': { + 'id': '1260097320', + 'ext': 'mp4', + 'title': 'Back To School', + 'season': 'Chapter 1', + 'description': 'md5:b0d6a4c8a650681491e7405496fc7e13', + 'timestamp': 1650564000, + 'channel': 'Hotstar Specials', + 'series': 'Kana Kaanum Kaalangal', + 'season_number': 1, + 'season_id': '9441', + 'upload_date': '20220421', + 'episode': 'Back To School', + 'episode_number': 1, + 'duration': 1810, + 'channel_id': '54', + }, + }, { + 'url': 'https://www.hotstar.com/in/clips/e3-sairat-kahani-pyaar-ki/1000262286', + 'info_dict': { + 'id': '1000262286', + 'ext': 'mp4', + 'title': 'E3 - SaiRat, Kahani Pyaar Ki', + 'description': 'md5:e3b4b3203bc0c5396fe7d0e4948a6385', + 'episode': 'E3 - SaiRat, Kahani Pyaar Ki', + 'upload_date': '20210606', + 'timestamp': 1622943900, + 'duration': 5395, + }, + }, { + 'url': 'https://www.hotstar.com/in/movies/premam/1000091195', + 'info_dict': { + 'id': '1000091195', + 'ext': 'mp4', + 'title': 'Premam', + 'release_year': 2015, + 'description': 'md5:d833c654e4187b5e34757eafb5b72d7f', + 'timestamp': 1462149000, + 'upload_date': '20160502', + 'episode': 'Premam', + 'duration': 8994, + }, }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, @@ -139,6 +203,8 @@ class HotStarIE(HotStarBaseIE): 'sports': 'match', 'episode': 'episode', 'tv': 'episode', + 'shows': 'episode', + 'clips': 'content', None: 'content', } @@ -166,8 +232,10 @@ def _real_extract(self, url): video_type = self._TYPE.get(video_type, video_type) cookies = self._get_cookies(url) # Cookies before any request - video_data = self._call_api_v1(f'{video_type}/detail', video_id, - query={'tas': 10000, 'contentId': video_id})['body']['results']['item'] + video_data = traverse_obj( + self._call_api_v1( + f'{video_type}/detail', video_id, fatal=False, query={'tas': 10000, 'contentId': video_id}), + ('body', 'results', 'item', {dict})) or {} if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) @@ -188,7 +256,6 @@ def _real_extract(self, url): for key, prefix in self._IGNORE_MAP.items() for ignore in self._configuration_arg(key)): continue - tag_dict = dict((t.split(':', 1) + [None])[:2] for t in tags.split(';')) format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: @@ -213,10 +280,11 @@ def _real_extract(self, url): 'height': int_or_none(playback_set.get('height')), }] except 
ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: geo_restricted = True continue + tag_dict = dict((*t.split(':', 1), None)[:2] for t in tags.split(';')) if tag_dict.get('encryption') not in ('plain', None): for f in current_formats: f['has_drm'] = True @@ -252,14 +320,15 @@ def _real_extract(self, url): 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(traverse_obj(video_data, 'broadcastDate', 'startDate')), + 'release_year': int_or_none(video_data.get('year')), 'formats': formats, 'subtitles': subs, 'channel': video_data.get('channelName'), - 'channel_id': video_data.get('channelId'), + 'channel_id': str_or_none(video_data.get('channelId')), 'series': video_data.get('showName'), 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), - 'season_id': video_data.get('seasonId'), + 'season_id': str_or_none(video_data.get('seasonId')), 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episodeNo')), } @@ -304,13 +373,16 @@ def _real_extract(self, url): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { 'id': '3_2_26', }, 'playlist_mincount': 20, + }, { + 'url': 'https://www.hotstar.com/shows/savdhaan-india/s-26/list/popular-clips/t-3_2_26', + 'only_matching': True, }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, @@ -327,7 +399,7 @@ def _real_extract(self, url): class HotStarSeasonIE(HotStarBaseIE): IE_NAME = 'hotstar:season' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', 'info_dict': { @@ -346,6 +418,9 @@ class HotStarSeasonIE(HotStarBaseIE): 'id': '8208', }, 'playlist_mincount': 19, + }, { + 'url': 'https://www.hotstar.com/in/shows/bigg-boss/14714/seasons/season-4/ss-8208/', + 'only_matching': True, }] def _real_extract(self, url): @@ -356,7 +431,7 @@ def _real_extract(self, url): class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -375,6 +450,12 @@ class HotStarSeriesIE(HotStarBaseIE): 'id': '435', }, 'playlist_mincount': 267, + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/', + 'info_dict': { + 'id': '1260022017', + }, + 'playlist_mincount': 940, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/howcast.py b/yt_dlp/extractor/howcast.py deleted file mode 100644 index 59cf80f1a4..0000000000 --- a/yt_dlp/extractor/howcast.py +++ /dev/null @@ -1,41 +0,0 @@ -from .common import InfoExtractor -from ..utils import parse_iso8601 - - -class 
HowcastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', - 'md5': '7d45932269a288149483144f01b99789', - 'info_dict': { - 'id': '390161', - 'ext': 'mp4', - 'title': 'How to Tie a Square Knot Properly', - 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3', - 'timestamp': 1276081287, - 'upload_date': '20100609', - 'duration': 56.823, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - embed_code = self._search_regex( - r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b', - webpage, 'ooyala embed code') - - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % embed_code, - 'id': video_id, - 'timestamp': parse_iso8601(self._html_search_meta( - 'article:published_time', webpage, 'timestamp')), - } diff --git a/yt_dlp/extractor/howstuffworks.py b/yt_dlp/extractor/howstuffworks.py deleted file mode 100644 index 238fc0b42d..0000000000 --- a/yt_dlp/extractor/howstuffworks.py +++ /dev/null @@ -1,86 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - find_xpath_attr, - int_or_none, - js_to_json, - unescapeHTML, - determine_ext, -) - - -class HowStuffWorksIE(InfoExtractor): - _VALID_URL = r'https?://[\da-z-]+\.(?:howstuffworks|stuff(?:(?:youshould|theydontwantyouto)know|toblowyourmind|momnevertoldyou)|(?:brain|car)stuffshow|fwthinking|geniusstuff)\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm' - _TESTS = [ - { - 'url': 'http://www.stufftoblowyourmind.com/videos/optical-illusions-video.htm', - 'md5': '76646a5acc0c92bf7cd66751ca5db94d', - 'info_dict': { - 'id': '855410', - 'ext': 'mp4', - 'title': 'Your Trickster Brain: Optical Illusions -- Science on the Web', - 'description': 'md5:e374ff9561f6833ad076a8cc0a5ab2fb', - }, - }, - { - 'url': 'http://shows.howstuffworks.com/more-shows/why-does-balloon-stick-to-hair-video.htm', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - clip_js = self._search_regex( - r'(?s)var clip = ({.*?});', webpage, 'clip info') - clip_info = self._parse_json( - clip_js, display_id, transform_source=js_to_json) - - video_id = clip_info['content_id'] - formats = [] - m3u8_url = clip_info.get('m3u8') - if m3u8_url and determine_ext(m3u8_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True)) - flv_url = clip_info.get('flv_url') - if flv_url: - formats.append({ - 'url': flv_url, - 'format_id': 'flv', - }) - for video in clip_info.get('mp4', []): - formats.append({ - 'url': video['src'], - 'format_id': 'mp4-%s' % video['bitrate'], - 'vbr': int_or_none(video['bitrate'].rstrip('k')), - }) - - if not formats: - smil = self._download_xml( - 'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % video_id, - video_id, 'Downloading video SMIL') - - http_base = find_xpath_attr( - smil, - './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'), - 'name', - 'httpBase').get('content') - - URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=A&g=A' - - for video in smil.findall( - './{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')): - vbr = int_or_none(video.attrib['system-bitrate'], scale=1000) - formats.append({ - 'url': '%s/%s%s' % (http_base, 
video.attrib['src'], URL_SUFFIX), - 'format_id': '%dk' % vbr, - 'vbr': vbr, - }) - - return { - 'id': '%s' % video_id, - 'display_id': display_id, - 'title': unescapeHTML(clip_info['clip_title']), - 'description': unescapeHTML(clip_info.get('caption')), - 'thumbnail': clip_info.get('video_still_url'), - 'duration': int_or_none(clip_info.get('duration')), - 'formats': formats, - } diff --git a/yt_dlp/extractor/hrefli.py b/yt_dlp/extractor/hrefli.py new file mode 100644 index 0000000000..77db2ea687 --- /dev/null +++ b/yt_dlp/extractor/hrefli.py @@ -0,0 +1,15 @@ +from .common import InfoExtractor + + +class HrefLiRedirectIE(InfoExtractor): + IE_NAME = 'href.li' + IE_DESC = False # Do not list + _VALID_URL = r'https?://href\.li/\?(?P<url>.+)' + + _TESTS = [{ + 'url': 'https://href.li/?https://www.reddit.com/r/cats/comments/12bluel/my_cat_helps_me_with_water/?utm_source=share&utm_medium=android_app&utm_name=androidcss&utm_term=1&utm_content=share_button', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result(self._match_valid_url(url).group('url')) diff --git a/yt_dlp/extractor/hrfensehen.py b/yt_dlp/extractor/hrfensehen.py index 35e9f67c4a..17673d5b8f 100644 --- a/yt_dlp/extractor/hrfensehen.py +++ b/yt_dlp/extractor/hrfensehen.py @@ -24,17 +24,17 @@ class HRFernsehenIE(InfoExtractor): 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / ' 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music', 'subtitles': {'de': [{ - 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' + 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt', }]}, 'timestamp': 1598400000, 'upload_date': '20200826', 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', 'title': 'hessenschau vom 26.08.2020', - 'duration': 1654 - } + 'duration': 1654, + }, }, { 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', - 'only_matching': True + 'only_matching': True, }] _GEO_COUNTRIES = ['DE'] @@ -74,7 +74,7 @@ def _real_extract(self, url): subtitle = traverse_obj(loader_data, ('mediaCollection', 'subTitles', 0, 'sources', 0, 'url')) - info = { + return { 'id': video_id, 'title': title, 'description': description, @@ -86,5 +86,3 @@ def _real_extract(self, url): loader_data, ('playerConfig', 'pluginData', 'trackingAti@all', 'richMedia', 'duration'))), 'thumbnail': self._search_regex(r'thumbnailUrl\W*([^"]+)', webpage, 'thumbnail', default=None), } - - return info diff --git a/yt_dlp/extractor/hrti.py b/yt_dlp/extractor/hrti.py index cfec80d144..84e3867d34 100644 --- a/yt_dlp/extractor/hrti.py +++ b/yt_dlp/extractor/hrti.py @@ -1,13 +1,13 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, parse_age_limit, - sanitized_Request, try_get, ) @@ -28,21 +28,21 @@ class HRTiBaseIE(InfoExtractor): def _initialize_pre_login(self): init_data = { - 'application_publication_id': self._APP_PUBLICATION_ID + 'application_publication_id': self._APP_PUBLICATION_ID, } uuid = self._download_json( self._API_URL, None, note='Downloading uuid', errnote='Unable to download uuid', - data=json.dumps(init_data).encode('utf-8'))['uuid'] + 
data=json.dumps(init_data).encode())['uuid'] app_data = { 'uuid': uuid, 'application_publication_id': self._APP_PUBLICATION_ID, - 'application_version': self._APP_VERSION + 'application_version': self._APP_VERSION, } - req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) + req = Request(self._API_URL, data=json.dumps(app_data).encode()) req.get_method = lambda: 'PUT' resources = self._download_json( @@ -71,17 +71,17 @@ def _perform_login(self, username, password): try: auth_info = self._download_json( self._login_url, None, note='Logging in', errnote='Unable to log in', - data=json.dumps(auth_data).encode('utf-8')) + data=json.dumps(auth_data).encode()) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: - auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + if isinstance(e.cause, HTTPError) and e.cause.status == 406: + auth_info = self._parse_json(e.cause.response.read().encode(), None) else: raise error_message = auth_info.get('error', {}).get('message') if error_message: raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error_message), + f'{self.IE_NAME} said: {error_message}', expected=True) self._token = auth_info['secure_streaming_token'] @@ -133,7 +133,7 @@ def _real_extract(self, url): display_id = mobj.group('display_id') or video_id video = self._download_json( - '%s/video_id/%s/format/json' % (self._search_url, video_id), + f'{self._search_url}/video_id/{video_id}/format/json', display_id, 'Downloading video metadata JSON')['video'][0] title_info = video['title'] @@ -188,13 +188,13 @@ def _real_extract(self, url): display_id = mobj.group('display_id') or category_id response = self._download_json( - '%s/category_id/%s/format/json' % (self._search_url, category_id), + f'{self._search_url}/category_id/{category_id}/format/json', display_id, 'Downloading video metadata JSON') video_ids = try_get( response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], list) or [video['id'] for video in response.get('videos', []) if video.get('id')] - entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] + entries = [self.url_result(f'hrti:{video_id}') for video_id in video_ids] return self.playlist_result(entries, category_id, display_id) diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py index 3cb21d2dd1..d9004293ff 100644 --- a/yt_dlp/extractor/hse.py +++ b/yt_dlp/extractor/hse.py @@ -39,7 +39,7 @@ class HSEShowIE(HSEShowBaseInfoExtractor): 'timestamp': 1638810000, 'upload_date': '20211206', 'channel': 'HSE24', - 'uploader': 'Arina Pirayesh' + 'uploader': 'Arina Pirayesh', }, 'params': {'skip_download': 'm3u8'}, }] @@ -72,7 +72,7 @@ class HSEProductIE(HSEShowBaseInfoExtractor): 'id': '408630', 'ext': 'mp4', 'title': 'Hose im Ponte-Mix', - 'uploader': 'Judith Williams' + 'uploader': 'Judith Williams', }, 'params': {'skip_download': 'm3u8'}, }] diff --git a/yt_dlp/extractor/huajiao.py b/yt_dlp/extractor/huajiao.py index c498fa330c..093ce7dc26 100644 --- a/yt_dlp/extractor/huajiao.py +++ b/yt_dlp/extractor/huajiao.py @@ -22,7 +22,7 @@ class HuajiaoIE(InfoExtractor): 'upload_date': '20161007', 'uploader': 'Penny_余姿昀', 'uploader_id': '75206005', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/huffpost.py b/yt_dlp/extractor/huffpost.py index 69fdc34ef7..156ddebf7f 100644 --- a/yt_dlp/extractor/huffpost.py +++ b/yt_dlp/extractor/huffpost.py @@ -40,7 +40,7 @@ class HuffPostIE(InfoExtractor): def _real_extract(self, url): video_id = 
self._match_id(url) - api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id + api_url = f'http://embed.live.huffingtonpost.com/api/segments/{video_id}.json' data = self._download_json(api_url, video_id)['data'] video_title = data['title'] diff --git a/yt_dlp/extractor/hungama.py b/yt_dlp/extractor/hungama.py index 2e9939601f..a687b12dc5 100644 --- a/yt_dlp/extractor/hungama.py +++ b/yt_dlp/extractor/hungama.py @@ -1,19 +1,32 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, + remove_end, + traverse_obj, try_get, + unified_timestamp, + url_or_none, urlencode_postdata, ) -class HungamaIE(InfoExtractor): +class HungamaBaseIE(InfoExtractor): + def _call_api(self, path, content_id, fatal=False): + return traverse_obj(self._download_json( + f'https://cpage.api.hungama.com/v2/page/content/{content_id}/{path}/detail', + content_id, fatal=fatal, query={ + 'device': 'web', + 'platform': 'a', + 'storeId': '1', + }), ('data', {dict})) or {} + + +class HungamaIE(HungamaBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?hungama\.com/ + (?:www\.|un\.)?hungama\.com/ (?: - (?:video|movie)/[^/]+/| + (?:video|movie|short-film)/[^/]+/| tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ ) (?P<id>\d+) @@ -25,13 +38,28 @@ class HungamaIE(InfoExtractor): 'id': '39349649', 'ext': 'mp4', 'title': 'Krishna Chants', - 'description': 'Watch Krishna Chants video now. You can also watch other latest videos only at Hungama', + 'description': ' ', 'upload_date': '20180829', 'duration': 264, 'timestamp': 1535500800, 'view_count': int, - 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', - } + 'thumbnail': 'https://images1.hungama.com/tr:n-a_169_m/c/1/0dc/2ca/39349649/39349649_350x197.jpg?v=8', + 'tags': 'count:6', + }, + }, { + 'url': 'https://un.hungama.com/short-film/adira/102524179/', + 'md5': '2278463f5dc9db9054d0c02602d44666', + 'info_dict': { + 'id': '102524179', + 'ext': 'mp4', + 'title': 'Adira', + 'description': 'md5:df20cd4d41eabb33634f06de1025a4b4', + 'upload_date': '20230417', + 'timestamp': 1681689600, + 'view_count': int, + 'thumbnail': 'https://images1.hungama.com/tr:n-a_23_m/c/1/197/ac9/102524179/102524179_350x525.jpg?v=1', + 'tags': 'count:7', + }, }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', 'only_matching': True, @@ -51,46 +79,64 @@ def _real_extract(self, url): 'c': 'common', 'm': 'get_video_mdn_url', }) - formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') - - json_ld = self._search_json_ld( - self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + metadata = self._call_api('movie', video_id) return { - **json_ld, + **traverse_obj(metadata, ('head', 'data', { + 'title': ('title', {str}), + 'description': ('misc', 'description', {str}), + 'duration': ('duration', {int}), # duration in JSON is incorrect if string + 'timestamp': ('releasedate', {unified_timestamp}), + 'view_count': ('misc', 'playcount', {int_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'tags': ('misc', 'keywords', ..., {str}), + })), 'id': video_id, 'formats': formats, 'subtitles': { 'en': [{ 'url': video_json['sub_title'], 'ext': 'vtt', - }] + }], } if video_json.get('sub_title') else None, } class HungamaSongIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 
'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', - 'md5': 'd4a6a05a394ad0453a9bea3ca00e6024', + 'md5': '964f46828e8b250aa35e5fdcfdcac367', 'info_dict': { 'id': '2931166', 'ext': 'mp3', 'title': 'Lucky Ali - Kitni Haseen Zindagi', 'track': 'Kitni Haseen Zindagi', 'artist': 'Lucky Ali', - 'album': None, 'release_year': 2000, - } - } + 'thumbnail': 'https://stat2.hungama.ind.in/assets/images/default_images/da-200x200.png', + }, + }, { + 'url': 'https://un.hungama.com/song/tum-kya-mile-from-rocky-aur-rani-kii-prem-kahaani/103553672', + 'md5': '964f46828e8b250aa35e5fdcfdcac367', + 'info_dict': { + 'id': '103553672', + 'ext': 'mp3', + 'title': 'md5:5ebeb1e10771b634ce5f700ce68ae5f4', + 'track': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")', + 'artist': 'Pritam Chakraborty, Arijit Singh, Shreya Ghoshal, Amitabh Bhattacharya', + 'album': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")', + 'release_year': 2023, + 'thumbnail': 'https://images.hungama.com/c/1/7c2/c7b/103553671/103553671_200x200.jpg', + }, + }] def _real_extract(self, url): audio_id = self._match_id(url) data = self._download_json( - 'https://www.hungama.com/audio-player-data/track/%s' % audio_id, + f'https://www.hungama.com/audio-player-data/track/{audio_id}', audio_id, query={'_country': 'IN'})[0] track = data['song_name'] artist = data.get('singer_name') @@ -107,7 +153,7 @@ def _real_extract(self, url): 'acodec': media_type, }) - title = '%s - %s' % (artist, track) if artist else track + title = f'{artist} - {track}' if artist else track thumbnail = data.get('img_src') or data.get('album_image') return { @@ -122,8 +168,8 @@ def _real_extract(self, url): } -class HungamaAlbumPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/(?:playlists|album)/[^/]+/(?P<id>\d+)' +class HungamaAlbumPlaylistIE(HungamaBaseIE): + _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/(?P<path>playlists|album)/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.hungama.com/album/bhuj-the-pride-of-india/69481490/', 'playlist_mincount': 7, @@ -132,16 +178,24 @@ class HungamaAlbumPlaylistIE(InfoExtractor): }, }, { 'url': 'https://www.hungama.com/playlists/hindi-jan-to-june-2021/123063/', - 'playlist_mincount': 50, + 'playlist_mincount': 33, 'info_dict': { 'id': '123063', }, + }, { + 'url': 'https://un.hungama.com/album/what-jhumka-%3F-from-rocky-aur-rani-kii-prem-kahaani/103891805/', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '103891805', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - ptrn = r'<meta[^>]+?property=[\"\']?music:song:url[\"\']?[^>]+?content=[\"\']?([^\"\']+)' - items = re.findall(ptrn, webpage) - entries = [self.url_result(item, ie=HungamaSongIE.ie_key()) for item in items] - return self.playlist_result(entries, video_id) + playlist_id, path = self._match_valid_url(url).group('id', 'path') + data = self._call_api(remove_end(path, 's'), playlist_id, fatal=True) + + def entries(): + for song_url in traverse_obj(data, ('body', 'rows', ..., 'data', 'misc', 'share', {url_or_none})): + yield self.url_result(song_url, HungamaSongIE) + + return self.playlist_result(entries(), playlist_id) diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index c4965f9bce..5663a78a37 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -1,9 +1,10 @@ +import base64 import hashlib import random import re +import urllib.parse -from ..compat import compat_urlparse, compat_b64decode - +from .common import 
InfoExtractor from ..utils import ( ExtractorError, int_or_none, @@ -13,8 +14,6 @@ update_url_query, ) -from .common import InfoExtractor - class HuyaLiveIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)' @@ -34,7 +33,7 @@ class HuyaLiveIE(InfoExtractor): }, }, { 'url': 'https://www.huya.com/xiaoyugame', - 'only_matching': True + 'only_matching': True, }] _RESOLUTION = { @@ -48,8 +47,8 @@ class HuyaLiveIE(InfoExtractor): }, '流畅': { 'width': 800, - 'height': 480 - } + 'height': 480, + }, } def _real_extract(self, url): @@ -72,7 +71,7 @@ def _real_extract(self, url): continue stream_name = stream_info.get('sStreamName') re_secret = not screen_type and live_source_type in (0, 8, 13) - params = dict(compat_urlparse.parse_qsl(unescapeHTML(stream_info['sFlvAntiCode']))) + params = dict(urllib.parse.parse_qsl(unescapeHTML(stream_info['sFlvAntiCode']))) fm, ss = '', '' if re_secret: fm, ss = self.encrypt(params, stream_info, stream_name) @@ -129,6 +128,6 @@ def encrypt(self, params, stream_info, stream_name): 'uuid': int_or_none(ct % 1e7 * 1e6 % 0xffffffff), 't': '100', }) - fm = compat_b64decode(params['fm']).decode().split('_', 1)[0] + fm = base64.b64decode(params['fm']).decode().split('_', 1)[0] ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']])) return fm, ss diff --git a/yt_dlp/extractor/hypem.py b/yt_dlp/extractor/hypem.py index 54db7b3eb0..204a72e2d6 100644 --- a/yt_dlp/extractor/hypem.py +++ b/yt_dlp/extractor/hypem.py @@ -14,7 +14,7 @@ class HypemIE(InfoExtractor): 'uploader': 'BODYWORK', 'timestamp': 1371810457, 'upload_date': '20130621', - } + }, } def _real_extract(self, url): @@ -30,9 +30,9 @@ def _real_extract(self, url): title = track['song'] final_url = self._download_json( - 'http://hypem.com/serve/source/%s/%s' % (track_id, track['key']), + 'http://hypem.com/serve/source/{}/{}'.format(track_id, track['key']), track_id, 'Downloading metadata', headers={ - 'Content-Type': 'application/json' + 'Content-Type': 'application/json', })['url'] return { diff --git a/yt_dlp/extractor/hypergryph.py b/yt_dlp/extractor/hypergryph.py index 9ca6caebc8..1fb2e9a982 100644 --- a/yt_dlp/extractor/hypergryph.py +++ b/yt_dlp/extractor/hypergryph.py @@ -9,10 +9,10 @@ class MonsterSirenHypergryphMusicIE(InfoExtractor): 'info_dict': { 'id': '514562', 'ext': 'wav', - 'artist': ['塞壬唱片-MSR'], + 'artists': ['塞壬唱片-MSR'], 'album': 'Flame Shadow', 'title': 'Flame Shadow', - } + }, }] def _real_extract(self, url): @@ -27,6 +27,6 @@ def _real_extract(self, url): 'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')), 'ext': 'wav', 'vcodec': 'none', - 'artist': traverse_obj(json_data, ('player', 'songDetail', 'artists')), - 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')) + 'artists': traverse_obj(json_data, ('player', 'songDetail', 'artists', ...)), + 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')), } diff --git a/yt_dlp/extractor/hytale.py b/yt_dlp/extractor/hytale.py index 0f4dcc309b..6956c4e5d6 100644 --- a/yt_dlp/extractor/hytale.py +++ b/yt_dlp/extractor/hytale.py @@ -1,7 +1,8 @@ import re +from .cloudflarestream import CloudflareStreamIE from .common import InfoExtractor -from ..utils import traverse_obj +from ..utils.traversal import traverse_obj class HytaleIE(InfoExtractor): @@ -20,8 +21,8 @@ class HytaleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Avatar Personalization', 'thumbnail': r're:https://videodelivery\.net/\w+/thumbnails/thumbnail\.jpg', - } - }] + }, + }], }, { 'url': 
'https://www.hytale.com/news/2019/11/hytale-graphics-update', 'info_dict': { @@ -49,7 +50,7 @@ def _real_extract(self, url): entries = [ self.url_result( f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com', - title=self._titles.get(video_hash), url_transparent=True) + CloudflareStreamIE, title=self._titles.get(video_hash), url_transparent=True) for video_hash in re.findall( r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"', webpage) diff --git a/yt_dlp/extractor/icareus.py b/yt_dlp/extractor/icareus.py index d081cf42e2..3d6e1f94d4 100644 --- a/yt_dlp/extractor/icareus.py +++ b/yt_dlp/extractor/icareus.py @@ -65,19 +65,19 @@ class IcareusIE(InfoExtractor): }, }, { 'url': 'https://asahitv.fi/fi/web/asahi/player/vod?assetId=89415818', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://hyvinvointitv.fi/fi/web/hyvinvointitv/player/vod?assetId=89149730', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://inez.fi/fi/web/inez-media/player/vod?assetId=71328822', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.permanto.fi/fi/web/alfatv/player/vod?assetId=135497515', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://videos.minifiddlers.org/web/international-minifiddlers/player/vod?assetId=1982759', - 'only_matching': True + 'only_matching': True, }] def _real_extract(self, url): @@ -166,7 +166,7 @@ def _real_extract(self, url): } thumbnails = info.get('thumbnails') or [{ - 'url': url_or_none(info.get('thumbnail') or assets.get('thumbnail')) + 'url': url_or_none(info.get('thumbnail') or assets.get('thumbnail')), }] return merge_dicts({ diff --git a/yt_dlp/extractor/ichinanalive.py b/yt_dlp/extractor/ichinanalive.py index 9d55ddc021..a37cfe77bd 100644 --- a/yt_dlp/extractor/ichinanalive.py +++ b/yt_dlp/extractor/ichinanalive.py @@ -1,6 +1,5 @@ from .common import InfoExtractor from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate -from ..compat import compat_str class IchinanaLiveIE(InfoExtractor): @@ -27,21 +26,21 @@ class IchinanaLiveIE(InfoExtractor): @classmethod def suitable(cls, url): - return not IchinanaLiveClipIE.suitable(url) and super(IchinanaLiveIE, cls).suitable(url) + return not IchinanaLiveClipIE.suitable(url) and super().suitable(url) def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://17.live/live/%s' % video_id + url = f'https://17.live/live/{video_id}' enter = self._download_json( - 'https://api-dsa.17app.co/api/v1/lives/%s/enter' % video_id, video_id, + f'https://api-dsa.17app.co/api/v1/lives/{video_id}/enter', video_id, headers={'Referer': url}, fatal=False, expected_status=420, data=b'\0') if enter and enter.get('message') == 'ended': raise ExtractorError('This live has ended.', expected=True) view_data = self._download_json( - 'https://api-dsa.17app.co/api/v1/lives/%s' % video_id, video_id, + f'https://api-dsa.17app.co/api/v1/lives/{video_id}', video_id, headers={'Referer': url}) uploader = traverse_obj( @@ -52,7 +51,7 @@ def _real_extract(self, url): raise ExtractorError('unable to extract live URL information') formats = [] for (name, value) in video_urls[0].items(): - if not isinstance(value, compat_str): + if not isinstance(value, str): continue if not value.startswith('http'): continue @@ -106,10 +105,10 @@ class IchinanaLiveClipIE(InfoExtractor): def _real_extract(self, url): uploader_id, video_id = self._match_valid_url(url).groups() - url = 
'https://17.live/profile/r/%s/clip/%s' % (uploader_id, video_id) + url = f'https://17.live/profile/r/{uploader_id}/clip/{video_id}' view_data = self._download_json( - 'https://api-dsa.17app.co/api/v1/clips/%s' % video_id, video_id, + f'https://api-dsa.17app.co/api/v1/clips/{video_id}', video_id, headers={'Referer': url}) uploader = traverse_obj( diff --git a/yt_dlp/extractor/idolplus.py b/yt_dlp/extractor/idolplus.py new file mode 100644 index 0000000000..3c905b0712 --- /dev/null +++ b/yt_dlp/extractor/idolplus.py @@ -0,0 +1,115 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, try_call, url_or_none + + +class IdolPlusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?idolplus\.com/z[us]/(?:concert/|contents/?\?(?:[^#]+&)?albumId=)(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://idolplus.com/zs/contents?albumId=M012077298PPV00', + 'md5': '2ace3f4661c943a2f7e79f0b88cea1e7', + 'info_dict': { + 'id': 'M012077298PPV00', + 'ext': 'mp4', + 'title': '[MultiCam] Aegyo on Top of Aegyo (IZ*ONE EATING TRIP)', + 'release_date': '20200707', + 'formats': 'count:65', + }, + 'params': {'format': '532-KIM_MINJU'}, + }, { + 'url': 'https://idolplus.com/zs/contents?albumId=M01232H058PPV00&catId=E9TX5', + 'info_dict': { + 'id': 'M01232H058PPV00', + 'ext': 'mp4', + 'title': 'YENA (CIRCLE CHART MUSIC AWARDS 2022 RED CARPET)', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # live stream + 'url': 'https://idolplus.com/zu/contents?albumId=M012323174PPV00', + 'info_dict': { + 'id': 'M012323174PPV00', + 'ext': 'mp4', + 'title': 'Hanteo Music Awards 2022 DAY2', + 'release_date': '20230211', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://idolplus.com/zs/concert/M012323039PPV00', + 'info_dict': { + 'id': 'M012323039PPV00', + 'ext': 'mp4', + 'title': 'CIRCLE CHART MUSIC AWARDS 2022', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data_list = traverse_obj(self._download_json( + 'https://idolplus.com/api/zs/viewdata/ruleset/build', video_id, + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'rulesetId': 'contents', + 'albumId': video_id, + 'distribute': 'PRD', + 'loggedIn': 'false', + 'region': 'zs', + 'countryGroup': '00010', + 'lang': 'en', + 'saId': '999999999998', + }), ('data', 'viewData', ...)) + + player_data = {} + while data_list: + player_data = data_list.pop() + if traverse_obj(player_data, 'type') == 'player': + break + elif traverse_obj(player_data, ('dataList', ...)): + data_list += player_data['dataList'] + + formats = self._extract_m3u8_formats(traverse_obj(player_data, ( + 'vodPlayerList', 'vodProfile', 0, 'vodServer', 0, 'video_url', {url_or_none})), video_id) + + subtitles = {} + for caption in traverse_obj(player_data, ('vodPlayerList', 'caption')) or []: + subtitles.setdefault(caption.get('lang') or 'und', []).append({ + 'url': caption.get('smi_url'), + 'ext': 'vtt', + }) + + # Add member multicams as alternative formats + if (traverse_obj(player_data, ('detail', 'has_cuesheet')) == 'Y' + and traverse_obj(player_data, ('detail', 'is_omni_member')) == 'Y'): + cuesheet = traverse_obj(self._download_json( + 'https://idolplus.com/gapi/contents/v1.0/content/cuesheet', video_id, + 'Downloading JSON metadata for member multicams', + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'ALBUM_ID': video_id, + 'COUNTRY_GRP': '00010', + 
'LANG': 'en', + 'SA_ID': '999999999998', + 'COUNTRY_CODE': 'KR', + }), ('data', 'cuesheet_item', 0)) + + for member in traverse_obj(cuesheet, ('members', ...)): + index = try_call(lambda: int(member['omni_view_index']) - 1) + member_video_url = traverse_obj(cuesheet, ('omni_view', index, 'cdn_url', 0, 'url', {url_or_none})) + if not member_video_url: + continue + member_formats = self._extract_m3u8_formats( + member_video_url, video_id, note=f'Downloading m3u8 for multicam {member["name"]}') + for mf in member_formats: + mf['format_id'] = f'{mf["format_id"]}-{member["name"].replace(" ", "_")}' + formats.extend(member_formats) + + return { + 'id': video_id, + 'title': traverse_obj(player_data, ('detail', 'albumName')), + 'formats': formats, + 'subtitles': subtitles, + 'release_date': traverse_obj(player_data, ('detail', 'broadcastDate')), + } diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py index e4db7f9fa9..771c18501c 100644 --- a/yt_dlp/extractor/ign.py +++ b/yt_dlp/extractor/ign.py @@ -1,12 +1,11 @@ import re -import urllib.error +import urllib.parse from .common import InfoExtractor -from ..compat import compat_parse_qs +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, - error_to_compat_str, extract_attributes, int_or_none, merge_dicts, @@ -21,15 +20,15 @@ class IGNBaseIE(InfoExtractor): def _call_api(self, slug): return self._download_json( - 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + f'http://apis.ign.com/{self._PAGE_TYPE}/v3/{self._PAGE_TYPE}s/slug/{slug}', slug) def _checked_call_api(self, slug): try: return self._call_api(slug) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: e.cause.args = e.cause.args or [ - e.cause.geturl(), e.cause.getcode(), e.cause.reason] + e.cause.response.url, e.cause.status, e.cause.reason] raise ExtractorError( 'Content not found: expired?', cause=e.cause, expected=True) @@ -105,8 +104,7 @@ class IGNIE(IGNBaseIE): _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)' _PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?' 
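# --- Illustrative sketch (not part of this patch): the IGNBaseIE hunk above is
# one instance of the diff-wide migration from urllib.error.HTTPError and
# compat_HTTPError to yt-dlp's networking.exceptions.HTTPError, which renames
# e.cause.code/.getcode() to e.cause.status, e.cause.geturl() to
# e.cause.response.url, and e.cause.read() to e.cause.response.read().
# A minimal sketch of the new idiom, on a hypothetical extractor and endpoint:
from yt_dlp.extractor.common import InfoExtractor
from yt_dlp.networking.exceptions import HTTPError
from yt_dlp.utils import ExtractorError


class ExampleApiIE(InfoExtractor):  # hypothetical, for illustration only
    _VALID_URL = r'https?://(?:www\.)?example\.com/video/(?P<id>\w+)'

    def _checked_call_api(self, slug):
        try:
            # _download_json wraps transport failures in ExtractorError,
            # keeping the underlying HTTPError as .cause
            return self._download_json(f'https://api.example.com/v3/videos/{slug}', slug)
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                raise ExtractorError(
                    'Content not found: expired?', cause=e.cause, expected=True)
            raise
# ---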
_VALID_URL = ( - r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)' - % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE))) + r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:{})'.format('|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))) IE_NAME = 'ign.com' _PAGE_TYPE = 'video' @@ -151,10 +149,10 @@ def _extract_embed_urls(cls, url, webpage): grids = re.findall( r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''', webpage) - return filter(None, - (urljoin(url, m.group('path')) for m in re.finditer( - r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1''' - % cls._VIDEO_PATH_RE, grids[0] if grids else ''))) + return filter( + None, (urljoin(url, m.group('path')) for m in re.finditer( + rf'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos{cls._VIDEO_PATH_RE})\1''', + grids[0] if grids else ''))) def _real_extract(self, url): display_id, filt = self._match_valid_url(url).group('id', 'filt') @@ -196,10 +194,6 @@ class IGNVideoIE(IGNBaseIE): 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', 'duration': 298, 'tags': 'count:13', - 'display_id': '112203', - 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', - 'duration': 298, - 'tags': 'count:13', }, 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { @@ -226,8 +220,8 @@ def _real_extract(self, url): parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) webpage, urlh = self._download_webpage_handle(embed_url, video_id) - new_url = urlh.geturl() - ign_url = compat_parse_qs( + new_url = urlh.url + ign_url = urllib.parse.parse_qs( urllib.parse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) @@ -323,15 +317,15 @@ def _checked_call_api(self, slug): try: return self._call_api(slug) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError): + if isinstance(e.cause, HTTPError): e.cause.args = e.cause.args or [ - e.cause.geturl(), e.cause.getcode(), e.cause.reason] - if e.cause.code == 404: + e.cause.response.url, e.cause.status, e.cause.reason] + if e.cause.status == 404: raise ExtractorError( 'Content not found: expired?', cause=e.cause, expected=True) - elif e.cause.code == 503: - self.report_warning(error_to_compat_str(e.cause)) + elif e.cause.status == 503: + self.report_warning(str(e.cause)) return raise @@ -370,7 +364,7 @@ def entries(): flashvars = self._search_regex( r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', m.group('params'), 'flashvars', default='') - flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + flashvars = urllib.parse.parse_qs(extract_attributes(flashvars).get('value') or '') v_url = url_or_none((flashvars.get('url') or [None])[-1]) if v_url: yield self.url_result(v_url) diff --git a/yt_dlp/extractor/iheart.py b/yt_dlp/extractor/iheart.py index 2c6a5b6a1d..21870ca044 100644 --- a/yt_dlp/extractor/iheart.py +++ b/yt_dlp/extractor/iheart.py @@ -23,7 +23,7 @@ def _extract_episode(self, episode): class IHeartRadioIE(IHeartRadioBaseIE): - IENAME = 'iheartradio' + IE_NAME = 'iheartradio' _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)' _TEST = { 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', @@ -35,7 
+35,7 @@ class IHeartRadioIE(IHeartRadioBaseIE): 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', 'timestamp': 1597741200, 'upload_date': '20200818', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/ilpost.py b/yt_dlp/extractor/ilpost.py new file mode 100644 index 0000000000..2868f0c62c --- /dev/null +++ b/yt_dlp/extractor/ilpost.py @@ -0,0 +1,69 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class IlPostIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ilpost\.it/episodes/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.ilpost.it/episodes/1-avis-akvasas-ka/', + 'md5': '43649f002d85e1c2f319bb478d479c40', + 'info_dict': { + 'id': '2972047', + 'ext': 'mp3', + 'display_id': '1-avis-akvasas-ka', + 'title': '1. Avis akvasas ka', + 'url': 'https://www.ilpost.it/wp-content/uploads/2023/12/28/1703781217-l-invasione-pt1-v6.mp3', + 'timestamp': 1703835014, + 'upload_date': '20231229', + 'duration': 2495.0, + 'availability': 'public', + 'series_id': '235598', + 'description': '', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + endpoint_metadata = self._search_json( + r'var\s+ilpostpodcast\s*=', webpage, 'metadata', display_id) + episode_id = endpoint_metadata['post_id'] + podcast_id = endpoint_metadata['podcast_id'] + podcast_metadata = self._download_json( + endpoint_metadata['ajax_url'], display_id, data=urlencode_postdata({ + 'action': 'checkpodcast', + 'cookie': endpoint_metadata['cookie'], + 'post_id': episode_id, + 'podcast_id': podcast_id, + })) + + episode = traverse_obj(podcast_metadata, ( + 'data', 'postcastList', lambda _, v: str(v['id']) == episode_id, {dict}), get_all=False) + if not episode: + raise ExtractorError('Episode could not be extracted') + + return { + 'id': episode_id, + 'display_id': display_id, + 'series_id': podcast_id, + 'vcodec': 'none', + **traverse_obj(episode, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'url': ('podcast_raw_url', {url_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('timestamp', {int_or_none}), + 'duration': ('milliseconds', {functools.partial(float_or_none, scale=1000)}), + 'availability': ('free', {lambda v: 'public' if v else 'subscriber_only'}), + }), + } diff --git a/yt_dlp/extractor/iltalehti.py b/yt_dlp/extractor/iltalehti.py index 0e7e82c9c0..37aa47165e 100644 --- a/yt_dlp/extractor/iltalehti.py +++ b/yt_dlp/extractor/iltalehti.py @@ -47,5 +47,5 @@ def _real_extract(self, url): 'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties')))) video_ids = traverse_obj(props, (lambda _, v: v['provider'] == 'jwplayer', 'id')) return self.playlist_from_matches( - video_ids, article_id, ie='JWPlatform', getter=lambda id: f'jwplatform:{id}', + video_ids, article_id, ie='JWPlatform', getter=lambda video_id: f'jwplatform:{video_id}', title=traverse_obj(info, ('state', 'articles', ..., 'items', 'canonical_title'), get_all=False)) diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index 557a3b7b7b..a786ce32e8 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -28,7 +28,7 @@ class ImdbIE(InfoExtractor): 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', 'duration': 152, 'thumbnail': r're:^https?://.+\.jpg', - } + }, }, { 'url': 
'https://www.imdb.com/video/vi3516832537', 'info_dict': { @@ -38,7 +38,7 @@ class ImdbIE(InfoExtractor): 'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c', 'duration': 153, 'thumbnail': r're:^https?://.+\.jpg', - } + }, }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', 'only_matching': True, @@ -73,7 +73,7 @@ def _real_extract(self, url): 'key': base64.b64encode(json.dumps({ 'type': 'VIDEO_PLAYER', 'subType': 'FORCE_LEGACY', - 'id': 'vi%s' % video_id, + 'id': f'vi{video_id}', }).encode()).decode(), }), lambda x: x[0]['videoLegacyEncodings']) quality = qualities(('SD', '480p', '720p', '1080p')) @@ -132,7 +132,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') - for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)] + for m in re.findall(rf'href="(/list/ls{list_id}/videoplayer/vi[^"]+)"', webpage)] list_title = self._html_search_regex( r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>', diff --git a/yt_dlp/extractor/imggaming.py b/yt_dlp/extractor/imggaming.py index 8e220fd9f3..3a7b5bd5e2 100644 --- a/yt_dlp/extractor/imggaming.py +++ b/yt_dlp/extractor/imggaming.py @@ -1,7 +1,7 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -52,9 +52,9 @@ def _extract_dve_api_url(self, media_id, media_type): return self._call_api( stream_path, media_id)['playerUrlCallback'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: raise ExtractorError( - self._parse_json(e.cause.read().decode(), media_id)['messages'][0], + self._parse_json(e.cause.response.read().decode(), media_id)['messages'][0], expected=True) raise @@ -73,7 +73,7 @@ def _real_extract(self, url): if not video_id: continue entries.append(self.url_result( - 'https://%s/video/%s' % (domain, video_id), + f'https://{domain}/video/{video_id}', self.ie_key(), video_id)) return self.playlist_result( entries, media_id, playlist.get('title'), diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py index bff6ed57f5..f0c3419d49 100644 --- a/yt_dlp/extractor/imgur.py +++ b/yt_dlp/extractor/imgur.py @@ -1,99 +1,258 @@ +import functools import re from .common import InfoExtractor from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, int_or_none, js_to_json, mimetype2ext, - ExtractorError, + parse_iso8601, + str_or_none, + strip_or_none, + traverse_obj, + url_or_none, ) -class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)' +class ImgurBaseIE(InfoExtractor): + _CLIENT_ID = '546c25a59c58ad7' + + @classmethod + def _imgur_result(cls, item_id): + return cls.url_result(f'https://imgur.com/{item_id}', ImgurIE, item_id) + + def _call_api(self, endpoint, video_id, **kwargs): + return self._download_json( + f'https://api.imgur.com/post/v1/{endpoint}/{video_id}?client_id={self._CLIENT_ID}&include=media,account', + video_id, **kwargs) + + @staticmethod + def get_description(s): + if 'Discover the magic of the internet at Imgur' in s: + return None + return s or None + + +class ImgurIE(ImgurBaseIE): + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://i.imgur.com/A61SaA1.gifv', + 'url': 
'https://imgur.com/A61SaA1', 'info_dict': { 'id': 'A61SaA1', 'ext': 'mp4', - 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', + 'title': 'MRW gifv is up and running without any bugs', + 'timestamp': 1416446068, + 'upload_date': '20141120', + 'dislike_count': int, + 'comment_count': int, + 'release_timestamp': 1416446068, + 'release_date': '20141120', + 'like_count': int, + 'thumbnail': 'https://i.imgur.com/A61SaA1h.jpg', }, }, { - 'url': 'https://imgur.com/A61SaA1', + 'url': 'https://i.imgur.com/A61SaA1.gifv', 'only_matching': True, }, { 'url': 'https://i.imgur.com/crGpqCV.mp4', 'only_matching': True, }, { - # no title 'url': 'https://i.imgur.com/jxBXAMC.gifv', - 'only_matching': True, + 'info_dict': { + 'id': 'jxBXAMC', + 'ext': 'mp4', + 'title': 'Fahaka puffer feeding', + 'timestamp': 1533835503, + 'upload_date': '20180809', + 'release_date': '20180809', + 'like_count': int, + 'duration': 30.0, + 'comment_count': int, + 'release_timestamp': 1533835503, + 'thumbnail': 'https://i.imgur.com/jxBXAMCh.jpg', + 'dislike_count': int, + }, + }, { + # needs Accept header, ref: https://github.com/yt-dlp/yt-dlp/issues/9458 + 'url': 'https://imgur.com/zV03bd5', + 'md5': '59df97884e8ba76143ff6b640a0e2904', + 'info_dict': { + 'id': 'zV03bd5', + 'ext': 'mp4', + 'title': 'Ive - Liz', + 'timestamp': 1710491255, + 'upload_date': '20240315', + 'like_count': int, + 'dislike_count': int, + 'duration': 56.92, + 'comment_count': int, + 'release_timestamp': 1710491255, + 'release_date': '20240315', + }, }] def _real_extract(self, url): video_id = self._match_id(url) + data = self._call_api('media', video_id) + if not traverse_obj(data, ('media', 0, ( + ('type', {lambda t: t == 'video' or None}), + ('metadata', 'is_animated'))), get_all=False): + raise ExtractorError(f'{video_id} is not a video or animated image', expected=True) webpage = self._download_webpage( - 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) + f'https://i.imgur.com/{video_id}.gifv', video_id, fatal=False) or '' + formats = [] - width = int_or_none(self._og_search_property( - 'video:width', webpage, default=None)) - height = int_or_none(self._og_search_property( - 'video:height', webpage, default=None)) + media_fmt = traverse_obj(data, ('media', 0, { + 'url': ('url', {url_or_none}), + 'ext': ('ext', {str}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'acodec': ('metadata', 'has_sound', {lambda b: None if b else 'none'}), + })) + media_url = media_fmt.get('url') + if media_url: + if not media_fmt.get('ext'): + media_fmt['ext'] = mimetype2ext(traverse_obj( + data, ('media', 0, 'mime_type'))) or determine_ext(media_url) + if traverse_obj(data, ('media', 0, 'type')) == 'image': + media_fmt['acodec'] = 'none' + media_fmt.setdefault('preference', -10) + formats.append(media_fmt) video_elements = self._search_regex( r'(?s)<div class="video-elements">(.*?)</div>', webpage, 'video elements', default=None) - if not video_elements: - raise ExtractorError( - 'No sources found for video %s. Maybe an image?' 
% video_id, - expected=True) - formats = [] - for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements): - formats.append({ - 'format_id': m.group('type').partition('/')[2], - 'url': self._proto_relative_url(m.group('src')), - 'ext': mimetype2ext(m.group('type')), - 'width': width, - 'height': height, - 'http_headers': { - 'User-Agent': 'yt-dlp (like wget)', - }, - }) + if video_elements: + def og_get_size(media_type): + return { + p: int_or_none(self._og_search_property(f'{media_type}:{p}', webpage, default=None)) + for p in ('width', 'height') + } - gif_json = self._search_regex( - r'(?s)var\s+videoItem\s*=\s*(\{.*?\})', - webpage, 'GIF code', fatal=False) - if gif_json: - gifd = self._parse_json( - gif_json, video_id, transform_source=js_to_json) - formats.append({ - 'format_id': 'gif', - 'preference': -10, # gifs are worse than videos - 'width': width, - 'height': height, - 'ext': 'gif', - 'acodec': 'none', - 'vcodec': 'gif', - 'container': 'gif', - 'url': self._proto_relative_url(gifd['gifUrl']), - 'filesize': gifd.get('size'), - 'http_headers': { - 'User-Agent': 'yt-dlp (like wget)', - }, + size = og_get_size('video') + if not any(size.values()): + size = og_get_size('image') + + formats = traverse_obj( + re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements), + (..., { + 'format_id': ('type', {lambda s: s.partition('/')[2]}), + 'url': ('src', {self._proto_relative_url}), + 'ext': ('type', {mimetype2ext}), + })) + for f in formats: + f.update(size) + + # We can get the original gif format from the webpage as well + gif_json = traverse_obj(self._search_json( + r'var\s+videoItem\s*=', webpage, 'GIF info', video_id, + transform_source=js_to_json, fatal=False), { + 'url': ('gifUrl', {self._proto_relative_url}), + 'filesize': ('size', {int_or_none}), }) + if gif_json: + gif_json.update(size) + gif_json.update({ + 'format_id': 'gif', + 'preference': -10, # gifs < videos + 'ext': 'gif', + 'acodec': 'none', + 'vcodec': 'gif', + 'container': 'gif', + }) + formats.append(gif_json) + + search = functools.partial(self._html_search_meta, html=webpage, default=None) + + twitter_fmt = { + 'format_id': 'twitter', + 'url': url_or_none(search('twitter:player:stream')), + 'ext': mimetype2ext(search('twitter:player:stream:content_type')), + 'width': int_or_none(search('twitter:width')), + 'height': int_or_none(search('twitter:height')), + } + if twitter_fmt['url']: + formats.append(twitter_fmt) + + if not formats: + self.raise_no_formats( + f'No sources found for video {video_id}. 
Maybe a plain image?', expected=True) + self._remove_duplicate_formats(formats) return { + 'title': self._og_search_title(webpage, default=None), + 'description': self.get_description(self._og_search_description(webpage, default='')), + **traverse_obj(data, { + 'uploader_id': ('account_id', {lambda a: str(a) if int_or_none(a) else None}), + 'uploader': ('account', 'username', {lambda x: strip_or_none(x) or None}), + 'uploader_url': ('account', 'avatar_url', {url_or_none}), + 'like_count': ('upvote_count', {int_or_none}), + 'dislike_count': ('downvote_count', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + 'age_limit': ('is_mature', {lambda x: 18 if x else None}), + 'timestamp': (('updated_at', 'created_at'), {parse_iso8601}), + 'release_timestamp': ('created_at', {parse_iso8601}), + }, get_all=False), + **traverse_obj(data, ('media', 0, 'metadata', { + 'title': ('title', {lambda x: strip_or_none(x) or None}), + 'description': ('description', {self.get_description}), + 'duration': ('duration', {float_or_none}), + 'timestamp': (('updated_at', 'created_at'), {parse_iso8601}), + 'release_timestamp': ('created_at', {parse_iso8601}), + }), get_all=False), 'id': video_id, 'formats': formats, - 'title': self._og_search_title(webpage, default=video_id), + 'thumbnail': url_or_none(search('thumbnailUrl')), + 'http_headers': {'Accept': '*/*'}, } -class ImgurGalleryIE(InfoExtractor): +class ImgurGalleryBaseIE(ImgurBaseIE): + _GALLERY = True + + def _real_extract(self, url): + gallery_id = self._match_id(url) + + data = self._call_api('albums', gallery_id, fatal=False, expected_status=404) + + info = traverse_obj(data, { + 'title': ('title', {lambda x: strip_or_none(x) or None}), + 'description': ('description', {self.get_description}), + }) + + if traverse_obj(data, 'is_album'): + + items = traverse_obj(data, ( + 'media', lambda _, v: v.get('type') == 'video' or v['metadata']['is_animated'], + 'id', {lambda x: str_or_none(x) or None})) + + # if a gallery with exactly one video, apply album metadata to video + media_id = None + if self._GALLERY and len(items) == 1: + media_id = items[0] + + if not media_id: + result = self.playlist_result( + map(self._imgur_result, items), gallery_id) + result.update(info) + return result + gallery_id = media_id + + result = self._imgur_result(gallery_id) + info['_type'] = 'url_transparent' + result.update(info) + return result + + +class ImgurGalleryIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:gallery' - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/?#]+)/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', @@ -102,49 +261,121 @@ class ImgurGalleryIE(InfoExtractor): 'title': 'Adding faces make every GIF better', }, 'playlist_count': 25, + 'skip': 'Zoinks! 
You\'ve taken a wrong turn.', }, { + # TODO: static images - replace with animated/video gallery 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, }, { 'url': 'https://imgur.com/gallery/YcAQlkx', + 'add_ies': ['Imgur'], 'info_dict': { 'id': 'YcAQlkx', 'ext': 'mp4', 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - } + 'timestamp': 1358554297, + 'upload_date': '20130119', + 'uploader_id': '1648642', + 'uploader': 'wittyusernamehere', + 'release_timestamp': 1358554297, + 'thumbnail': 'https://i.imgur.com/YcAQlkxh.jpg', + 'release_date': '20130119', + 'uploader_url': 'https://i.imgur.com/u3R4I2S_d.png?maxwidth=290&fidelity=grand', + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, }, { + # TODO: static image - replace with animated/video gallery 'url': 'http://imgur.com/topic/Funny/N8rOudd', 'only_matching': True, }, { 'url': 'http://imgur.com/r/aww/VQcQPhM', - 'only_matching': True, + 'add_ies': ['Imgur'], + 'info_dict': { + 'id': 'VQcQPhM', + 'ext': 'mp4', + 'title': 'The boss is here', + 'timestamp': 1476494751, + 'upload_date': '20161015', + 'uploader_id': '19138530', + 'uploader': 'thematrixcam', + 'comment_count': int, + 'dislike_count': int, + 'uploader_url': 'https://i.imgur.com/qCjr5Pi_d.png?maxwidth=290&fidelity=grand', + 'release_timestamp': 1476494751, + 'like_count': int, + 'release_date': '20161015', + 'thumbnail': 'https://i.imgur.com/VQcQPhMh.jpg', + }, + }, + # from https://github.com/ytdl-org/youtube-dl/pull/16674 + { + 'url': 'https://imgur.com/t/unmuted/6lAn9VQ', + 'info_dict': { + 'id': '6lAn9VQ', + 'title': 'Penguins !', + }, + 'playlist_count': 3, + }, { + 'url': 'https://imgur.com/t/unmuted/kx2uD3C', + 'add_ies': ['Imgur'], + 'info_dict': { + 'id': 'ZVMv45i', + 'ext': 'mp4', + 'title': 'Intruder', + 'timestamp': 1528129683, + 'upload_date': '20180604', + 'release_timestamp': 1528129683, + 'release_date': '20180604', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'duration': 30.03, + 'thumbnail': 'https://i.imgur.com/ZVMv45ih.jpg', + }, + }, { + 'url': 'https://imgur.com/t/unmuted/wXSK0YH', + 'add_ies': ['Imgur'], + 'info_dict': { + 'id': 'JCAP4io', + 'ext': 'mp4', + 'title': 're:I got the blues$', + 'description': 'Luka’s vocal stylings.\n\nFP edit: don’t encourage me. 
I’ll never stop posting Luka and friends.', + 'timestamp': 1527809525, + 'upload_date': '20180531', + 'like_count': int, + 'dislike_count': int, + 'duration': 30.03, + 'comment_count': int, + 'release_timestamp': 1527809525, + 'thumbnail': 'https://i.imgur.com/JCAP4ioh.jpg', + 'release_date': '20180531', + }, }] - def _real_extract(self, url): - gallery_id = self._match_id(url) - data = self._download_json( - 'https://imgur.com/gallery/%s.json' % gallery_id, - gallery_id)['data']['image'] - - if data.get('is_album'): - entries = [ - self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) - for image in data['album_images']['images'] if image.get('hash')] - return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) - - return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) - - -class ImgurAlbumIE(ImgurGalleryIE): # XXX: Do not subclass from concrete IE +class ImgurAlbumIE(ImgurGalleryBaseIE): IE_NAME = 'imgur:album' _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' - + _GALLERY = False _TESTS = [{ + # TODO: only static images - replace with animated/video gallery 'url': 'http://imgur.com/a/j6Orj', + 'only_matching': True, + }, + # from https://github.com/ytdl-org/youtube-dl/pull/21693 + { + 'url': 'https://imgur.com/a/iX265HX', 'info_dict': { - 'id': 'j6Orj', - 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', + 'id': 'iX265HX', + 'title': 'enen-no-shouboutai', }, - 'playlist_count': 12, + 'playlist_count': 2, + }, { + 'url': 'https://imgur.com/a/8pih2Ed', + 'info_dict': { + 'id': '8pih2Ed', + }, + 'playlist_mincount': 1, }] diff --git a/yt_dlp/extractor/ina.py b/yt_dlp/extractor/ina.py index 857013df3e..ba8220176d 100644 --- a/yt_dlp/extractor/ina.py +++ b/yt_dlp/extractor/ina.py @@ -14,7 +14,7 @@ class InaIE(InfoExtractor): 'description': 'md5:19f61e2b4844ed4bb2e3df9ab9f527ff', 'upload_date': '20070712', 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/3c4/I12055569.jpeg', - } + }, }, { 'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', 'only_matching': True, diff --git a/yt_dlp/extractor/inc.py b/yt_dlp/extractor/inc.py index 9b3fe9ac14..f47b8e1ccf 100644 --- a/yt_dlp/extractor/inc.py +++ b/yt_dlp/extractor/inc.py @@ -54,4 +54,4 @@ def _real_extract(self, url): display_id)['vid_kaltura_id'] return self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) + f'kaltura:{partner_id}:{kaltura_id}', KalturaIE.ie_key()) diff --git a/yt_dlp/extractor/indavideo.py b/yt_dlp/extractor/indavideo.py index 4fa97d8bba..85e388e0d2 100644 --- a/yt_dlp/extractor/indavideo.py +++ b/yt_dlp/extractor/indavideo.py @@ -1,9 +1,9 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + time_seconds, update_url_query, ) @@ -11,15 +11,14 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - # http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - 
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] + # https://indavideo.hu/video/Vicces_cica_1 + # https://index.indavideo.hu/video/Hod_Nemetorszagban + # https://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # https://film.indavideo.hu/video/f_farkaslesen + # https://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ - 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', + 'url': 'https://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', @@ -36,21 +35,33 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'], }, }, { - 'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', - 'only_matching': True, - }, { - 'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1', + 'url': 'https://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://indavideo.hu/video/Vicces_cica_1', + 'info_dict': { + 'id': '1335611', + 'ext': 'mp4', + 'title': 'Vicces cica', + 'description': 'Játszik a tablettel. :D', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Jet_Pack', + 'uploader_id': '491217', + 'timestamp': 1390821212, + 'upload_date': '20140127', + 'duration': 7, + 'age_limit': 0, + 'tags': ['cica', 'Jet_Pack'], + }, + }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, - video_id)['data'] - - title = video['title'] + f'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/{video_id}/', + video_id, query={'_': time_seconds()})['data'] video_urls = [] @@ -60,33 +71,21 @@ def _real_extract(self, url): elif isinstance(video_files, dict): video_urls.extend(video_files.values()) - video_file = video.get('video_file') - if video: - video_urls.append(video_file) video_urls = list(set(video_urls)) - video_prefix = video_urls[0].rsplit('/', 1)[0] - - for flv_file in video.get('flv_files', []): - flv_url = '%s/%s' % (video_prefix, flv_file) - if flv_url not in video_urls: - video_urls.append(flv_url) - - filesh = video.get('filesh') + filesh = video.get('filesh') or {} formats = [] for video_url in video_urls: height = int_or_none(self._search_regex( r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) - if filesh: - if not height: - continue - token = filesh.get(compat_str(height)) - if token is None: - continue - video_url = update_url_query(video_url, {'token': token}) + if not height and len(filesh) == 1: + height = int_or_none(next(iter(filesh.keys()))) + token = filesh.get(str(height)) + if token is None: + continue formats.append({ - 'url': video_url, + 'url': update_url_query(video_url, {'token': token}), 'height': height, }) @@ -96,14 +95,14 @@ def _real_extract(self, url): timestamp = parse_iso8601(timestamp + ' +0200', ' ') thumbnails = [{ - 'url': self._proto_relative_url(thumbnail) + 'url': self._proto_relative_url(thumbnail), } for thumbnail in video.get('thumbnails', [])] tags = [tag['title'] for tag in video.get('tags') or []] return { 'id': video.get('id') or video_id, - 'title': title, + 'title': video.get('title'), 'description': video.get('description'), 'thumbnails': 
thumbnails, 'uploader': video.get('user_name'), diff --git a/yt_dlp/extractor/infoq.py b/yt_dlp/extractor/infoq.py index 192bcfe35d..5274c9339f 100644 --- a/yt_dlp/extractor/infoq.py +++ b/yt_dlp/extractor/infoq.py @@ -1,15 +1,13 @@ -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, - compat_urlparse, -) +import base64 +import urllib.parse + +from .bokecc import BokeCCBaseIE from ..utils import ( ExtractorError, determine_ext, - update_url_query, traverse_obj, + update_url_query, ) -from .bokecc import BokeCCBaseIE class InfoQIE(BokeCCBaseIE): @@ -59,7 +57,7 @@ def _extract_rtmp_video(self, webpage): encoded_id = self._search_regex( r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None) - real_id = compat_urllib_parse_unquote(compat_b64decode(encoded_id).decode('utf-8')) + real_id = urllib.parse.unquote(base64.b64decode(encoded_id).decode('utf-8')) playpath = 'mp4:' + real_id return [{ @@ -98,7 +96,7 @@ def _extract_http_audio(self, webpage, video_id): # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. - http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = urllib.parse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 02335138f1..754f710ae2 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -3,13 +3,14 @@ import json import re import time -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, decode_base_n, encode_base_n, + filter_dict, float_or_none, format_field, get_element_by_attribute, @@ -25,9 +26,9 @@ _ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' -def _pk_to_id(id): +def _pk_to_id(media_id): """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" - return encode_base_n(int(id.split('_')[0]), table=_ENCODING_CHARS) + return encode_base_n(int(media_id.split('_')[0]), table=_ENCODING_CHARS) def _id_to_pk(shortcode): @@ -112,7 +113,7 @@ def _extract_nodes(self, nodes, is_direct=False): 'height': self._get_dimension('height', node), 'http_headers': { 'Referer': 'https://www.instagram.com/', - } + }, } elif not video_id: continue @@ -147,25 +148,25 @@ def _extract_product_media(self, product_media): return {} formats = [{ - 'format_id': format.get('id'), - 'url': format.get('url'), - 'width': format.get('width'), - 'height': format.get('height'), + 'format_id': fmt.get('id'), + 'url': fmt.get('url'), + 'width': fmt.get('width'), + 'height': fmt.get('height'), 'vcodec': vcodec, - } for format in videos_list or []] + } for fmt in videos_list or []] if dash_manifest_raw: formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash')) thumbnails = [{ 'url': thumbnail.get('url'), 'width': thumbnail.get('width'), - 'height': thumbnail.get('height') + 'height': thumbnail.get('height'), } for thumbnail in traverse_obj(product_media, ('image_versions2', 'candidates')) or []] return { 'id': media_id, 'duration': float_or_none(product_media.get('video_duration')), 'formats': formats, - 'thumbnails': thumbnails 
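# --- Illustrative sketch (not part of this patch): a recurring pattern in this
# diff (hungama, ilpost, imgur, instagram) replaces ad-hoc .get() chains with
# declarative traverse_obj() mappings, where each output key maps to a
# (path..., {converter}) spec and keys whose value resolves to None are dropped
# from the result. A self-contained example over made-up API data:
from yt_dlp.utils import int_or_none, url_or_none
from yt_dlp.utils.traversal import traverse_obj

api_data = {  # hypothetical API response, for illustration only
    'detail': {'albumName': 'Example Album', 'duration': '185'},
    'media': [{'url': 'https://cdn.example.com/v.m3u8', 'width': 'n/a'}],
}
info = traverse_obj(api_data, {
    'title': ('detail', 'albumName', {str}),            # type filter: keep only str
    'duration': ('detail', 'duration', {int_or_none}),  # '185' -> 185
    'url': ('media', 0, 'url', {url_or_none}),
    'width': ('media', 0, 'width', {int_or_none}),      # 'n/a' -> None -> omitted
})
assert info == {
    'title': 'Example Album',
    'duration': 185,
    'url': 'https://cdn.example.com/v.m3u8',
}
# ---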
+ 'thumbnails': thumbnails, } def _extract_product(self, product_info): @@ -187,7 +188,7 @@ def _extract_product(self, product_info): '__post_extractor': self.extract_comments(_pk_to_id(product_info.get('pk'))), 'http_headers': { 'Referer': 'https://www.instagram.com/', - } + }, } carousel_media = product_info.get('carousel_media') if carousel_media: @@ -203,7 +204,7 @@ def _extract_product(self, product_info): return { **info_dict, - **self._extract_product_media(product_info) + **self._extract_product_media(product_info), } def _get_comments(self, video_id): @@ -245,7 +246,7 @@ class InstagramIOSIE(InfoExtractor): 'comment_count': int, 'comments': list, }, - 'add_ie': ['Instagram'] + 'add_ie': ['Instagram'], }] def _real_extract(self, url): @@ -254,7 +255,7 @@ def _real_extract(self, url): class InstagramIE(InstagramBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reels?(?!/audio/))/(?P<id>[^/?#&]+))' _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1'] _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', @@ -378,6 +379,9 @@ class InstagramIE(InstagramBaseIE): }, { 'url': 'https://www.instagram.com/marvelskies.fc/reel/CWqAgUZgCku/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reels/Cop84x6u7CP/', + 'only_matching': True, }] @classmethod @@ -442,14 +446,14 @@ def _real_extract(self, url): shared_data = self._search_json( r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {} - if shared_data and self._LOGIN_URL not in urlh.geturl(): + if shared_data and self._LOGIN_URL not in urlh.url: media.update(traverse_obj( shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) else: self.report_warning('Main webpage is locked behind the login page. 
Retrying with embed webpage (some metadata might be missing).') webpage = self._download_webpage( - f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) or '' additional_data = self._search_json( r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False) if not additional_data and not media: @@ -516,7 +520,7 @@ def _real_extract(self, url): return { 'id': video_id, 'formats': formats, - 'title': media.get('title') or 'Video by %s' % username, + 'title': media.get('title') or f'Video by {username}', 'description': description, 'duration': float_or_none(media.get('video_duration')), 'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none), @@ -530,7 +534,7 @@ def _real_extract(self, url): 'thumbnails': thumbnails, 'http_headers': { 'Referer': 'https://www.instagram.com/', - } + }, } @@ -563,10 +567,10 @@ def _extract_graphql(self, data, url): gis_tmpls = [self._gis_tmpl] else: gis_tmpls = [ - '%s' % rhx_gis, + f'{rhx_gis}', '', - '%s:%s' % (rhx_gis, csrf_token), - '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']), + f'{rhx_gis}:{csrf_token}', + '{}:{}:{}'.format(rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']), ] # try all of the ways to generate a GIS query, and not only use the @@ -575,10 +579,10 @@ def _extract_graphql(self, data, url): try: json_data = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, - 'Downloading JSON page %d' % page_num, headers={ + f'Downloading JSON page {page_num}', headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-Instagram-GIS': hashlib.md5( - ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), + (f'{gis_tmpl}:{variables}').encode()).hexdigest(), }, query={ 'query_hash': self._QUERY_HASH, 'variables': variables, @@ -589,7 +593,7 @@ def _extract_graphql(self, data, url): except ExtractorError as e: # if it's an error caused by a bad query, and there are # more GIS templates to try, ignore it and keep trying - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: if gis_tmpl != gis_tmpls[-1]: continue raise @@ -616,6 +620,7 @@ def _real_extract(self, url): class InstagramUserIE(InstagramPlaylistBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' @@ -630,10 +635,10 @@ class InstagramUserIE(InstagramPlaylistBaseIE): 'extract_flat': True, 'skip_download': True, 'playlistend': 5, - } + }, }] - _QUERY_HASH = '42323d64886122307be10013ad2dcc44', + _QUERY_HASH = ('42323d64886122307be10013ad2dcc44',) @staticmethod def _parse_timeline_from(data): @@ -645,7 +650,7 @@ def _query_vars_for(data): # returns a dictionary of variables to add to the timeline query based # on the GraphQL of the original page return { - 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'], } @@ -664,10 +669,10 @@ class InstagramTagIE(InstagramPlaylistBaseIE): 'extract_flat': True, 'skip_download': True, 'playlistend': 50, - } + }, }] - _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', + _QUERY_HASH = ('f92f56d47dc7a55b606908374b43a314',) @staticmethod def _parse_timeline_from(data): @@ -680,7 +685,7 @@ def _query_vars_for(data): # on the GraphQL of the original page 
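# Sketch of the X-Instagram-GIS signing attempted by _extract_graphql above:
# each candidate GIS template is joined to the JSON query variables with a
# colon and md5-hashed. All values below are dummy placeholders.
import hashlib
import json

def gis_signature(gis_tmpl, variables):
    return hashlib.md5(f'{gis_tmpl}:{variables}'.encode()).hexdigest()

rhx_gis, csrf_token, user_agent = 'dummy-gis', 'dummy-csrf', 'Mozilla/5.0'
variables = json.dumps({'id': '123', 'first': 12})
for tmpl in (rhx_gis, '', f'{rhx_gis}:{csrf_token}', f'{rhx_gis}:{csrf_token}:{user_agent}'):
    print(tmpl or '(empty)', '->', gis_signature(tmpl, variables))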
return { 'tag_name': - data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] + data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'], } @@ -694,7 +699,7 @@ class InstagramStoryIE(InstagramBaseIE): 'id': '18090946048123978', 'title': 'Rare', }, - 'playlist_mincount': 50 + 'playlist_mincount': 50, }] def _real_extract(self, url): @@ -703,28 +708,31 @@ def _real_extract(self, url): user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) if not user_info: self.raise_login_required('This content is unreachable') - user_id = user_info.get('id') + user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str) story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' + if not story_info_url: # user id is only mandatory for non-highlights + raise ExtractorError('Unable to extract user id') + videos = traverse_obj(self._download_json( f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') if not videos: self.raise_login_required('You need to log in to access this content') - full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name')) + full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name')) story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) if not story_title: story_title = f'Story by {username}' - highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) + highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items')) info_data = [] for highlight in highlights: highlight_data = self._extract_product(highlight) if highlight_data.get('formats'): info_data.append({ - **highlight_data, 'uploader': full_name, 'uploader_id': user_id, + **filter_dict(highlight_data), }) return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) diff --git a/yt_dlp/extractor/internazionale.py b/yt_dlp/extractor/internazionale.py index 1b1cb574ad..3c3ad7ea30 100644 --- a/yt_dlp/extractor/internazionale.py +++ b/yt_dlp/extractor/internazionale.py @@ -52,8 +52,8 @@ def _real_extract(self, url): 'video available aboard', default='1', group='value') video_available_abroad = video_available_abroad == '1' - video_base = 'https://video%s.internazionale.it/%s/%s.' 
% \ - ('' if video_available_abroad else '-ita', video_path, video_id) + video_base = 'https://video{}.internazionale.it/{}/{}.'.format( + '' if video_available_abroad else '-ita', video_path, video_id) formats = self._extract_m3u8_formats( video_base + 'm3u8', display_id, 'mp4', diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 181820542c..ab26dc5efe 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -3,11 +3,12 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, determine_ext, js_to_json, + parse_qs, + traverse_obj, urlencode_postdata, - ExtractorError, - parse_qs ) @@ -15,8 +16,7 @@ class IPrimaIE(InfoExtractor): _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False _NETRC_MACHINE = 'iprima' - _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login' - _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token' + _AUTH_ROOT = 'https://auth.iprima.cz' access_token = None _TESTS = [{ @@ -67,7 +67,7 @@ def _perform_login(self, username, password): return login_page = self._download_webpage( - self._LOGIN_URL, None, note='Downloading login page', + f'{self._AUTH_ROOT}/oauth2/login', None, note='Downloading login page', errnote='Downloading login page failed') login_form = self._hidden_inputs(login_page) @@ -76,11 +76,20 @@ def _perform_login(self, username, password): '_email': username, '_password': password}) - _, login_handle = self._download_webpage_handle( - self._LOGIN_URL, None, data=urlencode_postdata(login_form), + profile_select_html, login_handle = self._download_webpage_handle( + f'{self._AUTH_ROOT}/oauth2/login', None, data=urlencode_postdata(login_form), note='Logging in') - code = parse_qs(login_handle.geturl()).get('code')[0] + # a profile may need to be selected first, even when there is only a single one + if '/profile-select' in login_handle.url: + profile_id = self._search_regex( + r'data-identifier\s*=\s*["\']?(\w+)', profile_select_html, 'profile id') + + login_handle = self._request_webpage( + f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None, + query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile') + + code = traverse_obj(login_handle.url, ({parse_qs}, 'code', 0)) if not code: raise ExtractorError('Login failed', expected=True) @@ -89,10 +98,10 @@ def _perform_login(self, username, password): 'client_id': 'prima_sso', 'grant_type': 'authorization_code', 'code': code, - 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'} + 'redirect_uri': f'{self._AUTH_ROOT}/sso/auth-check'} token_data = self._download_json( - self._TOKEN_URL, None, + f'{self._AUTH_ROOT}/oauth2/token', None, note='Downloading token', errnote='Downloading token failed', data=urlencode_postdata(token_request_data)) @@ -115,14 +124,29 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) - title = self._html_search_meta( + title = self._html_extract_title(webpage) or self._html_search_meta( ['og:title', 'twitter:title'], webpage, 'title', default=None) video_id = self._search_regex(( r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', - r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'), - webpage, 'real id', group='id') + r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1', + ), webpage, 'real id', group='id', default=None) + + if not video_id: + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data', fatal=False) + video_id = traverse_obj( + nuxt_data, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False) + + if 
not video_id: + nuxt_data = self._search_json( + r'<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]*>', + webpage, 'nuxt data', None, end_pattern=r'</script>', contains_pattern=r'\[(?s:.+)\]') + + video_id = traverse_obj(nuxt_data, lambda _, v: re.fullmatch(r'p\d+', v), get_all=False) + + if not video_id: + self.raise_no_formats('Unable to extract video ID from webpage') metadata = self._download_json( f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play', @@ -176,8 +200,8 @@ class IPrimaCNNIE(InfoExtractor): 'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e', }, 'params': { - 'skip_download': 'm3u8' - } + 'skip_download': 'm3u8', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index 4443b1991a..735b44637c 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -2,22 +2,18 @@ import itertools import re import time +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_parse_unquote -) from .openload import PhantomJSwrapper from ..utils import ( + ExtractorError, clean_html, decode_packed_codes, - ExtractorError, float_or_none, format_field, - get_element_by_id, get_element_by_attribute, + get_element_by_id, int_or_none, js_to_json, ohdave_rsa_encrypt, @@ -34,7 +30,7 @@ def md5_text(text): - return hashlib.md5(text.encode('utf-8')).hexdigest() + return hashlib.md5(text.encode()).hexdigest() class IqiyiSDK: @@ -45,17 +41,17 @@ def __init__(self, target, ip, timestamp): @staticmethod def split_sum(data): - return compat_str(sum(map(lambda p: int(p, 16), list(data)))) + return str(sum(int(p, 16) for p in data)) @staticmethod def digit_sum(num): if isinstance(num, int): - num = compat_str(num) - return compat_str(sum(map(int, num))) + num = str(num) + return str(sum(map(int, num))) def even_odd(self): - even = self.digit_sum(compat_str(self.timestamp)[::2]) - odd = self.digit_sum(compat_str(self.timestamp)[1::2]) + even = self.digit_sum(str(self.timestamp)[::2]) + odd = self.digit_sum(str(self.timestamp)[1::2]) return even, odd def preprocess(self, chunksize): @@ -69,7 +65,7 @@ def preprocess(self, chunksize): def mod(self, modulus): chunks, ip = self.preprocess(32) - self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip)) + self.target = chunks[0] + ''.join(str(p % modulus) for p in ip) def split(self, chunksize): modulus_map = { @@ -81,7 +77,7 @@ def split(self, chunksize): chunks, ip = self.preprocess(chunksize) ret = '' for i in range(len(chunks)): - ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else '' + ip_part = str(ip[i] % modulus_map[chunksize]) if i < 4 else '' if chunksize == 8: ret += ip_part + chunks[i] else: @@ -108,11 +104,11 @@ def date(self, scheme): self.target = md5_text(self.target) d = time.localtime(self.timestamp) strings = { - 'y': compat_str(d.tm_year), + 'y': str(d.tm_year), 'm': '%02d' % d.tm_mon, 'd': '%02d' % d.tm_mday, } - self.target += ''.join(map(lambda c: strings[c], list(scheme))) + self.target += ''.join(strings[c] for c in scheme) def split_time_even_odd(self): even, odd = self.even_odd() @@ -124,11 +120,11 @@ def split_time_odd_even(self): def split_ip_time_sum(self): chunks, ip = self.preprocess(32) - self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp) + self.target = str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp) def split_time_ip_sum(self): chunks, ip = self.preprocess(32) - self.target = 
self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip)) + self.target = self.digit_sum(self.timestamp) + chunks[0] + str(sum(ip)) class IqiyiSDKInterpreter: @@ -161,7 +157,7 @@ def run(self, target, ip, timestamp): elif function in other_functions: other_functions[function]() else: - raise ExtractorError('Unknown function %s' % function) + raise ExtractorError(f'Unknown function {function}') return sdk.target @@ -181,7 +177,7 @@ class IqiyiIE(InfoExtractor): 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', 'ext': 'mp4', 'title': '美国德州空中惊现奇异云团 酷似UFO', - } + }, }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', 'md5': 'b7dc800a4004b1b57749d9abae0472da', @@ -253,8 +249,9 @@ def _perform_login(self, username, password): note='Get token for logging', errnote='Unable to get token for logging') sdk = data['sdk'] timestamp = int(time.time()) - target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % ( - username, self._rsa_fun(password.encode('utf-8'))) + target = ( + f'/apis/reglogin/login.action?lang=zh_TW&area_code=null&email={username}' + f'&passwd={self._rsa_fun(password.encode())}&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1') interp = IqiyiSDKInterpreter(sdk) sign = interp.run(target, data['ip'], timestamp) @@ -268,7 +265,7 @@ def _perform_login(self, username, password): 'bird_t': timestamp, } validation_result = self._download_json( - 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None, + 'http://kylin.iqiyi.com/validate?' + urllib.parse.urlencode(validation_params), None, note='Validate credentials', errnote='Unable to validate credentials') MSG_MAP = { @@ -280,7 +277,7 @@ def _perform_login(self, username, password): if code != 'A00000': msg = MSG_MAP.get(code) if not msg: - msg = 'error %s' % code + msg = f'error {code}' if validation_result.get('msg'): msg += ': ' + validation_result['msg'] self.report_warning('unable to log in: ' + msg) @@ -292,7 +289,7 @@ def get_raw_data(self, tvid, video_id): tm = int(time.time() * 1000) key = 'd5fb4bd9d50c4be6948c97edd7254b0e' - sc = md5_text(compat_str(tm) + key + tvid) + sc = md5_text(str(tm) + key + tvid) params = { 'tvid': tvid, 'vid': video_id, @@ -302,7 +299,7 @@ def get_raw_data(self, tvid, video_id): } return self._download_json( - 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + f'http://cache.m.iqiyi.com/jp/tmts/{tvid}/{video_id}/', video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), query=params, headers=self.geo_verification_headers()) @@ -325,10 +322,10 @@ def _extract_playlist(self, webpage): # Start from 2 because links in the first page are already on webpage for page_num in itertools.count(2): pagelist_page = self._download_webpage( - 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE), + f'http://cache.video.qiyi.com/jp/avlist/{album_id}/{page_num}/{PAGE_SIZE}/', album_id, - note='Download playlist page %d' % page_num, - errnote='Failed to download playlist page %d' % page_num) + note=f'Download playlist page {page_num}', + errnote=f'Failed to download playlist page {page_num}') pagelist = self._parse_json( remove_start(pagelist_page, 'var tvInfoJs='), album_id) vlist = pagelist['data']['vlist'] @@ -371,7 +368,7 @@ def _real_extract(self, url): for stream in data['vidl']: if 'm3utx' not in stream: continue - vd = compat_str(stream['vd']) + vd = str(stream['vd']) formats.append({ 'url': stream['m3utx'], 
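# Sketch of the `sc` request signature computed in get_raw_data above: an
# md5 over the millisecond timestamp + static key + tvid. The tvid value
# below is a dummy placeholder; the rest of the query dict is elided by
# the hunk boundaries above.
import hashlib
import time

def iqiyi_sc(tvid, key='d5fb4bd9d50c4be6948c97edd7254b0e'):
    tm = int(time.time() * 1000)
    return tm, hashlib.md5((str(tm) + key + tvid).encode()).hexdigest()

tm, sc = iqiyi_sc('1234567890')  # both values go into the tmts query string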
'format_id': vd, @@ -420,11 +417,11 @@ class IqIE(InfoExtractor): 'params': { 'format': '500', }, - 'expected_warnings': ['format is restricted'] + 'expected_warnings': ['format is restricted'], }, { # VIP-restricted video 'url': 'https://www.iq.com/play/mermaid-in-the-fog-2021-gbdpx13bs4', - 'only_matching': True + 'only_matching': True, }] _BID_TAGS = { '100': '240P', @@ -440,12 +437,14 @@ class IqIE(InfoExtractor): '1': 'zh_CN', '2': 'zh_TW', '3': 'en', - '4': 'kor', + '4': 'ko', + '5': 'ja', '18': 'th', '21': 'my', '23': 'vi', '24': 'id', '26': 'es', + '27': 'pt', '28': 'ar', } @@ -497,9 +496,10 @@ class IqIE(InfoExtractor): 'tm': tm, 'qdy': 'a', 'qds': 0, - 'k_ft1': 141287244169348, - 'k_ft4': 34359746564, - 'k_ft5': 1, + 'k_ft1': '143486267424900', + 'k_ft4': '1572868', + 'k_ft7': '4', + 'k_ft5': '1', 'bop': JSON.stringify({ 'version': '10.0', 'dfp': dfp @@ -525,16 +525,24 @@ def _extract_vms_player_js(self, webpage, video_id): if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( - r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) + r'<script src="((?:https?:)?//stc\.iqiyipic\.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS') + webpack_map = self._search_json( r'["\']\s*\+\s*', webpack_js, 'JS locations', video_id, contains_pattern=r'{\s*(?:\d+\s*:\s*["\'][\da-f]+["\']\s*,?\s*)+}', end_pattern=r'\[\w+\]\+["\']\.js', transform_source=js_to_json) + replacement_map = self._search_json( + r'["\']\s*\+\(\s*', webpack_js, 'replacement map', video_id, + contains_pattern=r'{\s*(?:\d+\s*:\s*["\'][\w.-]+["\']\s*,?\s*)+}', + end_pattern=r'\[\w+\]\|\|\w+\)\+["\']\.', transform_source=js_to_json, + fatal=False) or {} + for module_index in reversed(webpack_map): + real_module = replacement_map.get(module_index) or module_index module_js = self._download_webpage( - f'https://stc.iqiyipic.com/_next/static/chunks/{module_index}.{webpack_map[module_index]}.js', + f'https://stc.iqiyipic.com/_next/static/chunks/{real_module}.{webpack_map[module_index]}.js', video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' if 'vms request' in module_js: self.cache.store('iq', 'player_js', module_js) @@ -555,7 +563,7 @@ def _update_bid_tags(self, webpage, video_id): return self._BID_TAGS = { bid: traverse_obj(extracted_bid_tags, (bid, 'value'), expected_type=str, default=self._BID_TAGS.get(bid)) - for bid in extracted_bid_tags.keys() + for bid in extracted_bid_tags } def _get_cookie(self, name, default=None): @@ -573,7 +581,7 @@ def _real_extract(self, url): uid = traverse_obj( self._parse_json( - self._get_cookie('I00002', '{}'), video_id, transform_source=compat_urllib_parse_unquote, fatal=False), + self._get_cookie('I00002', '{}'), video_id, transform_source=urllib.parse.unquote, fatal=False), ('data', 'uid'), default=0) if uid: @@ -583,7 +591,7 @@ def _real_extract(self, url): 'platformId': 3, 'modeCode': self._get_cookie('mod', 'intl'), 'langCode': self._get_cookie('lang', 'en_us'), - 'deviceId': self._get_cookie('QC005', '') + 'deviceId': self._get_cookie('QC005', ''), }, fatal=False) ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none) else: @@ -614,7 +622,7 @@ def _real_extract(self, url): preview_time = traverse_obj( initial_format_data, 
('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False) if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none): - self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds')) + self.report_warning('This preview video is limited{}'.format(format_field(preview_time, None, ' to %s seconds'))) # TODO: Extract audio-only formats for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none)): @@ -665,7 +673,7 @@ def _real_extract(self, url): f.update({ 'quality': qualities(list(self._BID_TAGS.keys()))(bid), 'format_note': self._BID_TAGS[bid], - **parse_resolution(video_format.get('scrsz')) + **parse_resolution(video_format.get('scrsz')), }) formats.extend(extracted_formats) @@ -673,7 +681,7 @@ def _real_extract(self, url): lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name')) subtitles.setdefault(lang, []).extend([{ 'ext': format_ext, - 'url': urljoin(initial_format_data.get('dstl', 'http://meta.video.iqiyi.com'), sub_format[format_key]) + 'url': urljoin(initial_format_data.get('dstl', 'http://meta.video.iqiyi.com'), sub_format[format_key]), } for format_key, format_ext in [('srt', 'srt'), ('webvtt', 'vtt')] if sub_format.get(format_key)]) extra_metadata = page_data.get('albumInfo') if video_info.get('albumId') and page_data.get('albumInfo') else video_info @@ -702,9 +710,9 @@ class IqAlbumIE(InfoExtractor): 'info_dict': { 'id': '1bk9icvr331', 'title': 'One Piece', - 'description': 'Subtitle available on Sunday 4PM(GMT+8).' + 'description': 'Subtitle available on Sunday 4PM(GMT+8).', }, - 'playlist_mincount': 238 + 'playlist_mincount': 238, }, { # Movie/single video 'url': 'https://www.iq.com/album/九龙城寨-2021-22yjnij099k', @@ -721,7 +729,7 @@ class IqAlbumIE(InfoExtractor): 'age_limit': 13, 'average_rating': float, }, - 'expected_warnings': ['format is restricted'] + 'expected_warnings': ['format is restricted'], }] def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', lang_code='en_us'): @@ -734,7 +742,7 @@ def _entries(self, album_id_num, page_ranges, album_id=None, mode_code='intl', l 'modeCode': mode_code, 'langCode': lang_code, 'endOrder': page_range['to'], - 'startOrder': page_range['from'] + 'startOrder': page_range['from'], }) for video in page['data']['epg']: yield self.url_result('https://www.iq.com/play/%s' % (video.get('playLocSuffix') or video['qipuIdStr']), @@ -747,7 +755,7 @@ def _real_extract(self, url): album_data = next_data['props']['initialState']['album']['videoAlbumInfo'] if album_data.get('videoType') == 'singleVideo': - return self.url_result('https://www.iq.com/play/%s' % album_id, IqIE.ie_key()) + return self.url_result(f'https://www.iq.com/play/{album_id}', IqIE.ie_key()) return self.playlist_result( self._entries(album_data['albumId'], album_data['totalPageRange'], album_id, traverse_obj(next_data, ('props', 'initialProps', 'pageProps', 'modeCode')), diff --git a/yt_dlp/extractor/islamchannel.py b/yt_dlp/extractor/islamchannel.py index 253a846b7a..f70c3add47 100644 --- a/yt_dlp/extractor/islamchannel.py +++ b/yt_dlp/extractor/islamchannel.py @@ -14,7 +14,7 @@ class IslamChannelIE(InfoExtractor): 'description': 'md5:5cc7ddecef064ea7afe52eb5e0e33b55', 'thumbnail': r're:https?://.+', 'ext': 'mp4', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/israelnationalnews.py b/yt_dlp/extractor/israelnationalnews.py index 
35040f576a..76e54d9cf9 100644 --- a/yt_dlp/extractor/israelnationalnews.py +++ b/yt_dlp/extractor/israelnationalnews.py @@ -7,7 +7,7 @@ class IsraelNationalNewsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.israelnationalnews.com/news/354520', 'info_dict': { - 'id': '354520' + 'id': '354520', }, 'playlist': [{ 'info_dict': { @@ -34,8 +34,8 @@ class IsraelNationalNewsIE(InfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCJdKr0Bgd_5saZYqLCa9mng', 'upload_date': '20220606', 'uploader': 'The Rubin Report', - } - }] + }, + }], }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/itprotv.py b/yt_dlp/extractor/itprotv.py index 4ac12603ae..71001c4692 100644 --- a/yt_dlp/extractor/itprotv.py +++ b/yt_dlp/extractor/itprotv.py @@ -1,19 +1,18 @@ import re from .common import InfoExtractor - from ..utils import ( int_or_none, str_or_none, traverse_obj, - urljoin + urljoin, ) class ITProTVBaseIE(InfoExtractor): _ENDPOINTS = { 'course': 'course?url={}&brand=00002560-0000-3fa9-0000-1d61000035f3', - 'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}' + 'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}', } def _call_api(self, ep, item_id, webpage): @@ -31,7 +30,7 @@ def _check_if_logged_in(self, webpage): class ITProTVIE(ITProTVBaseIE): - _VALID_URL = r'https://app.itpro.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)' + _VALID_URL = r'https?://app\.itpro\.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv', 'md5': 'bca4a28c2667fd1a63052e71a94bb88c', @@ -47,7 +46,7 @@ class ITProTVIE(ITProTVBaseIE): 'availability': 'needs_auth', 'chapter': 'ITProTV 101', 'chapter_number': 1, - 'chapter_id': '5dbb3de426b46c0010b5d1b6' + 'chapter_id': '5dbb3de426b46c0010b5d1b6', }, }, { @@ -65,7 +64,7 @@ class ITProTVIE(ITProTVBaseIE): 'availability': 'needs_auth', 'chapter': 'Job Development', 'chapter_number': 2, - 'chapter_id': '5f7c78d424330c000edf04d9' + 'chapter_id': '5f7c78d424330c000edf04d9', }, }] @@ -96,13 +95,13 @@ def _real_extract(self, url): 'chapter_number': chapter_number, 'chapter_id': str_or_none(chapter.get('id')), 'subtitles': { - 'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}] + 'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}], } if episode.get('enCaptionData') else None, } class ITProTVCourseIE(ITProTVBaseIE): - _VALID_URL = r'https?://app.itpro.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])' + _VALID_URL = r'https?://app\.itpro\.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])' _TESTS = [ { 'url': 'https://app.itpro.tv/course/guided-tour', @@ -111,16 +110,16 @@ class ITProTVCourseIE(ITProTVBaseIE): 'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e', 'title': 'ITProTV 101', }, - 'playlist_count': 6 + 'playlist_count': 6, }, { 'url': 'https://app.itpro.tv/course/beyond-tech', 'info_dict': { 'id': 'beyond-tech', 'description': 'md5:44cd99855e7f81a15ce1269bd0621fed', - 'title': 'Beyond Tech' + 'title': 'Beyond Tech', }, - 'playlist_count': 15 + 'playlist_count': 15, }, ] diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py index 9ac7be3074..89e6f189cb 100644 --- a/yt_dlp/extractor/itv.py +++ b/yt_dlp/extractor/itv.py @@ -1,23 +1,21 @@ import json -from .common import InfoExtractor from .brightcove import BrightcoveNewIE - -from ..compat import compat_str +from .common import InfoExtractor from ..utils import ( + JSON_LD_RE, + ExtractorError, base_url, clean_html, determine_ext, extract_attributes, - ExtractorError, get_element_by_class, - JSON_LD_RE, merge_dicts, 
parse_duration, smuggle_url, try_get, - url_or_none, url_basename, + url_or_none, urljoin, ) @@ -35,7 +33,7 @@ class ITVIE(InfoExtractor): 'series': 'Plebs', 'season_number': 1, 'episode_number': 1, - 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002' + 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002', }, 'params': { # m3u8 download @@ -51,7 +49,7 @@ class ITVIE(InfoExtractor): 'series': 'The Jonathan Ross Show', 'episode_number': 8, 'season_number': 17, - 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002' + 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002', }, 'params': { # m3u8 download @@ -84,7 +82,7 @@ def _call_api(self, video_id, playlist_url, headers, platform_tag, featureset, f 'user': { 'itvUserId': '', 'entitlements': [], - 'token': '' + 'token': '', }, 'device': { 'manufacturer': 'Safari', @@ -92,20 +90,20 @@ def _call_api(self, video_id, playlist_url, headers, platform_tag, featureset, f 'os': { 'name': 'Windows NT', 'version': '6.1', - 'type': 'desktop' - } + 'type': 'desktop', + }, }, 'client': { 'version': '4.1', - 'id': 'browser' + 'id': 'browser', }, 'variantAvailability': { 'featureset': { 'min': featureset, - 'max': featureset + 'max': featureset, }, - 'platformTag': platform_tag - } + 'platformTag': platform_tag, + }, }).encode(), headers=headers, fatal=fatal) def _get_subtitles(self, video_id, variants, ios_playlist_url, headers, *args, **kwargs): @@ -137,7 +135,7 @@ def _real_extract(self, url): params = extract_attributes(self._search_regex( r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) variants = self._parse_json( - try_get(params, lambda x: x['data-video-variants'], compat_str) or '{}', + try_get(params, lambda x: x['data-video-variants'], str) or '{}', video_id, fatal=False) # Prefer last matching featureset # See: https://github.com/yt-dlp/yt-dlp/issues/986 @@ -186,7 +184,7 @@ def _real_extract(self, url): break thumbnails = [] - thumbnail_url = try_get(params, lambda x: x['data-video-posterframe'], compat_str) + thumbnail_url = try_get(params, lambda x: x['data-video-posterframe'], str) if thumbnail_url: thumbnails.extend([{ 'url': thumbnail_url.format(width=1920, height=1080, quality=100, blur=0, bg='false'), @@ -194,7 +192,7 @@ def _real_extract(self, url): 'height': 1080, }, { 'url': urljoin(base_url(thumbnail_url), url_basename(thumbnail_url)), - 'preference': -2 + 'preference': -2, }]) thumbnail_url = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None) @@ -211,7 +209,7 @@ def _real_extract(self, url): 'subtitles': self.extract_subtitles(video_id, variants, ios_playlist_url, headers), 'duration': parse_duration(video_data.get('Duration')), 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), - 'thumbnails': thumbnails + 'thumbnails': thumbnails, }, info) @@ -228,9 +226,9 @@ class ITVBTCCIE(InfoExtractor): 'url': 'https://www.itv.com/news/2021-10-27/i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike', 'info_dict': { 'id': 'i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike', - 'title': 'md5:6ef054dd9f069330db3dcc66cb772d32' + 'title': 'md5:6ef054dd9f069330db3dcc66cb772d32', }, - 'playlist_count': 4 + 'playlist_count': 4, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' @@ -255,7 +253,7 @@ def _real_extract(self, url): # ITV does not like some GB IP ranges, so here are some # IP blocks it accepts 'geo_ip_blocks': [ - 
'193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21', ], 'referrer': url, }), diff --git a/yt_dlp/extractor/ivi.py b/yt_dlp/extractor/ivi.py index 96220bea9c..57c276a673 100644 --- a/yt_dlp/extractor/ivi.py +++ b/yt_dlp/extractor/ivi.py @@ -82,31 +82,31 @@ def _real_extract(self, url): 'params': [ video_id, { 'site': 's%d', - 'referrer': 'http://www.ivi.ru/watch/%s' % video_id, - 'contentid': video_id - } - ] + 'referrer': f'http://www.ivi.ru/watch/{video_id}', + 'contentid': video_id, + }, + ], }) for site in (353, 183): content_data = (data % site).encode() if site == 353: - if not Cryptodome: + if not Cryptodome.CMAC: continue timestamp = (self._download_json( self._LIGHT_URL, video_id, 'Downloading timestamp JSON', data=json.dumps({ 'method': 'da.timestamp.get', - 'params': [] + 'params': [], }).encode(), fatal=False) or {}).get('result') if not timestamp: continue query = { 'ts': timestamp, - 'sign': Cryptodome.Hash.CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, - Cryptodome.Cipher.Blowfish).hexdigest(), + 'sign': Cryptodome.CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, + Cryptodome.Blowfish).hexdigest(), } else: query = {} @@ -126,7 +126,7 @@ def _real_extract(self, url): extractor_msg = 'Video %s does not exist' elif site == 353: continue - elif not Cryptodome: + elif not Cryptodome.CMAC: raise ExtractorError('pycryptodomex not found. Please install', expected=True) elif message: extractor_msg += ': ' + message @@ -158,7 +158,7 @@ def _real_extract(self, url): compilation = result.get('compilation') episode = title if compilation else None - title = '%s - %s' % (compilation, title) if compilation is not None else title + title = f'{compilation} - {title}' if compilation is not None else title thumbnails = [{ 'url': preview['url'], @@ -219,9 +219,9 @@ class IviCompilationIE(InfoExtractor): def _extract_entries(self, html, compilation_id): return [ self.url_result( - 'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key()) + f'http://www.ivi.ru/watch/{compilation_id}/{serie}', IviIE.ie_key()) for serie in re.findall( - r'<a\b[^>]+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)] + rf'<a\b[^>]+\bhref=["\']/watch/{compilation_id}/(\d+)["\']', html)] def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -230,8 +230,8 @@ def _real_extract(self, url): if season_id is not None: # Season link season_page = self._download_webpage( - url, compilation_id, 'Downloading season %s web page' % season_id) - playlist_id = '%s/season%s' % (compilation_id, season_id) + url, compilation_id, f'Downloading season {season_id} web page') + playlist_id = f'{compilation_id}/season{season_id}' playlist_title = self._html_search_meta('title', season_page, 'title') entries = self._extract_entries(season_page, compilation_id) else: # Compilation link @@ -239,15 +239,15 @@ def _real_extract(self, url): playlist_id = compilation_id playlist_title = self._html_search_meta('title', compilation_page, 'title') seasons = re.findall( - r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page) + rf'<a href="/watch/{compilation_id}/season(\d+)', compilation_page) if not seasons: # No seasons in this compilation entries = self._extract_entries(compilation_page, compilation_id) else: entries = [] for season_id in seasons: season_page = self._download_webpage( - 'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id), - compilation_id, 'Downloading season %s web page' % season_id) 
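# Sketch of the light-API `sign` parameter built further up in this ivi.py
# patch: a Blowfish-CMAC (via pycryptodomex) over the server timestamp plus
# the JSON-RPC payload. Key and payload below are dummy placeholders.
from Cryptodome.Cipher import Blowfish
from Cryptodome.Hash import CMAC

def ivi_sign(light_key, timestamp, content_data):
    return CMAC.new(light_key, timestamp.encode() + content_data,
                    ciphermod=Blowfish).hexdigest()

print(ivi_sign(b'0123456789abcdef', '1700000000', b'{"method": "da.content.get"}'))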
+ f'http://www.ivi.ru/watch/{compilation_id}/season{season_id}', + compilation_id, f'Downloading season {season_id} web page') entries.extend(self._extract_entries(season_page, compilation_id)) return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/yt_dlp/extractor/ivideon.py b/yt_dlp/extractor/ivideon.py index 7d1e554c27..eb860c7a6c 100644 --- a/yt_dlp/extractor/ivideon.py +++ b/yt_dlp/extractor/ivideon.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils import qualities @@ -21,7 +19,7 @@ class IvideonIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru', 'only_matching': True, @@ -36,8 +34,8 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) server_id, camera_id = mobj.group('id'), mobj.group('camera_id') camera_name, description = None, None - camera_url = compat_urlparse.urljoin( - url, '/tv/camera/%s/%s/' % (server_id, camera_id)) + camera_url = urllib.parse.urljoin( + url, f'/tv/camera/{server_id}/{camera_id}/') webpage = self._download_webpage(camera_url, server_id, fatal=False) if webpage: @@ -57,12 +55,12 @@ def _real_extract(self, url): quality = qualities(self._QUALITIES) formats = [{ - 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({ + 'url': 'https://streaming.ivideon.com/flv/live?{}'.format(urllib.parse.urlencode({ 'server': server_id, 'camera': camera_id, 'sessionId': 'demo', 'q': quality(format_id), - }), + })), 'format_id': format_id, 'ext': 'flv', 'quality': quality(format_id), diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index ec3e59c6d0..5b5c367ad8 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -1,239 +1,297 @@ -import itertools -import re +import functools +import hashlib +import json +import time import urllib.parse from .common import InfoExtractor from ..utils import ( + ExtractorError, + OnDemandPagedList, int_or_none, + jwt_decode_hs256, mimetype2ext, - remove_end, - strip_or_none, - unified_strdate, - url_or_none, - urljoin, + qualities, + traverse_obj, + try_call, + unified_timestamp, ) class IwaraBaseIE(InfoExtractor): - _BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)' + _NETRC_MACHINE = 'iwara' + _USERTOKEN = None + _MEDIATOKEN = None - def _extract_playlist(self, base_url, webpage): - for path in re.findall(r'class="title">\s*<a[^<]+href="([^"]+)', webpage): - yield self.url_result(urljoin(base_url, path)) + def _is_token_expired(self, token, token_type): + # User token TTL == ~3 weeks, Media token TTL == ~1 hour + if (try_call(lambda: jwt_decode_hs256(token)['exp']) or 0) <= int(time.time() - 120): + self.to_screen(f'{token_type} token has expired') + return True + + def _get_user_token(self): + username, password = self._get_login_info() + if not username or not password: + return + + user_token = IwaraBaseIE._USERTOKEN or self.cache.load(self._NETRC_MACHINE, username) + if not user_token or self._is_token_expired(user_token, 'User'): + response = self._download_json( + 'https://api.iwara.tv/user/login', None, note='Logging in', + headers={'Content-Type': 'application/json'}, data=json.dumps({ + 'email': username, + 'password': password, + }).encode(), expected_status=lambda x: True) + user_token = traverse_obj(response, ('token', {str})) + if not user_token: + error = traverse_obj(response, 
('message', {str})) + if 'invalidLogin' in error: + raise ExtractorError('Invalid login credentials', expected=True) + else: + raise ExtractorError(f'Iwara API said: {error or "nothing"}') + + self.cache.store(self._NETRC_MACHINE, username, user_token) + + IwaraBaseIE._USERTOKEN = user_token + + def _get_media_token(self): + self._get_user_token() + if not IwaraBaseIE._USERTOKEN: + return # user has not passed credentials + + if not IwaraBaseIE._MEDIATOKEN or self._is_token_expired(IwaraBaseIE._MEDIATOKEN, 'Media'): + IwaraBaseIE._MEDIATOKEN = self._download_json( + 'https://api.iwara.tv/user/token', None, note='Fetching media token', + data=b'', headers={ + 'Authorization': f'Bearer {IwaraBaseIE._USERTOKEN}', + 'Content-Type': 'application/json', + })['accessToken'] + + return {'Authorization': f'Bearer {IwaraBaseIE._MEDIATOKEN}'} + + def _perform_login(self, username, password): + self._get_media_token() class IwaraIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)' + IE_NAME = 'iwara' + _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', - # md5 is unstable + 'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq', 'info_dict': { - 'id': 'amVwUl1EHpAD9RD', + 'id': 'k2ayoueezfkx6gvq', 'ext': 'mp4', - 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, - 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png', - 'uploader': 'Reimu丨Action', - 'upload_date': '20150828', - 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f', + 'title': 'Defeat of Irybelda - アイリベルダの敗北', + 'description': 'md5:70278abebe706647a8b4cb04cf23e0d3', + 'uploader': 'Inwerwm', + 'uploader_id': 'inwerwm', + 'tags': 'count:1', + 'like_count': 6133, + 'view_count': 1050343, + 'comment_count': 1, + 'timestamp': 1677843869, + 'modified_timestamp': 1679056362, + }, + 'skip': 'this video cannot be played because of migration', + }, { + 'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/', + 'md5': '7645f966f069b8ec9210efd9130c9aad', + 'info_dict': { + 'id': '1ywe1sbkqwumpdxz5', + 'ext': 'mp4', + 'age_limit': 18, + 'title': 'Aponia アポニア SEX Party Tonight 手の脱衣 巨乳 ', + 'description': 'md5:3f60016fff22060eef1ef26d430b1f67', + 'uploader': 'Lyu ya', + 'uploader_id': 'user792540', + 'tags': [ + 'uncategorized', + ], + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'timestamp': 1678732213, + 'modified_timestamp': int, + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/581d12b5-46f4-4f15-beb2-cfe2cde5d13d/thumbnail-00.jpg', + 'modified_date': '20230614', + 'upload_date': '20230313', }, }, { - 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', - 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0', + 'url': 'https://iwara.tv/video/blggmfno8ghl725bg', 'info_dict': { - 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', - 'ext': 'mp4', - 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4', - 'age_limit': 18, - }, - 'add_ie': ['GoogleDrive'], - }, { - 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', - # md5 is unstable - 'info_dict': { - 'id': '6liAP9s2Ojc', + 'id': 'blggmfno8ghl725bg', 'ext': 'mp4', 'age_limit': 18, - 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', - 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', - 'upload_date': '20160910', - 'uploader': 'aMMDsork', - 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A', + 'title': 'お外でおしっこしちゃう猫耳ロリメイド', + 'description': 'md5:0342ba9bf6db09edbbb28729657c3611', + 
'uploader': 'Fe_Kurosabi', + 'uploader_id': 'fekurosabi', + 'tags': [ + 'pee', + ], + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'timestamp': 1598880567, + 'modified_timestamp': int, + 'upload_date': '20200831', + 'modified_date': '20230605', + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/7693e881-d302-42a4-a780-f16d66b5dadd/thumbnail-00.jpg', + # 'availability': 'needs_auth', }, - 'add_ie': ['Youtube'], }] + def _extract_formats(self, video_id, fileurl): + up = urllib.parse.urlparse(fileurl) + q = urllib.parse.parse_qs(up.query) + paths = up.path.rstrip('/').split('/') + # https://github.com/yt-dlp/yt-dlp/issues/6549#issuecomment-1473771047 + x_version = hashlib.sha1('_'.join((paths[-1], q['expires'][0], '5nFp9kmbNnHdAFhaqMvt')).encode()).hexdigest() + + preference = qualities(['preview', '360', '540', 'Source']) + + files = self._download_json(fileurl, video_id, headers={'X-Version': x_version}) + for fmt in files: + yield traverse_obj(fmt, { + 'format_id': 'name', + 'url': ('src', ('view', 'download'), {self._proto_relative_url}), + 'ext': ('type', {mimetype2ext}), + 'quality': ('name', {preference}), + 'height': ('name', {int_or_none}), + }, get_all=False) + def _real_extract(self, url): video_id = self._match_id(url) + username, _ = self._get_login_info() + video_data = self._download_json( + f'https://api.iwara.tv/video/{video_id}', video_id, + expected_status=lambda x: True, headers=self._get_media_token()) + errmsg = video_data.get('message') + # at this point we can actually get uploaded user info, but do we need it? + if errmsg == 'errors.privateVideo': + self.raise_login_required('Private video. Login if you have permissions to watch', method='password') + elif errmsg == 'errors.notFound' and not username: + self.raise_login_required('Video may need login to view', method='password') + elif errmsg: # None if success + raise ExtractorError(f'Iwara says: {errmsg}') - webpage, urlh = self._download_webpage_handle(url, video_id) - - hostname = urllib.parse.urlparse(urlh.geturl()).hostname - # ecchi is 'sexy' in Japanese - age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 - - video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id) - - if not video_data: - iframe_url = self._html_search_regex( - r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', - webpage, 'iframe URL', group='url') - return { - '_type': 'url_transparent', - 'url': iframe_url, - 'age_limit': age_limit, - } - - title = remove_end(self._html_extract_title(webpage), ' | Iwara') - - thumbnail = self._html_search_regex( - r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) - - uploader = self._html_search_regex( - r'class="username">([^<]+)', webpage, 'uploader', fatal=False) - - upload_date = unified_strdate(self._html_search_regex( - r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False)) - - description = strip_or_none(self._search_regex( - r'<p>(.+?(?=</div))', webpage, 'description', fatal=False, - flags=re.DOTALL)) - - formats = [] - for a_format in video_data: - format_uri = url_or_none(a_format.get('uri')) - if not format_uri: - continue - format_id = a_format.get('resolution') - height = int_or_none(self._search_regex( - r'(\d+)p', format_id, 'height', default=None)) - formats.append({ - 'url': self._proto_relative_url(format_uri, 'https:'), - 'format_id': format_id, - 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', - 'height': height, - 'width': int_or_none(height / 9.0 * 16.0 if height else None), - 'quality': 1 if format_id == 
'Source' else 0, - }) + if not video_data.get('fileUrl'): + if video_data.get('embedUrl'): + return self.url_result(video_data.get('embedUrl')) + raise ExtractorError('This video is unplayable', expected=True) return { 'id': video_id, - 'title': title, - 'age_limit': age_limit, - 'formats': formats, - 'thumbnail': self._proto_relative_url(thumbnail, 'https:'), - 'uploader': uploader, - 'upload_date': upload_date, - 'description': description, - } - - -class IwaraPlaylistIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)' - IE_NAME = 'iwara:playlist' - - _TESTS = [{ - 'url': 'https://ecchi.iwara.tv/playlist/best-enf', - 'info_dict': { - 'title': 'Best enf', - 'uploader': 'Jared98112', - 'id': 'best-enf', - }, - 'playlist_mincount': 1097, - }, { - # urlencoded - 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2', - 'info_dict': { - 'id': 'プレイリスト-2', - 'title': 'プレイリスト', - 'uploader': 'mainyu', - }, - 'playlist_mincount': 91, - }] - - def _real_extract(self, url): - playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') - playlist_id = urllib.parse.unquote(playlist_id) - webpage = self._download_webpage(url, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False), - 'uploader': self._html_search_regex(r'<h2>([^<]+)', webpage, 'uploader', fatal=False), - 'entries': self._extract_playlist(base_url, webpage), + 'age_limit': 18 if video_data.get('rating') == 'ecchi' else 0, # ecchi is 'sexy' in Japanese + **traverse_obj(video_data, { + 'title': 'title', + 'description': 'body', + 'uploader': ('user', 'name'), + 'uploader_id': ('user', 'username'), + 'tags': ('tags', ..., 'id'), + 'like_count': 'numLikes', + 'view_count': 'numViews', + 'comment_count': 'numComments', + 'timestamp': ('createdAt', {unified_timestamp}), + 'modified_timestamp': ('updatedAt', {unified_timestamp}), + 'thumbnail': ('file', 'id', {str}, { + lambda x: f'https://files.iwara.tv/image/thumbnail/{x}/thumbnail-00.jpg'}), + }), + 'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))), } class IwaraUserIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)' IE_NAME = 'iwara:user' + _PER_PAGE = 32 _TESTS = [{ - 'note': 'number of all videos page is just 1 page. less than 40 videos', - 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', + 'url': 'https://iwara.tv/profile/user792540/videos', 'info_dict': { - 'title': 'Uploaded videos from Infinity_YukariP', - 'id': 'infinityyukarip', - 'uploader': 'Infinity_YukariP', - 'uploader_id': 'infinityyukarip', + 'id': 'user792540', + 'title': 'Lyu ya', }, - 'playlist_mincount': 39, + 'playlist_mincount': 70, }, { - 'note': 'no even all videos page. probably less than 10 videos', - 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', + 'url': 'https://iwara.tv/profile/theblackbirdcalls/videos', 'info_dict': { - 'title': 'Uploaded videos from mmd quintet', - 'id': 'mmd-quintet', - 'uploader': 'mmd quintet', - 'uploader_id': 'mmd-quintet', - }, - 'playlist_mincount': 6, - }, { - 'note': 'has paging. 
more than 40 videos', - 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', - 'info_dict': { - 'title': 'Uploaded videos from TheBlackbirdCalls', 'id': 'theblackbirdcalls', - 'uploader': 'TheBlackbirdCalls', - 'uploader_id': 'theblackbirdcalls', + 'title': 'TheBlackbirdCalls', }, - 'playlist_mincount': 420, + 'playlist_mincount': 723, }, { - 'note': 'foreign chars in URL. there must be foreign characters in URL', - 'url': 'https://ecchi.iwara.tv/users/ぶた丼', + 'url': 'https://iwara.tv/profile/user792540', + 'only_matching': True, + }, { + 'url': 'https://iwara.tv/profile/theblackbirdcalls', + 'only_matching': True, + }, { + 'url': 'https://www.iwara.tv/profile/lumymmd', 'info_dict': { - 'title': 'Uploaded videos from ぶた丼', - 'id': 'ぶた丼', - 'uploader': 'ぶた丼', - 'uploader_id': 'ぶた丼', + 'id': 'lumymmd', + 'title': 'Lumy MMD', }, - 'playlist_mincount': 170, + 'playlist_mincount': 1, }] - def _entries(self, playlist_id, base_url): - webpage = self._download_webpage( - f'{base_url}/users/{playlist_id}', playlist_id) - videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None) - if not videos_url: - yield from self._extract_playlist(base_url, webpage) - return - - videos_url = urljoin(base_url, videos_url) - - for n in itertools.count(1): - page = self._download_webpage( - videos_url, playlist_id, note=f'Downloading playlist page {n}', - query={'page': str(n - 1)} if n > 1 else {}) - yield from self._extract_playlist( - base_url, page) - - if f'page={n}' not in page: - break + def _entries(self, playlist_id, user_id, page): + videos = self._download_json( + 'https://api.iwara.tv/videos', playlist_id, + note=f'Downloading page {page}', + query={ + 'page': page, + 'sort': 'date', + 'user': user_id, + 'limit': self._PER_PAGE, + }, headers=self._get_media_token()) + for x in traverse_obj(videos, ('results', ..., 'id')): + yield self.url_result(f'https://iwara.tv/video/{x}') def _real_extract(self, url): - playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') - playlist_id = urllib.parse.unquote(playlist_id) + playlist_id = self._match_id(url) + user_info = self._download_json( + f'https://api.iwara.tv/profile/{playlist_id}', playlist_id, + note='Requesting user info') + user_id = traverse_obj(user_info, ('user', 'id')) return self.playlist_result( - self._entries(playlist_id, base_url), playlist_id) + OnDemandPagedList( + functools.partial(self._entries, playlist_id, user_id), + self._PER_PAGE), + playlist_id, traverse_obj(user_info, ('user', 'name'))) + + +class IwaraPlaylistIE(IwaraBaseIE): + _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)' + IE_NAME = 'iwara:playlist' + _PER_PAGE = 32 + + _TESTS = [{ + 'url': 'https://iwara.tv/playlist/458e5486-36a4-4ac0-b233-7e9eef01025f', + 'info_dict': { + 'id': '458e5486-36a4-4ac0-b233-7e9eef01025f', + }, + 'playlist_mincount': 3, + }] + + def _entries(self, playlist_id, first_page, page): + videos = self._download_json( + 'https://api.iwara.tv/videos', playlist_id, f'Downloading page {page}', + query={'page': page, 'limit': self._PER_PAGE}, + headers=self._get_media_token()) if page else first_page + for x in traverse_obj(videos, ('results', ..., 'id')): + yield self.url_result(f'https://iwara.tv/video/{x}') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + page_0 = self._download_json( + f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id, + note='Requesting playlist info', 
headers=self._get_media_token()) + + return self.playlist_result( + OnDemandPagedList( + functools.partial(self._entries, playlist_id, page_0), + self._PER_PAGE), + playlist_id, traverse_obj(page_0, ('title', 'name'))) diff --git a/yt_dlp/extractor/ixigua.py b/yt_dlp/extractor/ixigua.py index 1f086d2bdc..2868c2fc7c 100644 --- a/yt_dlp/extractor/ixigua.py +++ b/yt_dlp/extractor/ixigua.py @@ -29,7 +29,7 @@ class IxiguaIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.(avif|webp)', 'timestamp': 1629088414, 'duration': 1030, - } + }, }] def _get_json_data(self, webpage, video_id): diff --git a/yt_dlp/extractor/izlesene.py b/yt_dlp/extractor/izlesene.py index 5cdf8709dc..cf2a269c38 100644 --- a/yt_dlp/extractor/izlesene.py +++ b/yt_dlp/extractor/izlesene.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) from ..utils import ( determine_ext, float_or_none, @@ -33,7 +31,7 @@ class IzleseneIE(InfoExtractor): 'upload_date': '20140702', 'duration': 95.395, 'age_limit': 0, - } + }, }, { 'url': 'http://www.izlesene.com/video/tarkan-dortmund-2006-konseri/17997', @@ -48,14 +46,14 @@ class IzleseneIE(InfoExtractor): 'upload_date': '20061112', 'duration': 253.666, 'age_limit': 0, - } + }, }, ] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id) + webpage = self._download_webpage(f'http://www.izlesene.com/video/{video_id}', video_id) video = self._parse_json( self._search_regex( @@ -67,14 +65,14 @@ def _real_extract(self, url): formats = [] for stream in video['media']['level']: source_url = stream.get('source') - if not source_url or not isinstance(source_url, compat_str): + if not source_url or not isinstance(source_url, str): continue ext = determine_ext(url, 'mp4') quality = stream.get('value') height = int_or_none(quality) formats.append({ - 'format_id': '%sp' % quality if quality else 'sd', - 'url': compat_urllib_parse_unquote(source_url), + 'format_id': f'{quality}p' if quality else 'sd', + 'url': urllib.parse.unquote(source_url), 'ext': ext, 'height': height, }) diff --git a/yt_dlp/extractor/jable.py b/yt_dlp/extractor/jable.py deleted file mode 100644 index 84c3225e48..0000000000 --- a/yt_dlp/extractor/jable.py +++ /dev/null @@ -1,103 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - InAdvancePagedList, - int_or_none, - orderedSet, - unified_strdate, -) - - -class JableIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P<id>[\w-]+)' - _TESTS = [{ - 'url': 'https://jable.tv/videos/pppd-812/', - 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', - 'info_dict': { - 'id': 'pppd-812', - 'ext': 'mp4', - 'title': 'PPPD-812 只要表現好巨乳女教師吉根柚莉愛就獎勵學生們在白虎穴內射出精液', - 'description': 'md5:5b6d4199a854f62c5e56e26ccad19967', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - 'like_count': int, - 'view_count': int, - }, - }, { - 'url': 'https://jable.tv/videos/apak-220/', - 'md5': '71f9239d69ced58ab74a816908847cc1', - 'info_dict': { - 'id': 'apak-220', - 'ext': 'mp4', - 'title': 'md5:5c3861b7cf80112a6e2b70bccf170824', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - 'like_count': int, - 'view_count': int, - 'upload_date': '20220319', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - formats = self._extract_m3u8_formats( - 
self._search_regex(r'var\s+hlsUrl\s*=\s*\'([^\']+)', webpage, 'hls_url'), video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=''), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'formats': formats, - 'age_limit': 18, - 'upload_date': unified_strdate(self._search_regex( - r'class="inactive-color">\D+\s+(\d{4}-\d+-\d+)', webpage, 'upload_date', default=None)), - 'view_count': int_or_none(self._search_regex( - r'#icon-eye"></use></svg>\n*<span class="mr-3">([\d ]+)', - webpage, 'view_count', default='').replace(' ', '')), - 'like_count': int_or_none(self._search_regex( - r'#icon-heart"></use></svg><span class="count">(\d+)', webpage, 'link_count', default=None)), - } - - -class JablePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P<id>[\w-]+)' - _TESTS = [{ - 'url': 'https://jable.tv/models/kaede-karen/', - 'info_dict': { - 'id': 'kaede-karen', - 'title': '楓カレン', - }, - 'playlist_count': 34, - }, { - 'url': 'https://jable.tv/categories/roleplay/', - 'only_matching': True, - }, { - 'url': 'https://jable.tv/tags/girl/', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - - def page_func(page_num): - return [ - self.url_result(player_url, JableIE) - for player_url in orderedSet(re.findall( - r'href="(https://jable.tv/videos/[\w-]+/?)"', - self._download_webpage(url, playlist_id, query={ - 'mode': 'async', - 'from': page_num + 1, - 'function': 'get_block', - 'block_id': 'list_videos_common_videos_list', - }, note=f'Downloading page {page_num + 1}')))] - - return self.playlist_result( - InAdvancePagedList(page_func, int_or_none(self._search_regex( - r'from:(\d+)">[^<]+\s*»', webpage, 'last page number', default=1)), 24), - playlist_id, self._search_regex( - r'<h2 class="h3-md mb-1">([^<]+)', webpage, 'playlist title', default=None)) diff --git a/yt_dlp/extractor/jamendo.py b/yt_dlp/extractor/jamendo.py index a2bbba3979..16540c4147 100644 --- a/yt_dlp/extractor/jamendo.py +++ b/yt_dlp/extractor/jamendo.py @@ -1,7 +1,6 @@ import hashlib import random -from ..compat import compat_str from .common import InfoExtractor from ..utils import ( clean_html, @@ -40,20 +39,20 @@ class JamendoIE(InfoExtractor): 'like_count': int, 'average_rating': int, 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'], - } + }, }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'only_matching': True, }] def _call_api(self, resource, resource_id, fatal=True): - path = '/api/%ss' % resource - rand = compat_str(random.random()) + path = f'/api/{resource}s' + rand = str(random.random()) return self._download_json( 'https://www.jamendo.com' + path, resource_id, fatal=fatal, query={ 'id[]': resource_id, }, headers={ - 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + 'X-Jam-Call': f'${hashlib.sha1((path + rand).encode()).hexdigest()}*{rand}~', })[0] def _real_extract(self, url): @@ -72,12 +71,11 @@ def _real_extract(self, url): # if artist_name: # title = '%s - %s' % (artist_name, title) # album = get_model('album') - artist = self._call_api("artist", track.get('artistId'), fatal=False) - album = self._call_api("album", track.get('albumId'), fatal=False) + artist = self._call_api('artist', track.get('artistId'), fatal=False) + album = self._call_api('album', 
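The jamendo _call_api above signs each request with an X-Jam-Call header derived from the API path and a random nonce. Restated as a standalone helper (same construction as in the diff, outside the extractor class):

    import hashlib
    import random

    def jam_call_header(resource):
        # Builds '$<sha1(path + nonce)>*<nonce>~', as in _call_api above
        path = f'/api/{resource}s'
        rand = str(random.random())
        digest = hashlib.sha1((path + rand).encode()).hexdigest()
        return {'X-Jam-Call': f'${digest}*{rand}~'}

    print(jam_call_header('track'))
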
track.get('albumId'), fatal=False) formats = [{ - 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' - % (sub_domain, track_id, format_id), + 'url': f'https://{sub_domain}.jamendo.com/?trackid={track_id}&format={format_id}&from=app-97dab294', 'format_id': format_id, 'ext': ext, 'quality': quality, @@ -111,7 +109,7 @@ def _real_extract(self, url): tags.append(tag_name) stats = track.get('stats') or {} - license = track.get('licenseCC') or [] + video_license = track.get('licenseCC') or [] return { 'id': track_id, @@ -124,7 +122,7 @@ def _real_extract(self, url): 'track': track_name, 'album': album.get('name'), 'formats': formats, - 'license': '-'.join(license) if license else None, + 'license': '-'.join(video_license) if video_license else None, 'timestamp': int_or_none(track.get('dateCreated')), 'view_count': int_or_none(stats.get('listenedAll')), 'like_count': int_or_none(stats.get('favorited')), @@ -160,7 +158,7 @@ class JamendoAlbumIE(JamendoIE): # XXX: Do not subclass from concrete IE 'average_rating': 4, 'tags': ['rock', 'drums', 'bass', 'world', 'punk', 'neutral'], 'like_count': int, - } + }, }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'info_dict': { @@ -179,11 +177,11 @@ class JamendoAlbumIE(JamendoIE): # XXX: Do not subclass from concrete IE 'average_rating': 4, 'license': 'by', 'like_count': int, - } + }, }], 'params': { - 'playlistend': 2 - } + 'playlistend': 2, + }, }] def _real_extract(self, url): @@ -196,7 +194,7 @@ def _real_extract(self, url): track_id = track.get('id') if not track_id: continue - track_id = compat_str(track_id) + track_id = str(track_id) entries.append({ '_type': 'url_transparent', 'url': 'https://www.jamendo.com/track/' + track_id, @@ -207,4 +205,4 @@ def _real_extract(self, url): return self.playlist_result( entries, album_id, album_name, - clean_html(try_get(album, lambda x: x['description']['en'], compat_str))) + clean_html(try_get(album, lambda x: x['description']['en'], str))) diff --git a/yt_dlp/extractor/japandiet.py b/yt_dlp/extractor/japandiet.py index 6c650568ac..2ef091aff2 100644 --- a/yt_dlp/extractor/japandiet.py +++ b/yt_dlp/extractor/japandiet.py @@ -1,5 +1,6 @@ import re +from .common import InfoExtractor from ..utils import ( ExtractorError, clean_html, @@ -9,9 +10,8 @@ smuggle_url, traverse_obj, try_call, - unsmuggle_url + unsmuggle_url, ) -from .common import InfoExtractor def _parse_japanese_date(text): @@ -41,7 +41,7 @@ def _parse_japanese_duration(text): mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or '')) if not mobj: return - days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()] + days, hours, mins, secs = (int_or_none(x, default=0) for x in mobj.groups()) return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60 @@ -142,10 +142,10 @@ class ShugiinItvVodIE(ShugiinItvBaseIE): 'title': 'ウクライナ大統領国会演説(オンライン)', 'release_date': '20220323', 'chapters': 'count:4', - } + }, }, { 'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846', - 'only_matching': True + 'only_matching': True, }] def _real_extract(self, url): @@ -232,7 +232,7 @@ class SangiinIE(InfoExtractor): 'is_live': True, }, 'skip': 'this live is turned into archive after it ends', - }, ] + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/jeuxvideo.py b/yt_dlp/extractor/jeuxvideo.py index 56ea15cf95..793820600e 100644 --- a/yt_dlp/extractor/jeuxvideo.py +++ b/yt_dlp/extractor/jeuxvideo.py @@ -2,6 +2,8 
@@ class JeuxVideoIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ diff --git a/yt_dlp/extractor/jiocinema.py b/yt_dlp/extractor/jiocinema.py new file mode 100644 index 0000000000..30d98ba796 --- /dev/null +++ b/yt_dlp/extractor/jiocinema.py @@ -0,0 +1,408 @@ +import base64 +import itertools +import json +import random +import re +import string +import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + jwt_decode_hs256, + parse_age_limit, + try_call, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class JioCinemaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'jiocinema' + _GEO_BYPASS = False + _ACCESS_TOKEN = None + _REFRESH_TOKEN = None + _GUEST_TOKEN = None + _USER_ID = None + _DEVICE_ID = None + _API_HEADERS = {'Origin': 'https://www.jiocinema.com', 'Referer': 'https://www.jiocinema.com/'} + _APP_NAME = {'appName': 'RJIL_JioCinema'} + _APP_VERSION = {'appVersion': '5.0.0'} + _API_SIGNATURES = 'o668nxgzwff' + _METADATA_API_BASE = 'https://content-jiovoot.voot.com/psapi' + _ACCESS_HINT = 'the `accessToken` from your browser local storage' + _LOGIN_HINT = ( + 'Log in with "-u phone -p <PHONE_NUMBER>" to authenticate with OTP, ' + f'or use "-u token -p <ACCESS_TOKEN>" to log in with {_ACCESS_HINT}. ' + 'If you have previously logged in with yt-dlp and your session ' + 'has been cached, you can use "-u device -p <DEVICE_ID>"') + + def _cache_token(self, token_type): + assert token_type in ('access', 'refresh', 'all') + if token_type in ('access', 'all'): + self.cache.store( + JioCinemaBaseIE._NETRC_MACHINE, f'{JioCinemaBaseIE._DEVICE_ID}-access', JioCinemaBaseIE._ACCESS_TOKEN) + if token_type in ('refresh', 'all'): + self.cache.store( + JioCinemaBaseIE._NETRC_MACHINE, f'{JioCinemaBaseIE._DEVICE_ID}-refresh', JioCinemaBaseIE._REFRESH_TOKEN) + + def _call_api(self, url, video_id, note='Downloading API JSON', headers={}, data={}): + return self._download_json( + url, video_id, note, data=json.dumps(data, separators=(',', ':')).encode(), headers={ + 'Content-Type': 'application/json', + 'Accept': 'application/json', + **self._API_HEADERS, + **headers, + }, expected_status=(400, 403, 474)) + + def _call_auth_api(self, service, endpoint, note, headers={}, data={}): + return self._call_api( + f'https://auth-jiocinema.voot.com/{service}service/apis/v4/{endpoint}', + None, note=note, headers=headers, data=data) + + def _refresh_token(self): + if not JioCinemaBaseIE._REFRESH_TOKEN or not JioCinemaBaseIE._DEVICE_ID: + raise ExtractorError('User token has expired', expected=True) + response = self._call_auth_api( + 'token', 'refreshtoken', 'Refreshing token', + headers={'accesstoken': self._ACCESS_TOKEN}, data={ + **self._APP_NAME, + 'deviceId': self._DEVICE_ID, + 'refreshToken': self._REFRESH_TOKEN, + **self._APP_VERSION, + }) + refresh_token = response.get('refreshTokenId') + if refresh_token and refresh_token != JioCinemaBaseIE._REFRESH_TOKEN: + JioCinemaBaseIE._REFRESH_TOKEN = refresh_token + self._cache_token('refresh') + JioCinemaBaseIE._ACCESS_TOKEN = response['authToken'] + self._cache_token('access') + + def _fetch_guest_token(self): + JioCinemaBaseIE._DEVICE_ID = ''.join(random.choices(string.digits, k=10)) + guest_token = self._call_auth_api( + 'token', 'guest', 'Downloading guest token', data={ + **self._APP_NAME, + 'deviceType': 'phone', + 'os': 'ios', + 'deviceId': self._DEVICE_ID, + 'freshLaunch': 
False, + 'adId': self._DEVICE_ID, + **self._APP_VERSION, + }) + self._GUEST_TOKEN = guest_token['authToken'] + self._USER_ID = guest_token['userId'] + + def _call_login_api(self, endpoint, guest_token, data, note): + return self._call_auth_api( + 'user', f'loginotp/{endpoint}', note, headers={ + **self.geo_verification_headers(), + 'accesstoken': self._GUEST_TOKEN, + **self._APP_NAME, + **traverse_obj(guest_token, 'data', { + 'deviceType': ('deviceType', {str}), + 'os': ('os', {str}), + })}, data=data) + + def _is_token_expired(self, token): + return (try_call(lambda: jwt_decode_hs256(token)['exp']) or 0) <= int(time.time() - 180) + + def _perform_login(self, username, password): + if self._ACCESS_TOKEN and not self._is_token_expired(self._ACCESS_TOKEN): + return + + UUID_RE = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + + if username.lower() == 'token': + if try_call(lambda: jwt_decode_hs256(password)): + JioCinemaBaseIE._ACCESS_TOKEN = password + refresh_hint = 'the `refreshToken` UUID from your browser local storage' + refresh_token = self._configuration_arg('refresh_token', [''], ie_key=JioCinemaIE)[0] + if not refresh_token: + self.to_screen( + 'To extend the life of your login session, in addition to your access token, ' + 'you can pass --extractor-args "jiocinema:refresh_token=REFRESH_TOKEN" ' + f'where REFRESH_TOKEN is {refresh_hint}') + elif re.fullmatch(UUID_RE, refresh_token): + JioCinemaBaseIE._REFRESH_TOKEN = refresh_token + else: + self.report_warning(f'Invalid refresh_token value. Use {refresh_hint}') + else: + raise ExtractorError( + f'The password given could not be decoded as a token; use {self._ACCESS_HINT}', expected=True) + + elif username.lower() == 'device' and re.fullmatch(rf'(?:{UUID_RE}|\d+)', password): + JioCinemaBaseIE._REFRESH_TOKEN = self.cache.load(JioCinemaBaseIE._NETRC_MACHINE, f'{password}-refresh') + JioCinemaBaseIE._ACCESS_TOKEN = self.cache.load(JioCinemaBaseIE._NETRC_MACHINE, f'{password}-access') + if not JioCinemaBaseIE._REFRESH_TOKEN or not JioCinemaBaseIE._ACCESS_TOKEN: + raise ExtractorError(f'Failed to load cached tokens for device ID "{password}"', expected=True) + + elif username.lower() == 'phone' and re.fullmatch(r'\+?\d+', password): + self._fetch_guest_token() + guest_token = jwt_decode_hs256(self._GUEST_TOKEN) + initial_data = { + 'number': base64.b64encode(password.encode()).decode(), + **self._APP_VERSION, + } + response = self._call_login_api('send', guest_token, initial_data, 'Requesting OTP') + if not traverse_obj(response, ('OTPInfo', {dict})): + raise ExtractorError('There was a problem with the phone number login attempt') + + is_iphone = guest_token.get('os') == 'ios' + response = self._call_login_api('verify', guest_token, { + 'deviceInfo': { + 'consumptionDeviceName': 'iPhone' if is_iphone else 'Android', + 'info': { + 'platform': {'name': 'iPhone OS' if is_iphone else 'Android'}, + 'androidId': self._DEVICE_ID, + 'type': 'iOS' if is_iphone else 'Android', + }, + }, + **initial_data, + 'otp': self._get_tfa_info('the one-time password sent to your phone'), + }, 'Submitting OTP') + if traverse_obj(response, 'code') == 1043: + raise ExtractorError('Wrong OTP', expected=True) + JioCinemaBaseIE._REFRESH_TOKEN = response['refreshToken'] + JioCinemaBaseIE._ACCESS_TOKEN = response['authToken'] + + else: + raise ExtractorError(self._LOGIN_HINT, expected=True) + + user_token = jwt_decode_hs256(JioCinemaBaseIE._ACCESS_TOKEN)['data'] + JioCinemaBaseIE._USER_ID = user_token['userId'] + JioCinemaBaseIE._DEVICE_ID = user_token['deviceId'] + 
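The login flow above keys everything on _is_token_expired, which reads the exp claim out of the JWT payload without verifying the signature. A standalone equivalent (the 180-second grace period mirrors the code above):

    import time

    from yt_dlp.utils import jwt_decode_hs256, try_call

    def is_token_expired(token, grace=180):
        # Decode the (unverified) payload; the token still counts as valid
        # for `grace` seconds past its `exp` claim
        exp = try_call(lambda: jwt_decode_hs256(token)['exp']) or 0
        return exp <= int(time.time() - grace)
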
if JioCinemaBaseIE._REFRESH_TOKEN and username != 'device': + self._cache_token('all') + if self.get_param('cachedir') is not False: + self.to_screen( + f'NOTE: For subsequent logins you can use "-u device -p {JioCinemaBaseIE._DEVICE_ID}"') + elif not JioCinemaBaseIE._REFRESH_TOKEN: + JioCinemaBaseIE._REFRESH_TOKEN = self.cache.load( + JioCinemaBaseIE._NETRC_MACHINE, f'{JioCinemaBaseIE._DEVICE_ID}-refresh') + if JioCinemaBaseIE._REFRESH_TOKEN: + self._cache_token('access') + self.to_screen(f'Logging in as device ID "{JioCinemaBaseIE._DEVICE_ID}"') + if self._is_token_expired(JioCinemaBaseIE._ACCESS_TOKEN): + self._refresh_token() + + +class JioCinemaIE(JioCinemaBaseIE): + IE_NAME = 'jiocinema' + _VALID_URL = r'https?://(?:www\.)?jiocinema\.com/?(?:movies?/[^/?#]+/|tv-shows/(?:[^/?#]+/){3})(?P<id>\d{3,})' + _TESTS = [{ + 'url': 'https://www.jiocinema.com/tv-shows/agnisakshi-ek-samjhauta/1/pradeep-to-stop-the-wedding/3759931', + 'info_dict': { + 'id': '3759931', + 'ext': 'mp4', + 'title': 'Pradeep to stop the wedding?', + 'description': 'md5:75f72d1d1a66976633345a3de6d672b1', + 'episode': 'Pradeep to stop the wedding?', + 'episode_number': 89, + 'season': 'Agnisakshi…Ek Samjhauta-S1', + 'season_number': 1, + 'series': 'Agnisakshi Ek Samjhauta', + 'duration': 1238.0, + 'thumbnail': r're:https?://.+\.jpg', + 'age_limit': 13, + 'season_id': '3698031', + 'upload_date': '20230606', + 'timestamp': 1686009600, + 'release_date': '20230607', + 'genres': ['Drama'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.jiocinema.com/movies/bhediya/3754021/watch', + 'info_dict': { + 'id': '3754021', + 'ext': 'mp4', + 'title': 'Bhediya', + 'description': 'md5:a6bf2900371ac2fc3f1447401a9f7bb0', + 'episode': 'Bhediya', + 'duration': 8500.0, + 'thumbnail': r're:https?://.+\.jpg', + 'age_limit': 13, + 'upload_date': '20230525', + 'timestamp': 1685026200, + 'release_date': '20230524', + 'genres': ['Comedy'], + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _extract_formats_and_subtitles(self, playback, video_id): + m3u8_url = traverse_obj(playback, ( + 'data', 'playbackUrls', lambda _, v: v['streamtype'] == 'hls', 'url', {url_or_none}, any)) + if not m3u8_url: # DRM-only content only serves dash urls + self.report_drm(video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, m3u8_id='hls') + self._remove_duplicate_formats(formats) + + return { + # '/_definst_/smil:vod/' m3u8 manifests claim to have 720p+ formats but max out at 480p + 'formats': traverse_obj(formats, ( + lambda _, v: '/_definst_/smil:vod/' not in v['url'] or v['height'] <= 480)), + 'subtitles': subtitles, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + if not self._ACCESS_TOKEN and self._is_token_expired(self._GUEST_TOKEN): + self._fetch_guest_token() + elif self._ACCESS_TOKEN and self._is_token_expired(self._ACCESS_TOKEN): + self._refresh_token() + + playback = self._call_api( + f'https://apis-jiovoot.voot.com/playbackjv/v3/{video_id}', video_id, + 'Downloading playback JSON', headers={ + **self.geo_verification_headers(), + 'accesstoken': self._ACCESS_TOKEN or self._GUEST_TOKEN, + **self._APP_NAME, + 'deviceid': self._DEVICE_ID, + 'uniqueid': self._USER_ID, + 'x-apisignatures': self._API_SIGNATURES, + 'x-platform': 'androidweb', + 'x-platform-token': 'web', + }, data={ + '4k': False, + 'ageGroup': '18+', + 'appVersion': '3.4.0', + 'bitrateProfile': 'xhdpi', + 'capability': { + 'drmCapability': { + 'aesSupport': 'yes', + 'fairPlayDrmSupport': 'none', + 
'playreadyDrmSupport': 'none', + 'widevineDRMSupport': 'none', + }, + 'frameRateCapability': [{ + 'frameRateSupport': '30fps', + 'videoQuality': '1440p', + }], + }, + 'continueWatchingRequired': False, + 'dolby': False, + 'downloadRequest': False, + 'hevc': False, + 'kidsSafe': False, + 'manufacturer': 'Windows', + 'model': 'Windows', + 'multiAudioRequired': True, + 'osVersion': '10', + 'parentalPinValid': True, + 'x-apisignatures': self._API_SIGNATURES, + }) + + status_code = traverse_obj(playback, ('code', {int})) + if status_code == 474: + self.raise_geo_restricted(countries=['IN']) + elif status_code == 1008: + error_msg = 'This content is only available for premium users' + if self._ACCESS_TOKEN: + raise ExtractorError(error_msg, expected=True) + self.raise_login_required(f'{error_msg}. {self._LOGIN_HINT}', method=None) + elif status_code == 400: + raise ExtractorError('The requested content is not available', expected=True) + elif status_code is not None and status_code != 200: + raise ExtractorError( + f'JioCinema says: {traverse_obj(playback, ("message", {str})) or status_code}') + + metadata = self._download_json( + f'{self._METADATA_API_BASE}/voot/v1/voot-web/content/query/asset-details', + video_id, fatal=False, query={ + 'ids': f'include:{video_id}', + 'responseType': 'common', + 'devicePlatformType': 'desktop', + }) + + return { + 'id': video_id, + 'http_headers': self._API_HEADERS, + **self._extract_formats_and_subtitles(playback, video_id), + **traverse_obj(playback, ('data', { + # fallback metadata + 'title': ('name', {str}), + 'description': ('fullSynopsis', {str}), + 'series': ('show', 'name', {str}, {lambda x: x or None}), + 'season': ('tournamentName', {str}, {lambda x: x if x != 'Season 0' else None}), + 'season_number': ('episode', 'season', {int_or_none}, {lambda x: x or None}), + 'episode': ('fullTitle', {str}), + 'episode_number': ('episode', 'episodeNo', {int_or_none}, {lambda x: x or None}), + 'age_limit': ('ageNemonic', {parse_age_limit}), + 'duration': ('totalDuration', {float_or_none}), + 'thumbnail': ('images', {url_or_none}), + })), + **traverse_obj(metadata, ('result', 0, { + 'title': ('fullTitle', {str}), + 'description': ('fullSynopsis', {str}), + 'series': ('showName', {str}, {lambda x: x or None}), + 'season': ('seasonName', {str}, {lambda x: x or None}), + 'season_number': ('season', {int_or_none}), + 'season_id': ('seasonId', {str}, {lambda x: x or None}), + 'episode': ('fullTitle', {str}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('uploadTime', {int_or_none}), + 'release_date': ('telecastDate', {str}), + 'age_limit': ('ageNemonic', {parse_age_limit}), + 'duration': ('duration', {float_or_none}), + 'genres': ('genres', ..., {str}), + 'thumbnail': ('seo', 'ogImage', {url_or_none}), + })), + } + + +class JioCinemaSeriesIE(JioCinemaBaseIE): + IE_NAME = 'jiocinema:series' + _VALID_URL = r'https?://(?:www\.)?jiocinema\.com/tv-shows/(?P<slug>[\w-]+)/(?P<id>\d{3,})' + _TESTS = [{ + 'url': 'https://www.jiocinema.com/tv-shows/naagin/3499917', + 'info_dict': { + 'id': '3499917', + 'title': 'naagin', + }, + 'playlist_mincount': 120, + }, { + 'url': 'https://www.jiocinema.com/tv-shows/mtv-splitsvilla-x5/3499820', + 'info_dict': { + 'id': '3499820', + 'title': 'mtv-splitsvilla-x5', + }, + 'playlist_mincount': 310, + }] + + def _entries(self, series_id): + seasons = traverse_obj(self._download_json( + f'{self._METADATA_API_BASE}/voot/v1/voot-web/view/show/{series_id}', series_id, + 'Downloading series metadata JSON', query={'responseType': 
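The metadata handling above leans on traverse_obj dict templates: each output key maps to a traversal path, {callable} items transform or filter values, and `...` branches into lists. A self-contained example over made-up data shaped like the asset-details response:

    from yt_dlp.utils import int_or_none
    from yt_dlp.utils.traversal import traverse_obj

    data = {'result': [{'fullTitle': 'Bhediya', 'season': '1', 'genres': ['Comedy', None]}]}
    print(traverse_obj(data, ('result', 0, {
        'title': ('fullTitle', {str}),
        'season_number': ('season', {int_or_none}),  # casts '1' -> 1
        'genres': ('genres', ..., {str}),  # branching keeps only the str entry
    })))
    # {'title': 'Bhediya', 'season_number': 1, 'genres': ['Comedy']}
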
'common'}), ( + 'trays', lambda _, v: v['trayId'] == 'season-by-show-multifilter', + 'trayTabs', lambda _, v: v['id'])) + + for season_num, season in enumerate(seasons, start=1): + season_id = season['id'] + label = season.get('label') or season_num + for page_num in itertools.count(1): + episodes = traverse_obj(self._download_json( + f'{self._METADATA_API_BASE}/voot/v1/voot-web/content/generic/series-wise-episode', + season_id, f'Downloading season {label} page {page_num} JSON', query={ + 'sort': 'episode:asc', + 'id': season_id, + 'responseType': 'common', + 'page': page_num, + }), ('result', lambda _, v: v['id'] and url_or_none(v['slug']))) + if not episodes: + break + for episode in episodes: + yield self.url_result( + episode['slug'], JioCinemaIE, **traverse_obj(episode, { + 'video_id': 'id', + 'video_title': ('fullTitle', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + })) + + def _real_extract(self, url): + slug, series_id = self._match_valid_url(url).group('slug', 'id') + return self.playlist_result(self._entries(series_id), series_id, slug) diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py new file mode 100644 index 0000000000..030fe686bd --- /dev/null +++ b/yt_dlp/extractor/jiosaavn.py @@ -0,0 +1,201 @@ +import functools +import math +import re + +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + clean_html, + int_or_none, + make_archive_id, + smuggle_url, + unsmuggle_url, + url_basename, + url_or_none, + urlencode_postdata, +) +from ..utils.traversal import traverse_obj + + +class JioSaavnBaseIE(InfoExtractor): + _API_URL = 'https://www.jiosaavn.com/api.php' + _VALID_BITRATES = {'16', '32', '64', '128', '320'} + + @functools.cached_property + def requested_bitrates(self): + requested_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn') + if invalid_bitrates := set(requested_bitrates) - self._VALID_BITRATES: + raise ValueError( + f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. 
' + f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}') + return requested_bitrates + + def _extract_formats(self, song_data): + for bitrate in self.requested_bitrates: + media_data = self._download_json( + self._API_URL, song_data['id'], + f'Downloading format info for {bitrate}', + fatal=False, data=urlencode_postdata({ + '__call': 'song.generateAuthToken', + '_format': 'json', + 'bitrate': bitrate, + 'url': song_data['encrypted_media_url'], + })) + if not traverse_obj(media_data, ('auth_url', {url_or_none})): + self.report_warning(f'Unable to extract format info for {bitrate}') + continue + ext = media_data.get('type') + yield { + 'url': media_data['auth_url'], + 'ext': 'm4a' if ext == 'mp4' else ext, + 'format_id': bitrate, + 'abr': int(bitrate), + 'vcodec': 'none', + } + + def _extract_song(self, song_data, url=None): + info = traverse_obj(song_data, { + 'id': ('id', {str}), + 'title': ('song', {clean_html}), + 'album': ('album', {clean_html}), + 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('play_count', {int_or_none}), + 'release_year': ('year', {int_or_none}), + 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), + 'webpage_url': ('perma_url', {url_or_none}), + }) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + + return info + + def _call_api(self, type_, token, note='API', params={}): + return self._download_json( + self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON', + query={ + '__call': 'webapi.get', + '_format': 'json', + '_marker': '0', + 'ctx': 'web6dot0', + 'token': token, + 'type': type_, + **params, + }) + + def _yield_songs(self, playlist_data): + for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): + song_info = self._extract_song(song_data) + url = smuggle_url(song_info['webpage_url'], { + 'id': song_data['id'], + 'encrypted_media_url': song_data['encrypted_media_url'], + }) + yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + + +class JioSaavnSongIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:song' + _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', + 'md5': '3b84396d15ed9e083c3106f1fa589c04', + 'info_dict': { + 'id': 'IcoLuefJ', + 'display_id': 'OQsEfQFVUXk', + 'ext': 'm4a', + 'title': 'Leja Re', + 'album': 'Leja Re', + 'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'duration': 205, + 'view_count': int, + 'release_year': 2018, + 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'], + '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'], + }, + }, { + 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url) + song_data = traverse_obj(smuggled_data, ({ + 'id': ('id', {str}), + 'encrypted_media_url': ('encrypted_media_url', {str}), + })) + + if 'id' in song_data and 'encrypted_media_url' in song_data: + result = {'id': song_data['id']} + else: + # only extract metadata if this is not a url_transparent result + song_data = self._call_api('song', 
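The jiosaavn playlist/song handoff above uses URL smuggling: _yield_songs packs the already-known id and encrypted_media_url into the song URL, so JioSaavnSongIE can skip its metadata request. The mechanism in isolation (the values are samples from the tests above; the blob is a placeholder):

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    url = smuggle_url(
        'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
        {'id': 'IcoLuefJ', 'encrypted_media_url': '<opaque blob>'})
    plain_url, data = unsmuggle_url(url, {})
    print(plain_url, data['id'])
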
self._match_id(url))['songs'][0] + result = self._extract_song(song_data, url) + + result['formats'] = list(self._extract_formats(song_data)) + return result + + +class JioSaavnAlbumIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:album' + _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', + 'info_dict': { + 'id': 'buIOjYZDrNA_', + 'title': '96', + }, + 'playlist_count': 10, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + album_data = self._call_api('album', display_id) + + return self.playlist_result( + self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str}))) + + +class JioSaavnPlaylistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:playlist' + _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', + 'info_dict': { + 'id': 'LlJ8ZWT1ibN5084vKHRj2Q__', + 'title': 'Mood English', + }, + 'playlist_mincount': 301, + }, { + 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-hindi/DVR,pFUOwyXqIp77B1JF,A__', + 'info_dict': { + 'id': 'DVR,pFUOwyXqIp77B1JF,A__', + 'title': 'Mood Hindi', + }, + 'playlist_mincount': 801, + }, { + 'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_', + 'info_dict': { + 'id': 'Me5RridRfDk_', + 'title': 'Taaza Tunes', + }, + 'playlist_mincount': 301, + }] + _PAGE_SIZE = 50 + + def _fetch_page(self, token, page): + return self._call_api( + 'playlist', token, f'playlist page {page}', {'p': page, 'n': self._PAGE_SIZE}) + + def _entries(self, token, first_page_data, page): + page_data = first_page_data if not page else self._fetch_page(token, page + 1) + yield from self._yield_songs(page_data) + + def _real_extract(self, url): + display_id = self._match_id(url) + playlist_data = self._fetch_page(display_id, 1) + total_pages = math.ceil(int(playlist_data['list_count']) / self._PAGE_SIZE) + + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, display_id, playlist_data), + total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) diff --git a/yt_dlp/extractor/joj.py b/yt_dlp/extractor/joj.py index ea46042404..0c8e999cdf 100644 --- a/yt_dlp/extractor/joj.py +++ b/yt_dlp/extractor/joj.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( format_field, int_or_none, @@ -25,7 +24,7 @@ class JojIE(InfoExtractor): 'title': 'NOVÉ BÝVANIE', 'thumbnail': r're:^https?://.*?$', 'duration': 3118, - } + }, }, { 'url': 'https://media.joj.sk/embed/CSM0Na0l0p1', 'info_dict': { @@ -35,7 +34,7 @@ class JojIE(InfoExtractor): 'title': 'Extrémne rodiny 2 - POKRAČOVANIE (2012/04/09 21:30:00)', 'duration': 3937, 'thumbnail': r're:^https?://.*?$', - } + }, }, { 'url': 'https://media.joj.sk/embed/9i1cxv', 'only_matching': True, @@ -51,7 +50,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://media.joj.sk/embed/%s' % video_id, video_id) + f'https://media.joj.sk/embed/{video_id}', video_id) title = (self._search_json(r'videoTitle\s*:', webpage, 'title', video_id, contains_pattern=r'["\'].+["\']', default=None) @@ -66,7 +65,7 @@ def _real_extract(self, url): formats = [] for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []: - if 
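Unlike the on-demand paging used for iwara, JioSaavnPlaylistIE knows the total item count up front (list_count), so it can hand InAdvancePagedList a fixed page count and reuse the already-downloaded first page. A sketch with a fake fetcher and an invented total:

    import math

    from yt_dlp.utils import InAdvancePagedList

    PAGE_SIZE = 50
    TOTAL = 120  # e.g. the API's `list_count`

    def fetch(page):
        # 0-indexed page of fake entries; the extractor calls the API here
        start = page * PAGE_SIZE
        return [f'song-{n}' for n in range(start, min(start + PAGE_SIZE, TOTAL))]

    playlist = InAdvancePagedList(fetch, math.ceil(TOTAL / PAGE_SIZE), PAGE_SIZE)
    print(playlist[0])  # 'song-0'
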
isinstance(format_url, compat_str): + if isinstance(format_url, str): height = self._search_regex( r'(\d+)[pP]|(pal)\.', format_url, 'height', default=None) if height == 'pal': @@ -78,7 +77,7 @@ def _real_extract(self, url): }) if not formats: playlist = self._download_xml( - 'https://media.joj.sk/services/Video.php?clip=%s' % video_id, + f'https://media.joj.sk/services/Video.php?clip={video_id}', video_id) for file_el in playlist.findall('./files/file'): path = file_el.get('path') @@ -86,8 +85,8 @@ def _real_extract(self, url): continue format_id = file_el.get('id') or file_el.get('label') formats.append({ - 'url': 'http://n16.joj.sk/storage/%s' % path.replace( - 'dat/', '', 1), + 'url': 'http://n16.joj.sk/storage/{}'.format(path.replace( + 'dat/', '', 1)), 'format_id': format_id, 'height': int_or_none(self._search_regex( r'(\d+)[pP]', format_id or path, 'height', diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py new file mode 100644 index 0000000000..7a91d4a235 --- /dev/null +++ b/yt_dlp/extractor/joqrag.py @@ -0,0 +1,112 @@ +import datetime as dt +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + clean_html, + datetime_from_str, + unified_timestamp, + urljoin, +) + + +class JoqrAgIE(InfoExtractor): + IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)' + _VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php', + r'https?://(?:www\.)?joqr\.co\.jp/ag/', + r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])'] + _TESTS = [{ + 'url': 'https://www.uniqueradio.jp/agplayer5/player.php', + 'info_dict': { + 'id': 'live', + 'title': str, + 'channel': '超!A&G+', + 'description': str, + 'live_status': 'is_live', + 'release_timestamp': int, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }, { + 'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', + 'only_matching': True, + }, { + 'url': 'https://www.joqr.co.jp/ag/article/103760/', + 'only_matching': True, + }, { + 'url': 'http://www.joqr.co.jp/qr/agdailyprogram/', + 'only_matching': True, + }, { + 'url': 'http://www.joqr.co.jp/qr/agregularprogram/', + 'only_matching': True, + }] + + def _extract_metadata(self, variable, html): + return clean_html(urllib.parse.unquote_plus(self._search_regex( + rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', + html, 'metadata', group='value', default=''))) or None + + def _extract_start_timestamp(self, video_id, is_live): + def extract_start_time_from(date_str): + dt_ = datetime_from_str(date_str) + dt.timedelta(hours=9) + date = dt_.strftime('%Y%m%d') + start_time = self._search_regex( + r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})', + self._download_webpage( + f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id, + note=f'Downloading program list of {date}', fatal=False, + errnote=f'Failed to download program list of {date}') or '', + 'start time', default=None) + if start_time: + return unified_timestamp(f'{dt_.strftime("%Y/%m/%d")} {start_time} +09:00') + return None + + start_timestamp = extract_start_time_from('today') + if not start_timestamp: + return None + + if not is_live or start_timestamp < datetime_from_str('now').timestamp(): + return start_timestamp + else: + return extract_start_time_from('yesterday') + + def _real_extract(self, url): + video_id = 'live' + + metadata = self._download_webpage( + 'https://www.uniqueradio.jp/aandg', video_id, + 
note='Downloading metadata', errnote='Failed to download metadata') + title = self._extract_metadata('Program_name', metadata) + + if not title or title == '放送休止': + formats = [] + live_status = 'is_upcoming' + release_timestamp = self._extract_start_timestamp(video_id, False) + msg = 'This stream is not currently live' + if release_timestamp: + msg += (' and will start at ' + + dt.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) + self.raise_no_formats(msg, expected=True) + else: + m3u8_path = self._search_regex( + r'<source\s[^>]*\bsrc="([^"]+)"', + self._download_webpage( + 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id, + note='Downloading player data', errnote='Failed to download player data'), + 'm3u8 url') + formats = self._extract_m3u8_formats( + urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id) + live_status = 'is_live' + release_timestamp = self._extract_start_timestamp(video_id, True) + + return { + 'id': video_id, + 'title': title, + 'channel': '超!A&G+', + 'description': self._extract_metadata('Program_text', metadata), + 'formats': formats, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + } diff --git a/yt_dlp/extractor/jove.py b/yt_dlp/extractor/jove.py index 245fe73d4a..6b37ccfdcc 100644 --- a/yt_dlp/extractor/jove.py +++ b/yt_dlp/extractor/jove.py @@ -1,8 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate -) +from ..utils import ExtractorError, unified_strdate class JoveIE(InfoExtractor): @@ -19,7 +16,7 @@ class JoveIE(InfoExtractor): 'description': 'md5:015dd4509649c0908bc27f049e0262c6', 'thumbnail': r're:^https?://.*\.png$', 'upload_date': '20110523', - } + }, }, { 'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation', @@ -31,7 +28,7 @@ class JoveIE(InfoExtractor): 'description': 'md5:35ff029261900583970c4023b70f1dc9', 'thumbnail': r're:^https?://.*\.png$', 'upload_date': '20140802', - } + }, }, ] diff --git a/yt_dlp/extractor/jstream.py b/yt_dlp/extractor/jstream.py new file mode 100644 index 0000000000..00ac7ccca3 --- /dev/null +++ b/yt_dlp/extractor/jstream.py @@ -0,0 +1,73 @@ +import base64 +import json +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + js_to_json, + remove_start, +) + + +class JStreamIE(InfoExtractor): + # group "id" only exists for compliance, not directly used in requests + # also all components are mandatory + _VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))' + + _TESTS = [{ + 'url': 'jstream:www50:eqd638pvwx:752', + 'info_dict': { + 'id': 'eqd638pvwx:752', + 'ext': 'mp4', + 'title': '阪神淡路大震災 激震の記録2020年版 解説動画', + 'duration': 672, + 'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg', + }, + }] + + def _parse_jsonp(self, callback, string, video_id): + return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, callback, video_id) + + def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles): + for value in movie_list_hls: + text = value.get('text') or '' + if not text.startswith('auto'): + continue + m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id) + self._merge_subtitles(subs, target=subtitles) + yield from fmts + + def _real_extract(self, url): + 
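The schedule scraping above has to reconcile timezones: datetime_from_str works from UTC, so _extract_start_timestamp shifts by nine hours to get the JST broadcast date, then pins the scraped wall-clock time to the +09:00 offset. The same steps outside the extractor (the 21:00 start time is invented):

    import datetime as dt

    from yt_dlp.utils import datetime_from_str, unified_timestamp

    jst = datetime_from_str('today') + dt.timedelta(hours=9)  # UTC 'today' -> JST
    print(unified_timestamp(f'{jst.strftime("%Y/%m/%d")} 21:00 +09:00'))
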
host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id') + video_info_jsonp = self._download_webpage( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp', + video_id, 'Requesting video info') + video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie'] + subtitles = {} + formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles)) + self._remove_duplicate_formats(formats) + return { + 'id': video_id, + 'title': video_info.get('title'), + 'duration': float_or_none(video_info.get('duration')), + 'thumbnail': video_info.get('thumbnail_url'), + 'formats': formats, + 'subtitles': subtitles, + } + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # check for eligibility of webpage + # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89 + script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage) + if not script_tag: + return + host, publisher = script_tag.groups() + for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage): + # TODO: using json.loads here as InfoExtractor._parse_json is not a classmethod + info = json.loads(js_to_json(m.group(1))) + mid = base64.b64decode(info.get('m')).decode() + yield f'jstream:{host}:{publisher}:{mid}' diff --git a/yt_dlp/extractor/jtbc.py b/yt_dlp/extractor/jtbc.py new file mode 100644 index 0000000000..573f7492fe --- /dev/null +++ b/yt_dlp/extractor/jtbc.py @@ -0,0 +1,156 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class JTBCIE(InfoExtractor): + IE_DESC = 'jtbc.co.kr' + _VALID_URL = r'''(?x) + https?://(?: + vod\.jtbc\.co\.kr/player/(?:program|clip) + |tv\.jtbc\.co\.kr/(?:replay|trailer|clip)/pr\d+/pm\d+ + )/(?P<id>(?:ep|vo)\d+)''' + _GEO_COUNTRIES = ['KR'] + + _TESTS = [{ + 'url': 'https://tv.jtbc.co.kr/replay/pr10011629/pm10067930/ep20216321/view', + 'md5': 'e6ade71d8c8685bbfd6e6ce4167c6a6c', + 'info_dict': { + 'id': 'VO10721192', + 'display_id': 'ep20216321', + 'ext': 'mp4', + 'title': '힘쎈여자 강남순 2회 다시보기', + 'description': 'md5:043c1d9019100ce271dba09995dbd1e2', + 'duration': 3770.0, + 'release_date': '20231008', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/drama/stronggirlnamsoon/img/20231008_163541_522_1.jpg', + 'series': '힘쎈여자 강남순', + }, + }, { + 'url': 'https://vod.jtbc.co.kr/player/program/ep20216733', + 'md5': '217a6d190f115a75e4bda0ceaa4cd7f4', + 'info_dict': { + 'id': 'VO10721429', + 'display_id': 'ep20216733', + 'ext': 'mp4', + 'title': '헬로 마이 닥터 친절한 진료실 149회 다시보기', + 'description': 'md5:1d70788a982dd5de26874a92fcffddb8', + 'duration': 2720.0, + 'release_date': '20231009', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/culture/hellomydoctor/img/20231009_095002_528_1.jpg', + 'series': '헬로 마이 닥터 친절한 진료실', + }, + }, { + 'url': 'https://vod.jtbc.co.kr/player/clip/vo10721270', + 'md5': '05782e2dc22a9c548aebefe62ae4328a', + 'info_dict': { + 'id': 'VO10721270', + 'display_id': 'vo10721270', + 'ext': 'mp4', + 'title': '뭉쳐야 찬다3 2회 예고편 - A매치로 향하는 마지막 관문💥', + 'description': 'md5:d48b51a8655c84843b4ed8d0c39aae68', + 'duration': 46.0, 
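The jstream metadata endpoint answers with JSONP, which _parse_jsonp above unwraps via _search_json. Stripped of the extractor machinery, and assuming the payload is plain JSON, the idea reduces to:

    import json
    import re

    def parse_jsonp(callback, text):
        # Strip the 'callback( ... );' wrapper and parse what is inside
        m = re.search(rf'{re.escape(callback)}\s*\(\s*(.+)\s*\)\s*;?\s*$', text, re.DOTALL)
        return json.loads(m.group(1)) if m else None

    print(parse_jsonp('metaDataResult', 'metaDataResult({"movie": {"title": "demo"}});'))
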
'release_date': '20231015', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/soccer3/img/20231008_210957_775_1.jpg', + 'series': '뭉쳐야 찬다3', + }, + }, { + 'url': 'https://tv.jtbc.co.kr/trailer/pr10010392/pm10032526/vo10720912/view', + 'md5': '367d480eb3ef54a9cd7a4b4d69c4b32d', + 'info_dict': { + 'id': 'VO10720912', + 'display_id': 'vo10720912', + 'ext': 'mp4', + 'title': '아는 형님 404회 예고편 | 10월 14일(토) 저녁 8시 50분 방송!', + 'description': 'md5:2743bb1079ceb85bb00060f2ad8f0280', + 'duration': 148.0, + 'release_date': '20231014', + 'age_limit': 15, + 'thumbnail': 'https://fs.jtbc.co.kr//joydata/CP00000001/prog/enter/jtbcbros/img/20231006_230023_802_1.jpg', + 'series': '아는 형님', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + if display_id.startswith('vo'): + video_id = display_id.upper() + else: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-vod="(VO\d+)"', webpage, 'vod id') + + playback_data = self._download_json( + f'https://api.jtbc.co.kr/vod/{video_id}', video_id, note='Downloading VOD playback data') + + subtitles = {} + for sub in traverse_obj(playback_data, ('tracks', lambda _, v: v['file'])): + subtitles.setdefault(sub.get('label', 'und'), []).append({'url': sub['file']}) + + formats = [] + for stream_url in traverse_obj(playback_data, ('sources', 'HLS', ..., 'file', {url_or_none})): + stream_url = re.sub(r'/playlist(?:_pd\d+)?\.m3u8', '/index.m3u8', stream_url) + formats.extend(self._extract_m3u8_formats(stream_url, video_id, fatal=False)) + + metadata = self._download_json( + 'https://now-api.jtbc.co.kr/v1/vod/detail', video_id, + note='Downloading mobile details', fatal=False, query={'vodFileId': video_id}) + return { + 'id': video_id, + 'display_id': display_id, + **traverse_obj(metadata, ('vodDetail', { + 'title': 'vodTitleView', + 'series': 'programTitle', + 'age_limit': ('watchAge', {int_or_none}), + 'release_date': ('broadcastDate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'description': 'episodeContents', + 'thumbnail': ('imgFileUrl', {url_or_none}), + })), + 'duration': parse_duration(playback_data.get('playTime')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class JTBCProgramIE(InfoExtractor): + IE_NAME = 'JTBC:program' + _VALID_URL = r'https?://(?:vod\.jtbc\.co\.kr/program|tv\.jtbc\.co\.kr/replay)/(?P<id>pr\d+)/(?:replay|pm\d+)/?(?:$|[?#])' + + _TESTS = [{ + 'url': 'https://tv.jtbc.co.kr/replay/pr10010392/pm10032710', + 'info_dict': { + '_type': 'playlist', + 'id': 'pr10010392', + }, + 'playlist_count': 398, + }, { + 'url': 'https://vod.jtbc.co.kr/program/pr10011491/replay', + 'info_dict': { + '_type': 'playlist', + 'id': 'pr10011491', + }, + 'playlist_count': 59, + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + vod_list = self._download_json( + 'https://now-api.jtbc.co.kr/v1/vodClip/programHome/programReplayVodList', program_id, + note='Downloading program replay list', query={ + 'programId': program_id, + 'rowCount': '10000', + }) + + entries = [self.url_result(f'https://vod.jtbc.co.kr/player/program/{video_id}', JTBCIE, video_id) + for video_id in traverse_obj(vod_list, ('programReplayVodList', ..., 'episodeId'))] + return self.playlist_result(entries, program_id) diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py index c949689430..7d5a931b5f 100644 --- a/yt_dlp/extractor/jwplatform.py +++ b/yt_dlp/extractor/jwplatform.py @@ -8,15 +8,17 @@ class JWPlatformIE(InfoExtractor): 
_VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', - 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'md5': '3aa16e4f6860e6e78b7df5829519aed3', 'info_dict': { 'id': 'nPripu9l', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Big Buck Bunny Trailer', 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', 'upload_date': '20081127', 'timestamp': 1227796140, - } + 'duration': 32.0, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nPripu9l/poster.jpg?width=720', + }, }, { 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', 'only_matching': True, @@ -37,18 +39,31 @@ class JWPlatformIE(InfoExtractor): }, }, { # Player url not surrounded by quotes - 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/school-trip', 'info_dict': { - 'id': 'R10NQdhY', - 'title': 'Playgirl', + 'id': 'jUxh5uin', + 'title': 'Klassenfahrt', 'ext': 'mp4', - 'upload_date': '20220624', - 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', - 'timestamp': 1656064800, - 'description': 'BRD 1966, Will Tremper', - 'duration': 5146.0, + 'upload_date': '20230109', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jUxh5uin/poster.jpg?width=720', + 'timestamp': 1673270298, + 'description': '', + 'duration': 5193.0, }, 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }, { + # iframe src attribute includes backslash before URL string + 'url': 'https://www.elespectador.com/colombia/video-asi-se-evito-la-fuga-de-john-poulos-presunto-feminicida-de-valentina-trespalacios-explicacion', + 'info_dict': { + 'id': 'QD3gsexj', + 'title': 'Así se evitó la fuga de John Poulos, presunto feminicida de Valentina Trespalacios', + 'ext': 'mp4', + 'upload_date': '20230127', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/QD3gsexj/poster.jpg?width=720', + 'timestamp': 1674862986, + 'description': 'md5:128fd74591c4e1fc2da598c5cb6f5ce4', + 'duration': 263.0, + }, }] @classmethod @@ -57,7 +72,7 @@ def _extract_embed_urls(cls, url, webpage): # <input value=URL> is used by hyland.com # if we find <iframe>, dont look for <input> ret = re.findall( - r'<%s[^>]+?%s=["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), + rf'<{tag}[^>]+?{key}=\\?["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{{8}})', webpage) if ret: return ret diff --git a/yt_dlp/extractor/kakao.py b/yt_dlp/extractor/kakao.py index 1f0f0a5d5c..6f3459db82 100644 --- a/yt_dlp/extractor/kakao.py +++ b/yt_dlp/extractor/kakao.py @@ -1,10 +1,10 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, - strip_or_none, str_or_none, + strip_or_none, traverse_obj, unified_timestamp, ) @@ -33,7 +33,7 @@ class KakaoIE(InfoExtractor): 'view_count': int, 'duration': 1503, 'comment_count': int, - } + }, }, { 'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180', 'md5': 'a8917742069a4dd442516b86e7d66529', @@ -52,7 +52,7 @@ class KakaoIE(InfoExtractor): 'view_count': int, 'duration': 184, 'comment_count': int, - } + }, }, { # geo restricted 'url': 
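The jwplatform regex change above does two things: the move to an rf-string means the {8} quantifier must be doubled to {{8}}, and the new \\? tolerates a backslash-escaped quote, for player URLs embedded in JSON-encoded HTML. Exercised against a hypothetical snippet:

    import re

    def embed_re(tag, key):
        # Same shape as the pattern above, with \\? before the optional quote
        return rf'<{tag}[^>]+?{key}=\\?["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{{8}})'

    html = '<iframe src=\\"https://cdn.jwplayer.com/players/QD3gsexj-abcdefgh\\"></iframe>'
    print(re.findall(embed_re('iframe', 'src'), html))
    # ['https://cdn.jwplayer.com/players/QD3gsexj']
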
'https://tv.kakao.com/channel/3643855/cliplink/412069491', @@ -76,7 +76,7 @@ def _real_extract(self, url): 'description', 'channelId', 'createTime', 'duration', 'playCount', 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', - 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) + 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']), } api_json = self._download_json( @@ -99,10 +99,10 @@ def _real_extract(self, url): try: fmt_url_json = self._download_json( cdn_api_base, video_id, query=query, - note='Downloading video URL for profile %s' % profile_name) + note=f'Downloading video URL for profile {profile_name}') except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - resp = self._parse_json(e.cause.read().decode(), video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + resp = self._parse_json(e.cause.response.read().decode(), video_id) if resp.get('code') == 'GeoBlocked': self.raise_geo_restricted() raise @@ -126,7 +126,7 @@ def _real_extract(self, url): thumbs.append({ 'url': thumb.get('thumbnailUrl'), 'id': str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 + 'preference': -1 if thumb.get('isDefault') else 0, }) top_thumbnail = clip.get('thumbnailUrl') if top_thumbnail: diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index 95e2deea5b..e5737b1e9e 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -1,21 +1,19 @@ import base64 +import contextlib import json import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_parse_qs, -) from ..utils import ( - clean_html, ExtractorError, + clean_html, format_field, int_or_none, - unsmuggle_url, + remove_start, smuggle_url, traverse_obj, - remove_start + unsmuggle_url, ) @@ -57,7 +55,7 @@ class KalturaIE(InfoExtractor): 'thumbnail': 're:^https?://.*/thumbnail/.*', 'timestamp': int, }, - 'skip': 'The access to this service is forbidden since the specified partner is blocked' + 'skip': 'The access to this service is forbidden since the specified partner is blocked', }, { 'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4', @@ -124,14 +122,14 @@ class KalturaIE(InfoExtractor): 'view_count': int, 'upload_date': '20140815', 'thumbnail': 'http://cfvod.kaltura.com/p/691292/sp/69129200/thumbnail/entry_id/0_c076mna6/version/100022', - } + }, }, { # html5lib playlist URL using kwidget player 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.89/mwEmbedFrame.php/p/2019031/uiconf_id/40436601?wid=1_4j3m32cv&iframeembed=true&playerId=kaltura_player_&flashvars[playlistAPI.kpl0Id]=1_jovey5nu&flashvars[ks]=&&flashvars[imageDefaultDuration]=30&flashvars[localizationCode]=en&flashvars[leadWithHTML5]=true&flashvars[forceMobileHTML5]=true&flashvars[nextPrevBtn.plugin]=true&flashvars[hotspots.plugin]=true&flashvars[sideBarContainer.plugin]=true&flashvars[sideBarContainer.position]=left&flashvars[sideBarContainer.clickToClose]=true&flashvars[chapters.plugin]=true&flashvars[chapters.layout]=vertical&flashvars[chapters.thumbnailRotator]=false&flashvars[streamSelector.plugin]=true&flashvars[EmbedPlayer.SpinnerTarget]=videoHolder&flashvars[dualScreen.plugin]=true&flashvars[playlistAPI.playlistUrl]=https://canvasgatechtest.kaf.kaltura.com/playlist/details/{playlistAPI.kpl0Id}/categoryid/126428551', 
'info_dict': { 'id': '1_jovey5nu', - 'title': '00-00 Introduction' + 'title': '00-00 Introduction', }, 'playlist': [ { @@ -145,7 +143,7 @@ class KalturaIE(InfoExtractor): 'timestamp': 1533154447, 'upload_date': '20180801', 'uploader_id': 'djoyner3', - } + }, }, { 'info_dict': { 'id': '1_jfb7mdpn', @@ -157,7 +155,7 @@ class KalturaIE(InfoExtractor): 'timestamp': 1533154489, 'upload_date': '20180801', 'uploader_id': 'djoyner3', - } + }, }, { 'info_dict': { 'id': '1_8xflxdp7', @@ -169,7 +167,7 @@ class KalturaIE(InfoExtractor): 'timestamp': 1533154512, 'upload_date': '20180801', 'uploader_id': 'djoyner3', - } + }, }, { 'info_dict': { 'id': '1_3hqew8kn', @@ -181,10 +179,10 @@ class KalturaIE(InfoExtractor): 'timestamp': 1533154536, 'upload_date': '20180801', 'uploader_id': 'djoyner3', - } - } - ] - } + }, + }, + ], + }, ] @classmethod @@ -192,14 +190,14 @@ def _extract_embed_urls(cls, url, webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( list(re.finditer( - r"""(?xs) + r'''(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? (?P<q1>['"])wid(?P=q1)\s*:\s* (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage)) + ''', webpage)) or list(re.finditer( r'''(?xs) (?P<q1>["']) @@ -230,34 +228,34 @@ def _extract_embed_urls(cls, url, webpage): for k, v in embed_info.items(): if v: embed_info[k] = v.strip() - embed_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + embed_url = 'kaltura:{partner_id}:{id}'.format(**embed_info) escaped_pid = re.escape(embed_info['partner_id']) service_mobj = re.search( - r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), + rf'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/{escaped_pid}/sp/{escaped_pid}00/embedIframeJs', webpage) if service_mobj: embed_url = smuggle_url(embed_url, {'service_url': service_mobj.group('id')}) urls.append(embed_url) return urls - def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): + def _kaltura_api_call(self, video_id, actions, service_url=None, **kwargs): params = actions[0] - params.update({i: a for i, a in enumerate(actions[1:], start=1)}) + params.update(dict(enumerate(actions[1:], start=1))) data = self._download_json( (service_url or self._SERVICE_URL) + self._SERVICE_BASE, - video_id, data=json.dumps(params).encode('utf-8'), + video_id, data=json.dumps(params).encode(), headers={ 'Content-Type': 'application/json', 'Accept-Encoding': 'gzip, deflate, br', - }, *args, **kwargs) + }, **kwargs) for idx, status in enumerate(data): if not isinstance(status, dict): continue if status.get('objectType') == 'KalturaAPIException': raise ExtractorError( - '%s said: %s (%d)' % (self.IE_NAME, status['message'], idx)) + '{} said: {} ({})'.format(self.IE_NAME, status['message'], idx)) data[1] = traverse_obj(data, (1, 'objects', 0)) @@ -342,7 +340,7 @@ def _get_video_info_kwidget(self, video_id, partner_id, service_url=None): 'apiVersion': '3.1', 'clientTag': 'kwidget:v2.89', 'ignoreNull': 1, - 'ks': '{1:result:ks}' + 'ks': '{1:result:ks}', }, # info { @@ -397,10 +395,10 @@ def _real_extract(self, url): raise ExtractorError('Invalid URL', expected=True) params = {} if query: - params = compat_parse_qs(query) + params = urllib.parse.parse_qs(query) if path: splitted_path = path.split('/') - params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) + 
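Kaltura kwidget URLs encode their parameters as alternating /key/value path segments; the zip expression above folds them into the same {key: [value]} shape that urllib.parse.parse_qs yields for query strings, so both sources can be handled uniformly. With the path from the cache_st test URL above:

    path = 'wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4'
    parts = path.split('/')
    print(dict(zip(parts[::2], [[v] for v in parts[1::2]])))
    # {'wid': ['_269692'], 'uiconf_id': ['3873291'], 'entry_id': ['1_1jc2y3e4']}
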
params.update(dict(zip(splitted_path[::2], [[v] for v in splitted_path[1::2]]))) if 'wid' in params: partner_id = remove_start(params['wid'][0], '_') elif 'p' in params: @@ -423,14 +421,11 @@ def _real_extract(self, url): # Unfortunately, data returned in kalturaIframePackageData lacks # captions so we will try requesting the complete data using # regular approach since we now know the entry_id - try: + # Even if this fails we already have everything extracted + # apart from captions and can process at least with this + with contextlib.suppress(ExtractorError): _, info, flavor_assets, captions = self._get_video_info( entry_id, partner_id, player_type=player_type) - except ExtractorError: - # Regular scenario failed but we already have everything - # extracted apart from captions and can process at least - # with this - pass elif 'uiconf_id' in params and 'flashvars[playlistAPI.kpl0Id]' in params: playlist_id = params['flashvars[playlistAPI.kpl0Id]'][0] webpage = self._download_webpage(url, playlist_id) @@ -451,16 +446,16 @@ def _per_video_extract(self, smuggled_data, entry_id, info, ks, flavor_assets, c source_url = smuggled_data.get('source_url') if source_url: referrer = base64.b64encode( - '://'.join(compat_urlparse.urlparse(source_url)[:2]) - .encode('utf-8')).decode('utf-8') + '://'.join(urllib.parse.urlparse(source_url)[:2]) + .encode()).decode('utf-8') else: referrer = None def sign_url(unsigned_url): if ks: - unsigned_url += '/ks/%s' % ks + unsigned_url += f'/ks/{ks}' if referrer: - unsigned_url += '?referrer=%s' % referrer + unsigned_url += f'?referrer={referrer}' return unsigned_url data_url = info['dataUrl'] @@ -487,8 +482,8 @@ def sign_url(unsigned_url): else: f['fileExt'] = 'mp4' video_url = sign_url( - '%s/flavorId/%s' % (data_url, f['id'])) - format_id = '%(fileExt)s-%(bitrate)s' % f + '{}/flavorId/{}'.format(data_url, f['id'])) + format_id = '{fileExt}-{bitrate}'.format(**f) # Source format may not be available (e.g. 
kaltura:513551:1_66x4rg7o) if f.get('isOriginal') is True and not self._is_valid_url( video_url, entry_id, format_id): @@ -527,7 +522,7 @@ def sign_url(unsigned_url): continue caption_format = int_or_none(caption.get('format')) subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ - 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), + 'url': '{}/api_v3/service/caption_captionasset/action/serve/captionAssetId/{}'.format(self._SERVICE_URL, caption['id']), 'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml', }) diff --git a/yt_dlp/extractor/kanal2.py b/yt_dlp/extractor/kanal2.py deleted file mode 100644 index 3c0efe5981..0000000000 --- a/yt_dlp/extractor/kanal2.py +++ /dev/null @@ -1,66 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - join_nonempty, - traverse_obj, - unified_timestamp, - update_url_query, -) - - -class Kanal2IE(InfoExtractor): - _VALID_URL = r'https?://kanal2\.postimees\.ee/[^?#]+\?([^#]+&)?id=(?P<id>\d+)' - _TESTS = [{ - 'note': 'Test standard url (#5575)', - 'url': 'https://kanal2.postimees.ee/pluss/video/?id=40792', - 'md5': '7ea7b16266ec1798743777df241883dd', - 'info_dict': { - 'id': '40792', - 'ext': 'mp4', - 'title': 'Aedniku aabits / Osa 53 (05.08.2016 20:00)', - 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:53cabf3c5d73150d594747f727431248', - 'upload_date': '20160805', - 'timestamp': 1470420000, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - playlist = self._download_json( - f'https://kanal2.postimees.ee/player/playlist/{video_id}', - video_id, query={'type': 'episodes'}, - headers={'X-Requested-With': 'XMLHttpRequest'}) - - return { - 'id': video_id, - 'title': join_nonempty(*traverse_obj(playlist, ('info', ('title', 'subtitle'))), delim=' / '), - 'description': traverse_obj(playlist, ('info', 'description')), - 'thumbnail': traverse_obj(playlist, ('data', 'image')), - 'formats': self.get_formats(playlist, video_id), - 'timestamp': unified_timestamp(self._search_regex( - r'\((\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2})\)$', - traverse_obj(playlist, ('info', 'subtitle')), 'timestamp', default='') + ' +0200'), - } - - def get_formats(self, playlist, video_id): - path = traverse_obj(playlist, ('data', 'path')) - if not path: - raise ExtractorError('Path value not found in playlist JSON response') - session = self._download_json( - 'https://sts.postimees.ee/session/register', - video_id, note='Creating session', errnote='Error creating session', - headers={ - 'X-Original-URI': path, - 'Accept': 'application/json', - }) - if session.get('reason') != 'OK' or not session.get('session'): - reason = session.get('reason', 'unknown error') - raise ExtractorError(f'Unable to obtain session: {reason}') - - formats = [] - for stream in traverse_obj(playlist, ('data', 'streams', ..., 'file')): - formats.extend(self._extract_m3u8_formats( - update_url_query(stream, {'s': session['session']}), video_id, 'mp4')) - - return formats diff --git a/yt_dlp/extractor/kankanews.py b/yt_dlp/extractor/kankanews.py index 46e239bd6c..a39ff78ce6 100644 --- a/yt_dlp/extractor/kankanews.py +++ b/yt_dlp/extractor/kankanews.py @@ -1,13 +1,14 @@ -import time +import hashlib import random import string -import hashlib +import time import urllib.parse from .common import InfoExtractor class KankaNewsIE(InfoExtractor): + _WORKING = False _VALID_URL = 
r'https?://(?:www\.)?kankanews\.com/a/\d+\-\d+\-\d+/(?P<id>\d+)\.shtml' _TESTS = [{ 'url': 'https://www.kankanews.com/a/2022-11-08/00310276054.shtml?appid=1088227', @@ -18,7 +19,7 @@ class KankaNewsIE(InfoExtractor): 'ext': 'mp4', 'title': '视频|第23个中国记者节,我们在进博切蛋糕', 'thumbnail': r're:^https?://.*\.jpg*', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/karaoketv.py b/yt_dlp/extractor/karaoketv.py index 381dc00ad7..8168b1a492 100644 --- a/yt_dlp/extractor/karaoketv.py +++ b/yt_dlp/extractor/karaoketv.py @@ -13,7 +13,7 @@ class KaraoketvIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, } def _real_extract(self, url): @@ -45,7 +45,7 @@ def _real_extract(self, url): servers = ('wowzail.video-cdn.com:80/vodcdn', ) formats = [{ - 'url': 'rtmp://%s' % server if not server.startswith('rtmp') else server, + 'url': f'rtmp://{server}' if not server.startswith('rtmp') else server, 'play_path': play_path, 'app': 'vodcdn', 'page_url': video_cdn_url, diff --git a/yt_dlp/extractor/karrierevideos.py b/yt_dlp/extractor/karrierevideos.py deleted file mode 100644 index 28d4841aa3..0000000000 --- a/yt_dlp/extractor/karrierevideos.py +++ /dev/null @@ -1,96 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - fix_xml_ampersands, - float_or_none, - xpath_with_ns, - xpath_text, -) - - -class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', - 'info_dict': { - 'id': '32c91', - 'ext': 'flv', - 'title': 'AltenpflegerIn', - 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', - 'thumbnail': r're:^http://.*\.png', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - # broken ampersands - 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', - 'info_dict': { - 'id': '5sniu', - 'ext': 'flv', - 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', - 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', - 'thumbnail': r're:^http://.*\.png', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - title = (self._html_search_meta('title', webpage, default=None) - or self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title')) - - video_id = self._search_regex( - r'/config/video/(.+?)\.xml', webpage, 'video id') - # Server returns malformed headers - # Force Accept-Encoding: * to prevent gzipped results - playlist = self._download_xml( - 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, - video_id, transform_source=fix_xml_ampersands, - headers={'Accept-Encoding': '*'}) - - NS_MAP = { - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' - } - - def ns(path): - return xpath_with_ns(path, NS_MAP) - - item = playlist.find('./tracklist/item') - video_file = xpath_text( - item, ns('./jwplayer:file'), 'video url', fatal=True) - streamer = xpath_text( - item, ns('./jwplayer:streamer'), 'streamer', fatal=True) - - uploader = xpath_text( - item, ns('./jwplayer:author'), 'uploader') - duration = float_or_none( - xpath_text(item, ns('./jwplayer:duration'), 'duration')) - - description = self._html_search_regex( - r'(?s)<div class="leadtext">(.+?)</div>', - webpage, 
'description') - - thumbnail = self._html_search_meta( - 'thumbnail', webpage, 'thumbnail') - if thumbnail: - thumbnail = compat_urlparse.urljoin(url, thumbnail) - - return { - 'id': video_id, - 'url': streamer.replace('rtmpt', 'rtmp'), - 'play_path': 'mp4:%s' % video_file, - 'ext': 'flv', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, - } diff --git a/yt_dlp/extractor/keezmovies.py b/yt_dlp/extractor/keezmovies.py deleted file mode 100644 index b50da420cb..0000000000 --- a/yt_dlp/extractor/keezmovies.py +++ /dev/null @@ -1,125 +0,0 @@ -import re - -from .common import InfoExtractor -from ..aes import aes_decrypt_text -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - determine_ext, - format_field, - int_or_none, - str_to_int, - strip_or_none, - url_or_none, -) - - -class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681', - 'md5': '2ac69cdb882055f71d82db4311732a1a', - 'info_dict': { - 'id': '18070681', - 'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money', - 'ext': 'mp4', - 'title': 'Arab wife want it so bad I see she thirsty and has tiny money.', - 'thumbnail': None, - 'view_count': int, - 'age_limit': 18, - } - }, { - 'url': 'http://www.keezmovies.com/video/18070681', - 'only_matching': True, - }] - - def _extract_info(self, url, fatal=True): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - display_id = (mobj.group('display_id') - if 'display_id' in mobj.groupdict() - else None) or mobj.group('id') - - webpage = self._download_webpage( - url, display_id, headers={'Cookie': 'age_verified=1'}) - - formats = [] - format_urls = set() - - title = None - thumbnail = None - duration = None - encrypted = False - - def extract_format(format_url, height=None): - format_url = url_or_none(format_url) - if not format_url or not format_url.startswith(('http', '//')): - return - if format_url in format_urls: - return - format_urls.add(format_url) - tbr = int_or_none(self._search_regex( - r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) - if not height: - height = int_or_none(self._search_regex( - r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) - if encrypted: - format_url = aes_decrypt_text( - video_url, title, 32).decode('utf-8') - formats.append({ - 'url': format_url, - 'format_id': format_field(height, None, '%dp'), - 'height': height, - 'tbr': tbr, - }) - - flashvars = self._parse_json( - self._search_regex( - r'flashvars\s*=\s*({.+?});', webpage, - 'flashvars', default='{}'), - display_id, fatal=False) - - if flashvars: - title = flashvars.get('video_title') - thumbnail = flashvars.get('image_url') - duration = int_or_none(flashvars.get('video_duration')) - encrypted = flashvars.get('encrypted') is True - for key, value in flashvars.items(): - mobj = re.search(r'quality_(\d+)[pP]', key) - if mobj: - extract_format(value, int(mobj.group(1))) - video_url = flashvars.get('video_url') - if video_url and determine_ext(video_url, None): - extract_format(video_url) - - video_url = self._html_search_regex( - r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1', - webpage, 'video url', default=None, group='url') - if video_url: - extract_format(compat_urllib_parse_unquote(video_url)) - - if not formats: - if 'title="This video is no longer 
available"' in webpage: - self.raise_no_formats( - 'Video %s is no longer available' % video_id, expected=True) - - if not title: - title = self._html_search_regex( - r'<h1[^>]*>([^<]+)', webpage, 'title') - - return webpage, { - 'id': video_id, - 'display_id': display_id, - 'title': strip_or_none(title), - 'thumbnail': thumbnail, - 'duration': duration, - 'age_limit': 18, - 'formats': formats, - } - - def _real_extract(self, url): - webpage, info = self._extract_info(url, fatal=False) - if not info['formats']: - return self.url_result(url, 'Generic') - info['view_count'] = str_to_int(self._search_regex( - r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) - return info diff --git a/yt_dlp/extractor/kelbyone.py b/yt_dlp/extractor/kelbyone.py index 2ca9ad4261..0ac0c5eabc 100644 --- a/yt_dlp/extractor/kelbyone.py +++ b/yt_dlp/extractor/kelbyone.py @@ -3,6 +3,7 @@ class KelbyOneIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://members\.kelbyone\.com/course/(?P<id>[^$&?#/]+)' _TESTS = [{ @@ -23,7 +24,7 @@ class KelbyOneIE(InfoExtractor): 'duration': 90, 'upload_date': '20201001', }, - }] + }], }] def _entries(self, playlist): diff --git a/yt_dlp/extractor/ketnet.py b/yt_dlp/extractor/ketnet.py deleted file mode 100644 index ab6276727a..0000000000 --- a/yt_dlp/extractor/ketnet.py +++ /dev/null @@ -1,70 +0,0 @@ -from .canvas import CanvasIE -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - int_or_none, - parse_iso8601, -) - - -class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', - 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', - 'info_dict': { - 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', - 'ext': 'mp4', - 'title': 'Nachtwacht - Reeks 3: Aflevering 1', - 'description': 'De Nachtwacht krijgt te maken met een parasiet', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.02, - 'timestamp': 1609225200, - 'upload_date': '20201229', - 'series': 'Nachtwacht', - 'season': 'Reeks 3', - 'episode': 'De Greystook', - 'episode_number': 1, - }, - 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], - }, { - 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - video = self._download_json( - 'https://senior-bff.ketnet.be/graphql', display_id, query={ - 'query': '''{ - video(id: "content/ketnet/nl/%s.model.json") { - description - episodeNr - imageUrl - mediaReference - programTitle - publicationDate - seasonTitle - subtitleVideodetail - titleVideodetail - } -}''' % display_id, - })['data']['video'] - - mz_id = compat_urllib_parse_unquote(video['mediaReference']) - - return { - '_type': 'url_transparent', - 'id': mz_id, - 'title': video['titleVideodetail'], - 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, - 'thumbnail': video.get('imageUrl'), - 'description': video.get('description'), - 'timestamp': parse_iso8601(video.get('publicationDate')), - 'series': video.get('programTitle'), - 'season': video.get('seasonTitle'), - 'episode': video.get('subtitleVideodetail'), - 'episode_number': int_or_none(video.get('episodeNr')), - 'ie_key': CanvasIE.ie_key(), - } diff --git a/yt_dlp/extractor/khanacademy.py b/yt_dlp/extractor/khanacademy.py 
index 5333036a8b..3f03f9e4c4 100644 --- a/yt_dlp/extractor/khanacademy.py +++ b/yt_dlp/extractor/khanacademy.py @@ -3,43 +3,52 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + make_archive_id, parse_iso8601, - try_get, + str_or_none, + traverse_obj, + url_or_none, + urljoin, ) class KhanAcademyBaseIE(InfoExtractor): _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)' + _PUBLISHED_CONTENT_VERSION = '171419ab20465d931b356f22d20527f13969bb70' + def _parse_video(self, video): return { '_type': 'url_transparent', 'url': video['youtubeId'], - 'id': video.get('slug'), - 'title': video.get('title'), - 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'), - 'duration': int_or_none(video.get('duration')), - 'description': video.get('description'), + 'id': video['youtubeId'], 'ie_key': 'Youtube', + **traverse_obj(video, { + 'display_id': ('id', {str_or_none}), + 'title': ('translatedTitle', {str}), + 'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'description': ('description', {str}), + }, get_all=False), } def _real_extract(self, url): display_id = self._match_id(url) content = self._download_json( - 'https://www.khanacademy.org/api/internal/graphql/FetchContentData', - display_id, query={ + 'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id, + query={ 'fastly_cacheable': 'persist_until_publish', - 'hash': '4134764944', - 'lang': 'en', + 'pcv': self._PUBLISHED_CONTENT_VERSION, + 'hash': '1242644265', 'variables': json.dumps({ 'path': display_id, - 'queryParams': 'lang=en', - 'isModal': False, - 'followRedirects': True, 'countryCode': 'US', + 'kaLocale': 'en', + 'clientPublishedContentVersion': self._PUBLISHED_CONTENT_VERSION, }), - })['data']['contentJson'] - return self._parse_component_props(self._parse_json(content, display_id)['componentProps']) + 'lang': 'en', + })['data']['contentRoute']['listedPathData'] + return self._parse_component_props(content, display_id) class KhanAcademyIE(KhanAcademyBaseIE): @@ -47,64 +56,98 @@ class KhanAcademyIE(KhanAcademyBaseIE): _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/') _TEST = { 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad', - 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0', + 'md5': '1d5c2e70fa6aa29c38eca419f12515ce', 'info_dict': { 'id': 'FlIG3TvQCBQ', 'ext': 'mp4', 'title': 'The one-time pad', 'description': 'The perfect cipher', + 'display_id': '716378217', 'duration': 176, - 'uploader': 'Brit Cruise', - 'uploader_id': 'khanacademy', + 'uploader': 'Khan Academy', + 'uploader_id': '@khanacademy', + 'uploader_url': 'https://www.youtube.com/@khanacademy', 'upload_date': '20120411', 'timestamp': 1334170113, 'license': 'cc-by-nc-sa', + 'live_status': 'not_live', + 'channel': 'Khan Academy', + 'channel_id': 'UC4a-Gbdw7vOaccHmFo40b9g', + 'channel_url': 'https://www.youtube.com/channel/UC4a-Gbdw7vOaccHmFo40b9g', + 'channel_is_verified': True, + 'playable_in_embed': True, + 'categories': ['Education'], + 'creators': ['Brit Cruise'], + 'tags': [], + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': str, + 'view_count': int, + 'like_count': int, + 'heatmap': list, }, 'add_ie': ['Youtube'], } - def _parse_component_props(self, component_props): - video = component_props['tutorialPageData']['contentModel'] - info = self._parse_video(video) - author_names = video.get('authorNames') - 
info.update({ - 'uploader': ', '.join(author_names) if author_names else None, - 'timestamp': parse_iso8601(video.get('dateAdded')), - 'license': video.get('kaUserLicense'), - }) - return info + def _parse_component_props(self, component_props, display_id): + video = component_props['content'] + return { + **self._parse_video(video), + **traverse_obj(video, { + 'creators': ('authorNames', ..., {str}), + 'timestamp': ('dateAdded', {parse_iso8601}), + 'license': ('kaUserLicense', {str}), + }), + } class KhanAcademyUnitIE(KhanAcademyBaseIE): IE_NAME = 'khanacademy:unit' - _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)' - _TEST = { + _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('1,2', '')) + '/?(?:[?#&]|$)' + _TESTS = [{ 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography', 'info_dict': { - 'id': 'cryptography', + 'id': 'x48c910b6', 'title': 'Cryptography', 'description': 'How have humans protected their secret messages through history? What has changed today?', + 'display_id': 'computing/computer-science/cryptography', + '_old_archive_ids': ['khanacademyunit cryptography'], }, 'playlist_mincount': 31, - } + }, { + 'url': 'https://www.khanacademy.org/computing/computer-science', + 'info_dict': { + 'id': 'x301707a0', + 'title': 'Computer science theory', + 'description': 'md5:4b472a4646e6cf6ec4ccb52c4062f8ba', + 'display_id': 'computing/computer-science', + '_old_archive_ids': ['khanacademyunit computer-science'], + }, + 'playlist_mincount': 50, + }] - def _parse_component_props(self, component_props): - curation = component_props['curation'] + def _parse_component_props(self, component_props, display_id): + course = component_props['course'] + selected_unit = traverse_obj(course, ( + 'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course - entries = [] - tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or [] - for tutorial_number, tutorial in enumerate(tutorials, 1): - chapter_info = { - 'chapter': tutorial.get('title'), - 'chapter_number': tutorial_number, - 'chapter_id': tutorial.get('id'), - } - for content_item in (tutorial.get('contentItems') or []): - if content_item.get('kind') == 'Video': - info = self._parse_video(content_item) - info.update(chapter_info) - entries.append(info) + def build_entry(entry): + return self.url_result(urljoin( + 'https://www.khanacademy.org', entry['canonicalUrl']), + KhanAcademyIE, title=entry.get('translatedTitle')) + + entries = traverse_obj(selected_unit, ( + (('unitChildren', ...), None), 'allOrderedChildren', ..., 'curatedChildren', + lambda _, v: v['contentKind'] == 'Video' and v['canonicalUrl'], {build_entry})) return self.playlist_result( - entries, curation.get('unit'), curation.get('title'), - curation.get('description')) + entries, + display_id=display_id, + **traverse_obj(selected_unit, { + 'id': ('id', {str}), + 'title': ('translatedTitle', {str}), + 'description': ('translatedDescription', {str}), + '_old_archive_ids': ('slug', {str}, {lambda x: [make_archive_id(self, x)] if x else None}), + })) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index a79ffb7a98..1c1b2a1772 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -1,10 +1,14 @@ -from .common import InfoExtractor +import functools +from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, UserNotLive, + determine_ext, float_or_none, + int_or_none, merge_dicts, + 
parse_iso8601, str_or_none, traverse_obj, unified_timestamp, @@ -14,7 +18,8 @@ class KickBaseIE(InfoExtractor): def _real_initialize(self): - self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session') + self._request_webpage( + HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False, impersonate=True) xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') if not xsrf_token: self.write_debug('kick.com did not set XSRF-TOKEN cookie') @@ -25,103 +30,192 @@ def _real_initialize(self): def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): return self._download_json( - f'https://kick.com/api/v1/{path}', display_id, note=note, - headers=merge_dicts(headers, self._API_HEADERS), **kwargs) + f'https://kick.com/api/{path}', display_id, note=note, + headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs) class KickIE(KickBaseIE): - _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w_]+)' + IE_NAME = 'kick:live' + _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'https://kick.com/yuppy', + 'url': 'https://kick.com/buddha', 'info_dict': { - 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21', + 'id': '92722911-nopixel-40', 'ext': 'mp4', 'title': str, 'description': str, - 'channel': 'yuppy', - 'channel_id': '33538', - 'uploader': 'Yuppy', - 'uploader_id': '33793', - 'upload_date': str, - 'live_status': 'is_live', 'timestamp': int, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:https?://.+\.jpg', 'categories': list, + 'upload_date': str, + 'channel': 'buddha', + 'channel_id': '32807', + 'uploader': 'Buddha', + 'uploader_id': '33057', + 'live_status': 'is_live', + 'concurrent_view_count': int, + 'release_timestamp': int, + 'age_limit': 18, + 'release_date': str, }, - 'skip': 'livestream', + 'params': {'skip_download': 'livestream'}, + # 'skip': 'livestream', }, { - 'url': 'https://kick.com/kmack710', + 'url': 'https://kick.com/xqc', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if KickClipIE.suitable(url) else super().suitable(url) + def _real_extract(self, url): channel = self._match_id(url) - response = self._call_api(f'channels/{channel}', channel) + response = self._call_api(f'v2/channels/{channel}', channel) if not traverse_obj(response, 'livestream', expected_type=dict): raise UserNotLive(video_id=channel) return { - 'id': str(traverse_obj( - response, ('livestream', ('slug', 'id')), get_all=False, default=channel)), - 'formats': self._extract_m3u8_formats( - response['playback_url'], channel, 'mp4', live=True), - 'title': traverse_obj( - response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), - 'description': traverse_obj(response, ('user', 'bio')), 'channel': channel, - 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))), - 'uploader': traverse_obj(response, 'name', ('user', 'username')), - 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))), 'is_live': True, - 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))), - 'thumbnail': traverse_obj( - response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none), - 'categories': traverse_obj(response, ('recent_categories', ..., 'name')), + 'formats': self._extract_m3u8_formats(response['playback_url'], channel, 'mp4', 
live=True), + **traverse_obj(response, { + 'id': ('livestream', 'slug', {str}), + 'title': ('livestream', 'session_title', {str}), + 'description': ('user', 'bio', {str}), + 'channel_id': (('id', ('livestream', 'channel_id')), {int}, {str_or_none}, any), + 'uploader': (('name', ('user', 'username')), {str}, any), + 'uploader_id': (('user_id', ('user', 'id')), {int}, {str_or_none}, any), + 'timestamp': ('livestream', 'created_at', {unified_timestamp}), + 'release_timestamp': ('livestream', 'start_time', {unified_timestamp}), + 'thumbnail': ('livestream', 'thumbnail', 'url', {url_or_none}), + 'categories': ('recent_categories', ..., 'name', {str}), + 'concurrent_view_count': ('livestream', 'viewer_count', {int_or_none}), + 'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}), + }), } class KickVODIE(KickBaseIE): + IE_NAME = 'kick:vod' _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35', - 'md5': '73691206a6a49db25c5aa1588e6538fc', + 'url': 'https://kick.com/video/e74614f4-5270-4319-90ad-32179f19a45c', + 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': '54244b5e-050a-4df4-a013-b2433dafbe35', + 'id': 'e74614f4-5270-4319-90ad-32179f19a45c', 'ext': 'mp4', - 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links', - 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f', - 'channel': 'kmack710', - 'channel_id': '16278', - 'uploader': 'Kmack710', - 'uploader_id': '16412', - 'upload_date': '20221206', - 'timestamp': 1670318289, - 'duration': 40104.0, + 'title': r're:❎ MEGA DRAMA ❎ LIVE ❎ CLICK ❎ ULTIMATE SKILLS .+', + 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. 
LEADER OF THE JUICERS.', + 'channel': 'xqc', + 'channel_id': '668', + 'uploader': 'xQc', + 'uploader_id': '676', + 'upload_date': '20240724', + 'timestamp': 1721796562, + 'duration': 18566.0, 'thumbnail': r're:^https?://.*\.jpg', - 'categories': ['Grand Theft Auto V'], - }, - 'params': { - 'skip_download': 'm3u8', + 'view_count': int, + 'categories': ['VALORANT'], + 'age_limit': 0, }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): video_id = self._match_id(url) - response = self._call_api(f'video/{video_id}', video_id) + response = self._call_api(f'v1/video/{video_id}', video_id) return { 'id': video_id, 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'), - 'title': traverse_obj( - response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), - 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')), - 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')), - 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))), - 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')), - 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))), - 'timestamp': unified_timestamp(response.get('created_at')), - 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000), - 'thumbnail': traverse_obj( - response, ('livestream', 'thumbnail'), expected_type=url_or_none), - 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')), + **traverse_obj(response, { + 'title': ('livestream', ('session_title', 'slug'), {str}, any), + 'description': ('livestream', 'channel', 'user', 'bio', {str}), + 'channel': ('livestream', 'channel', 'slug', {str}), + 'channel_id': ('livestream', 'channel', 'id', {int}, {str_or_none}), + 'uploader': ('livestream', 'channel', 'user', 'username', {str}), + 'uploader_id': ('livestream', 'channel', 'user_id', {int}, {str_or_none}), + 'timestamp': ('created_at', {parse_iso8601}), + 'duration': ('livestream', 'duration', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('livestream', 'thumbnail', {url_or_none}), + 'categories': ('livestream', 'categories', ..., 'name', {str}), + 'view_count': ('views', {int_or_none}), + 'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}), + }), + } + + +class KickClipIE(KickBaseIE): + IE_NAME = 'kick:clips' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/?\?(?:[^#]+&)?clip=(?P<id>clip_[\w-]+)' + _TESTS = [{ + 'url': 'https://kick.com/mxddy?clip=clip_01GYXVB5Y8PWAPWCWMSBCFB05X', + 'info_dict': { + 'id': 'clip_01GYXVB5Y8PWAPWCWMSBCFB05X', + 'ext': 'mp4', + 'title': 'Maddy detains Abd D:', + 'channel': 'mxddy', + 'channel_id': '133789', + 'uploader': 'AbdCreates', + 'uploader_id': '3309077', + 'thumbnail': r're:^https?://.*\.jpeg', + 'duration': 35, + 'timestamp': 1682481453, + 'upload_date': '20230426', + 'view_count': int, + 'like_count': int, + 'categories': ['VALORANT'], + 'age_limit': 18, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://kick.com/destiny?clip=clip_01H9SKET879NE7N9RJRRDS98J3', + 'info_dict': { + 'id': 'clip_01H9SKET879NE7N9RJRRDS98J3', + 'title': 'W jews', + 'ext': 'mp4', + 'channel': 'destiny', + 'channel_id': '1772249', + 'uploader': 'punished_furry', + 'uploader_id': '2027722', + 'duration': 49.0, + 'upload_date': '20230908', + 'timestamp': 1694150180, + 'thumbnail': 
'https://clips.kick.com/clips/j3/clip_01H9SKET879NE7N9RJRRDS98J3/thumbnail.png', + 'view_count': int, + 'like_count': int, + 'categories': ['Just Chatting'], + 'age_limit': 0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + clip = self._call_api(f'v2/clips/{clip_id}/play', clip_id)['clip'] + clip_url = clip['clip_url'] + + if determine_ext(clip_url) == 'm3u8': + formats = self._extract_m3u8_formats(clip_url, clip_id, 'mp4') + else: + formats = [{'url': clip_url}] + + return { + 'id': clip_id, + 'formats': formats, + **traverse_obj(clip, { + 'title': ('title', {str}), + 'channel': ('channel', 'slug', {str}), + 'channel_id': ('channel', 'id', {int}, {str_or_none}), + 'uploader': ('creator', 'username', {str}), + 'uploader_id': ('creator', 'id', {int}, {str_or_none}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'duration': ('duration', {float_or_none}), + 'categories': ('category', 'name', {str}, all), + 'timestamp': ('created_at', {parse_iso8601}), + 'view_count': ('views', {int_or_none}), + 'like_count': ('likes', {int_or_none}), + 'age_limit': ('is_mature', {bool}, {lambda x: 18 if x else 0}), + }), } diff --git a/yt_dlp/extractor/kicker.py b/yt_dlp/extractor/kicker.py index a2c7dd4e83..4ab6751788 100644 --- a/yt_dlp/extractor/kicker.py +++ b/yt_dlp/extractor/kicker.py @@ -20,8 +20,8 @@ class KickerIE(InfoExtractor): 'age_limit': 0, 'thumbnail': r're:https://s\d+\.dmcdn\.net/v/T-x741YeYAx8aSZ0Z/x1080', 'tags': ['published', 'category.InternationalSoccer'], - 'upload_date': '20220608' - } + 'upload_date': '20220608', + }, }, { 'url': 'https://www.kicker.de/ex-unioner-in-der-bezirksliga-felix-kroos-vereinschallenge-in-pankow-902825/video', 'info_dict': { @@ -39,7 +39,7 @@ class KickerIE(InfoExtractor): 'uploader': 'kicker.de', 'description': 'md5:0c2060c899a91c8bf40f578f78c5846f', 'like_count': int, - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/kinja.py b/yt_dlp/extractor/kinja.py index df1386fb86..99c8a12247 100644 --- a/yt_dlp/extractor/kinja.py +++ b/yt_dlp/extractor/kinja.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) from ..utils import ( int_or_none, parse_iso8601, @@ -12,7 +10,7 @@ class KinjaEmbedIE(InfoExtractor): - IENAME = 'kinja:embed' + IE_NAME = 'kinja:embed' _DOMAIN_REGEX = r'''(?:[^.]+\.)? 
(?: avclub| @@ -32,7 +30,7 @@ class KinjaEmbedIE(InfoExtractor): ajax/inset| embed/video )/iframe\?.*?\bid=''' - _VALID_URL = r'''(?x)https?://%s%s + _VALID_URL = rf'''(?x)https?://{_DOMAIN_REGEX}{_COMMON_REGEX} (?P<type> fb| imgur| @@ -41,7 +39,6 @@ class KinjaEmbedIE(InfoExtractor): kinjavideo| mcp| megaphone| - ooyala| soundcloud(?:-playlist)?| tumblr-post| twitch-stream| @@ -50,7 +47,7 @@ class KinjaEmbedIE(InfoExtractor): vimeo| vine| youtube-(?:list|video) - )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + )-(?P<id>[^&]+)''' _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//{_DOMAIN_REGEX})?{_COMMON_REGEX}(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', @@ -61,9 +58,6 @@ class KinjaEmbedIE(InfoExtractor): }, { 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075', 'only_matching': True, - }, { - 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5', - 'only_matching': True, }, { 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047', 'only_matching': True, @@ -103,7 +97,6 @@ class KinjaEmbedIE(InfoExtractor): 'jwplayer-video': _JWPLATFORM_PROVIDER, 'jwp-video': _JWPLATFORM_PROVIDER, 'megaphone': ('player.megaphone.fm/', 'Generic'), - 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'), 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'), 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'), 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'), @@ -121,7 +114,7 @@ def _real_extract(self, url): provider = self._PROVIDER_MAP.get(video_type) if provider: - video_id = compat_urllib_parse_unquote(video_id) + video_id = urllib.parse.unquote(video_id) if video_type == 'tumblr-post': video_id, blog = video_id.split('-', 1) result_url = provider[0] % (blog, video_id) @@ -129,8 +122,6 @@ def _real_extract(self, url): video_id, playlist_id = video_id.split('/') result_url = provider[0] % (video_id, playlist_id) else: - if video_type == 'ooyala': - video_id = video_id.split('/')[0] result_url = provider[0] + video_id return self.url_result('http://' + result_url, provider[1]) @@ -152,7 +143,7 @@ def _real_extract(self, url): poster = data.get('poster') or {} poster_id = poster.get('id') if poster_id: - thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg') + thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/{}.{}'.format(poster_id, poster.get('format') or 'jpg') return { 'id': video_id, @@ -197,10 +188,10 @@ def _real_extract(self, url): return { 'id': video_id, 'title': title, - 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str), + 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], str), 'uploader': fmg.get('network'), 'duration': int_or_none(iptc.get('fileDuration')), 'formats': formats, - 'description': try_get(iptc, lambda x: x['description']['en'], compat_str), + 'description': try_get(iptc, lambda x: x['description']['en'], str), 'timestamp': parse_iso8601(iptc.get('dateReleased')), } diff --git a/yt_dlp/extractor/kommunetv.py b/yt_dlp/extractor/kommunetv.py index e21e556be3..5ec33a9ff3 100644 --- a/yt_dlp/extractor/kommunetv.py +++ b/yt_dlp/extractor/kommunetv.py @@ -3,23 +3,23 @@ class KommunetvIE(InfoExtractor): - _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P<id>\w+)' + _VALID_URL = r'https?://\w+\.kommunetv\.no/archive/(?P<id>\w+)' _TEST = { 
'url': 'https://oslo.kommunetv.no/archive/921', 'md5': '5f102be308ee759be1e12b63d5da4bbc', 'info_dict': { 'id': '921', 'title': 'Bystyremøte', - 'ext': 'mp4' - } + 'ext': 'mp4', + }, } def _real_extract(self, url): video_id = self._match_id(url) headers = { - 'Accept': 'application/json' + 'Accept': 'application/json', } - data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + data = self._download_json(f'https://oslo.kommunetv.no/api/streams?streamType=1&id={video_id}', video_id, headers=headers) title = data['stream']['title'] file = data['playlist'][0]['playlist'][0]['file'] url = update_url(file, query=None, fragment=None) @@ -27,5 +27,5 @@ def _real_extract(self, url): return { 'id': video_id, 'formats': formats, - 'title': title + 'title': title, } diff --git a/yt_dlp/extractor/kompas.py b/yt_dlp/extractor/kompas.py index 8bad961906..2ef076c1ef 100644 --- a/yt_dlp/extractor/kompas.py +++ b/yt_dlp/extractor/kompas.py @@ -16,7 +16,7 @@ class KompasVideoIE(JixieBaseIE): 'categories': ['news'], 'thumbnail': 'https://video.jixie.media/1001/164474/164474_1280x720.jpg', 'tags': 'count:9', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/konserthusetplay.py b/yt_dlp/extractor/konserthusetplay.py deleted file mode 100644 index 10767f1b69..0000000000 --- a/yt_dlp/extractor/konserthusetplay.py +++ /dev/null @@ -1,119 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - url_or_none, -) - - -class KonserthusetPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:konserthusetplay|rspoplay)\.se/\?.*\bm=(?P<id>[^&]+)' - _TESTS = [{ - 'url': 'http://www.konserthusetplay.se/?m=CKDDnlCY-dhWAAqiMERd-A', - 'md5': 'e3fd47bf44e864bd23c08e487abe1967', - 'info_dict': { - 'id': 'CKDDnlCY-dhWAAqiMERd-A', - 'ext': 'mp4', - 'title': 'Orkesterns instrument: Valthornen', - 'description': 'md5:f10e1f0030202020396a4d712d2fa827', - 'thumbnail': 're:^https?://.*$', - 'duration': 398.76, - }, - }, { - 'url': 'http://rspoplay.se/?m=elWuEH34SMKvaO4wO_cHBw', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - e = self._search_regex( - r'https?://csp\.picsearch\.com/rest\?.*\be=(.+?)[&"\']', webpage, 'e') - - rest = self._download_json( - 'http://csp.picsearch.com/rest?e=%s&containerId=mediaplayer&i=object' % e, - video_id, transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) - - media = rest['media'] - player_config = media['playerconfig'] - playlist = player_config['playlist'] - - source = next(f for f in playlist if f.get('bitrates') or f.get('provider')) - - FORMAT_ID_REGEX = r'_([^_]+)_h264m\.mp4' - - formats = [] - - m3u8_url = source.get('url') - if m3u8_url and determine_ext(m3u8_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - - fallback_url = source.get('fallbackUrl') - fallback_format_id = None - if fallback_url: - fallback_format_id = self._search_regex( - FORMAT_ID_REGEX, fallback_url, 'format id', default=None) - - connection_url = (player_config.get('rtmp', {}).get( - 'netConnectionUrl') or player_config.get( - 'plugins', {}).get('bwcheck', {}).get('netConnectionUrl')) - if connection_url: - for f in source['bitrates']: - video_url = f.get('url') - if not video_url: - continue - format_id = self._search_regex( - FORMAT_ID_REGEX, 
video_url, 'format id', default=None) - f_common = { - 'vbr': int_or_none(f.get('bitrate')), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - } - f = f_common.copy() - f.update({ - 'url': connection_url, - 'play_path': video_url, - 'format_id': 'rtmp-%s' % format_id if format_id else 'rtmp', - 'ext': 'flv', - }) - formats.append(f) - if format_id and format_id == fallback_format_id: - f = f_common.copy() - f.update({ - 'url': fallback_url, - 'format_id': 'http-%s' % format_id if format_id else 'http', - }) - formats.append(f) - - if not formats and fallback_url: - formats.append({ - 'url': fallback_url, - }) - - title = player_config.get('title') or media['title'] - description = player_config.get('mediaInfo', {}).get('description') - thumbnail = media.get('image') - duration = float_or_none(media.get('duration'), 1000) - - subtitles = {} - captions = source.get('captionsAvailableLanguages') - if isinstance(captions, dict): - for lang, subtitle_url in captions.items(): - subtitle_url = url_or_none(subtitle_url) - if lang != 'none' and subtitle_url: - subtitles.setdefault(lang, []).append({'url': subtitle_url}) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/yt_dlp/extractor/koo.py b/yt_dlp/extractor/koo.py index 9cfec5eb95..6ec5b59f9a 100644 --- a/yt_dlp/extractor/koo.py +++ b/yt_dlp/extractor/koo.py @@ -6,6 +6,7 @@ class KooIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?kooapp\.com/koo/[^/]+/(?P<id>[^/&#$?]+)' _TESTS = [{ # Test for video in the comments 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde', @@ -18,9 +19,9 @@ class KooIE(InfoExtractor): 'uploader_id': 'ytdlpTestAccount', 'uploader': 'yt-dlpTestAccount', 'duration': 7000, - 'upload_date': '20210921' + 'upload_date': '20210921', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { # Test for koo with long title 'url': 'https://www.kooapp.com/koo/laxman_kumarDBFEC/33decbf7-5e1e-4bb8-bfd7-04744a064361', 'info_dict': { @@ -32,9 +33,9 @@ class KooIE(InfoExtractor): 'uploader_id': 'laxman_kumarDBFEC', 'uploader': 'Laxman Kumar 🇮🇳', 'duration': 46000, - 'upload_date': '20210920' + 'upload_date': '20210920', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { # Test for audio 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a2a9c88e-ce4b-4d2d-952f-d06361c5b602', 'info_dict': { @@ -46,9 +47,9 @@ class KooIE(InfoExtractor): 'uploader_id': 'ytdlpTestAccount', 'uploader': 'yt-dlpTestAccount', 'duration': 214000, - 'upload_date': '20210921' + 'upload_date': '20210921', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { # Test for video 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1', 'info_dict': { @@ -60,9 +61,9 @@ class KooIE(InfoExtractor): 'uploader_id': 'ytdlpTestAccount', 'uploader': 'yt-dlpTestAccount', 'duration': 14000, - 'upload_date': '20210921' + 'upload_date': '20210921', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { # Test for link 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/01bf5b94-81a5-4d8e-a387-5f732022e15a', 'skip': 'No video/audio found at the provided url.', @@ -82,10 +83,11 @@ class KooIE(InfoExtractor): }] def _real_extract(self, url): - id = self._match_id(url) - data_json = 
self._download_json(f'https://www.kooapp.com/apiV1/ku/{id}?limit=20&offset=0&showSimilarKoos=true', id)['parentContent'] + video_id = self._match_id(url) + data_json = self._download_json( + f'https://www.kooapp.com/apiV1/ku/{video_id}?limit=20&offset=0&showSimilarKoos=true', video_id)['parentContent'] item_json = next(content['items'][0] for content in data_json - if try_get(content, lambda x: x['items'][0]['id']) == id) + if try_get(content, lambda x: x['items'][0]['id']) == video_id) media_json = item_json['mediaMap'] formats = [] @@ -97,12 +99,12 @@ def _real_extract(self, url): 'ext': 'mp4', }) if video_m3u8_url: - formats.extend(self._extract_m3u8_formats(video_m3u8_url, id, fatal=False, ext='mp4')) + formats.extend(self._extract_m3u8_formats(video_m3u8_url, video_id, fatal=False, ext='mp4')) if not formats: self.raise_no_formats('No video/audio found at the provided url.', expected=True) return { - 'id': id, + 'id': video_id, 'title': clean_html(item_json.get('title')), 'description': f'{clean_html(item_json.get("title"))}\n\n{clean_html(item_json.get("enTransliteration"))}', 'timestamp': item_json.get('createdAt'), diff --git a/yt_dlp/extractor/krasview.py b/yt_dlp/extractor/krasview.py index 4323aa4292..0febf759ba 100644 --- a/yt_dlp/extractor/krasview.py +++ b/yt_dlp/extractor/krasview.py @@ -8,6 +8,7 @@ class KrasViewIE(InfoExtractor): + _WORKING = False IE_DESC = 'Красвью' _VALID_URL = r'https?://krasview\.ru/(?:video|embed)/(?P<id>\d+)' diff --git a/yt_dlp/extractor/kth.py b/yt_dlp/extractor/kth.py index e17c6db912..76899fdb8c 100644 --- a/yt_dlp/extractor/kth.py +++ b/yt_dlp/extractor/kth.py @@ -16,13 +16,12 @@ class KTHIE(InfoExtractor): 'timestamp': 1647345358, 'upload_date': '20220315', 'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f', - } + }, } def _real_extract(self, url): video_id = self._match_id(url) - result = self.url_result( - smuggle_url('kaltura:308:%s' % video_id, { + return self.url_result( + smuggle_url(f'kaltura:308:{video_id}', { 'service_url': 'https://api.kaltura.nordu.net'}), 'Kaltura') - return result diff --git a/yt_dlp/extractor/ku6.py b/yt_dlp/extractor/ku6.py index 31b4ea0c60..00e814cc01 100644 --- a/yt_dlp/extractor/ku6.py +++ b/yt_dlp/extractor/ku6.py @@ -10,7 +10,7 @@ class Ku6IE(InfoExtractor): 'id': 'JG-8yS14xzBr4bCn1pu0xw', 'ext': 'f4v', 'title': 'techniques test', - } + }, } def _real_extract(self, url): @@ -19,12 +19,12 @@ def _real_extract(self, url): title = self._html_search_regex( r'<h1 title=.*>(.*?)</h1>', webpage, 'title') - dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id - jsonData = self._download_json(dataUrl, video_id) - downloadUrl = jsonData['data']['f'] + data_url = f'http://v.ku6.com/fetchVideo4Player/{video_id}.html' + json_data = self._download_json(data_url, video_id) + download_url = json_data['data']['f'] return { 'id': video_id, 'title': title, - 'url': downloadUrl + 'url': download_url, } diff --git a/yt_dlp/extractor/kukululive.py b/yt_dlp/extractor/kukululive.py new file mode 100644 index 0000000000..86ab5d40ec --- /dev/null +++ b/yt_dlp/extractor/kukululive.py @@ -0,0 +1,140 @@ +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + filter_dict, + get_element_by_id, + int_or_none, + join_nonempty, + js_to_json, + qualities, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class KukuluLiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.erinn\.biz/live\.php\?h(?P<id>\d+)' + _TESTS = [{ + 'url': 
'https://live.erinn.biz/live.php?h675134569', + 'md5': 'e380fa6a47fc703d91cea913ab44ec2e', + 'info_dict': { + 'id': '675134569', + 'ext': 'mp4', + 'title': 'プロセカ', + 'description': 'テストも兼ねたプロセカ配信。', + 'timestamp': 1702689148, + 'upload_date': '20231216', + 'thumbnail': r're:^https?://.*', + }, + }, { + 'url': 'https://live.erinn.biz/live.php?h102338092', + 'md5': 'dcf5167a934b1c60333461e13a81a6e2', + 'info_dict': { + 'id': '102338092', + 'ext': 'mp4', + 'title': 'Among Usで遊びます!!', + 'description': 'VTuberになりましたねんねこ㌨ですよろしくお願いします', + 'timestamp': 1704603118, + 'upload_date': '20240107', + 'thumbnail': r're:^https?://.*', + }, + }, { + 'url': 'https://live.erinn.biz/live.php?h878049531', + 'only_matching': True, + }] + + def _get_quality_meta(self, video_id, desc, code, force_h264=None): + desc += ' (force_h264)' if force_h264 else '' + qs = self._download_webpage( + 'https://live.erinn.biz/live.player.fplayer.php', video_id, + f'Downloading {desc} quality metadata', f'Unable to download {desc} quality metadata', + query=filter_dict({ + 'hash': video_id, + 'action': f'get{code}liveByAjax', + 'force_h264': force_h264, + })) + return urllib.parse.parse_qs(qs) + + def _add_quality_formats(self, formats, quality_meta): + vcodec = traverse_obj(quality_meta, ('vcodec', 0, {str})) + quality = traverse_obj(quality_meta, ('now_quality', 0, {str})) + quality_priority = qualities(('low', 'h264', 'high'))(quality) + if traverse_obj(quality_meta, ('hlsaddr', 0, {url_or_none})): + formats.append({ + 'format_id': quality, + 'url': quality_meta['hlsaddr'][0], + 'ext': 'mp4', + 'vcodec': vcodec, + 'quality': quality_priority, + }) + if traverse_obj(quality_meta, ('hlsaddr_audioonly', 0, {url_or_none})): + formats.append({ + 'format_id': join_nonempty(quality, 'audioonly'), + 'url': quality_meta['hlsaddr_audioonly'][0], + 'ext': 'm4a', + 'vcodec': 'none', + 'quality': quality_priority, + }) + + def _real_extract(self, url): + video_id = self._match_id(url) + html = self._download_webpage(url, video_id) + + if '>タイムシフトが見つかりませんでした。<' in html: + raise ExtractorError('This stream has expired', expected=True) + + title = clean_html( + get_element_by_id('livetitle', html.replace('<SPAN', '<span').replace('SPAN>', 'span>'))) + description = self._html_search_meta('Description', html) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], html) + + if self._search_regex(r'(var\s+timeshift\s*=\s*false)', html, 'is livestream', default=False): + formats = [] + for (desc, code) in [('high', 'Z'), ('low', 'ForceLow')]: + quality_meta = self._get_quality_meta(video_id, desc, code) + self._add_quality_formats(formats, quality_meta) + if desc == 'high' and traverse_obj(quality_meta, ('vcodec', 0)) == 'HEVC': + self._add_quality_formats( + formats, self._get_quality_meta(video_id, desc, code, force_h264='1')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'is_live': True, + 'formats': formats, + } + + # VOD extraction + player_html = self._download_webpage( + 'https://live.erinn.biz/live.timeshift.fplayer.php', video_id, + 'Downloading player html', 'Unable to download player html', query={'hash': video_id}) + + sources = traverse_obj(self._search_json( + r'var\s+fplayer_source\s*=', player_html, 'stream data', video_id, + contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json), lambda _, v: v['file']) + + def entries(segments, playlist=True): + for i, segment in enumerate(segments, 1): + yield { + 'id': f'{video_id}_{i}' if playlist else video_id, 
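# A condensed sketch of the pattern this generator implements: when a timeshift
# consists of several segments, number each part and emit a multi_video
# playlist, otherwise return a single plain entry. Names below are illustrative
# helpers, not part of the extractor:
def entries_sketch(video_id, title, files, playlist=True):
    for i, f in enumerate(files, 1):
        yield {
            'id': f'{video_id}_{i}' if playlist else video_id,
            'title': f'{title} (Part {i})' if playlist else title,
            'url': f,
        }

items = list(entries_sketch('675134569', 'プロセカ', ['a.m3u8', 'b.m3u8']))
assert items[0]['id'] == '675134569_1'
assert items[1]['title'].endswith('(Part 2)')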
+ 'title': f'{title} (Part {i})' if playlist else title, + 'description': description, + 'timestamp': traverse_obj(segment, ('time_start', {int_or_none})), + 'thumbnail': thumbnail, + 'formats': [{ + 'url': urljoin('https://live.erinn.biz', segment['file']), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + if len(sources) == 1: + return next(entries(sources, playlist=False)) + + return self.playlist_result(entries(sources), video_id, title, description, multi_video=True) diff --git a/yt_dlp/extractor/kusi.py b/yt_dlp/extractor/kusi.py deleted file mode 100644 index a23ad8945d..0000000000 --- a/yt_dlp/extractor/kusi.py +++ /dev/null @@ -1,83 +0,0 @@ -import random -import urllib.parse - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - timeconvert, - update_url_query, - xpath_text, -) - - -class KUSIIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' - _TESTS = [{ - 'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', - 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', - 'info_dict': { - 'id': '12689020', - 'ext': 'mp4', - 'title': "Turko Files: Refused to Help, It Ain't Right!", - 'duration': 223.586, - 'upload_date': '20160826', - 'timestamp': 1472233118, - 'thumbnail': r're:^https?://.*\.jpg$' - }, - }, { - 'url': 'http://kusi.com/video?clipId=12203019', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - clip_id = mobj.group('clipId') - video_id = clip_id or mobj.group('path') - - webpage = self._download_webpage(url, video_id) - - if clip_id is None: - video_id = clip_id = self._html_search_regex( - r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') - - affiliate_id = self._search_regex( - r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') - - # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf - xml_url = update_url_query('http://www.kusi.com/build.asp', { - 'buildtype': 'buildfeaturexmlrequest', - 'featureType': 'Clip', - 'featureid': clip_id, - 'affiliateno': affiliate_id, - 'clientgroupid': '1', - 'rnd': int(round(random.random() * 1000000)), - }) - - doc = self._download_xml(xml_url, video_id) - - video_title = xpath_text(doc, 'HEADLINE', fatal=True) - duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) - description = xpath_text(doc, 'ABSTRACT') - thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') - creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) - - quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') - formats = [] - for quality in quality_options: - formats.append({ - 'url': urllib.parse.unquote_plus(quality.attrib['url']), - 'height': int_or_none(quality.attrib.get('height')), - 'width': int_or_none(quality.attrib.get('width')), - 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), - }) - - return { - 'id': video_id, - 'title': video_title, - 'description': description, - 'duration': duration, - 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': creation_time, - } diff --git a/yt_dlp/extractor/kuwo.py b/yt_dlp/extractor/kuwo.py index cfec1c50f6..80b6b55f1a 100644 --- a/yt_dlp/extractor/kuwo.py +++ b/yt_dlp/extractor/kuwo.py @@ -1,12 +1,12 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( - get_element_by_id, - clean_html, ExtractorError, InAdvancePagedList, + 
clean_html, + get_element_by_id, remove_start, ) @@ -18,7 +18,7 @@ class KuwoBaseIE(InfoExtractor): {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, {'format': 'wma', 'ext': 'wma', 'preference': 20}, - {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} + {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}, ] def _get_formats(self, song_id, tolerate_ip_deny=False): @@ -27,21 +27,21 @@ def _get_formats(self, song_id, tolerate_ip_deny=False): query = { 'format': file_format['ext'], 'br': file_format.get('br', ''), - 'rid': 'MUSIC_%s' % song_id, + 'rid': f'MUSIC_{song_id}', 'type': 'convert_url', - 'response': 'url' + 'response': 'url', } song_url = self._download_webpage( 'http://antiserver.kuwo.cn/anti.s', - song_id, note='Download %s url info' % file_format['format'], + song_id, note='Download {} url info'.format(file_format['format']), query=query, headers=self.geo_verification_headers(), ) if song_url == 'IPDeny' and not tolerate_ip_deny: raise ExtractorError('This song is blocked in this region', expected=True) - if song_url.startswith('http://') or song_url.startswith('https://'): + if song_url.startswith(('http://', 'https://')): formats.append({ 'url': song_url, 'format_id': file_format['format'], @@ -54,6 +54,7 @@ def _get_formats(self, song_id, tolerate_ip_deny=False): class KuwoIE(KuwoBaseIE): + _WORKING = False IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)' @@ -65,7 +66,7 @@ class KuwoIE(KuwoBaseIE): 'title': '爱我别走', 'creator': '张震岳', 'upload_date': '20080122', - 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' + 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c', }, 'skip': 'this song has been offline because of copyright issues', }, { @@ -91,7 +92,7 @@ def _real_extract(self, url): webpage, urlh = self._download_webpage_handle( url, song_id, note='Download song detail info', errnote='Unable to get song detail info') - if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: + if song_id not in urlh.url or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: raise ExtractorError('this song has been offline because of copyright issues', expected=True) song_name = self._html_search_regex( @@ -112,7 +113,7 @@ def _real_extract(self, url): publish_time = None if album_id is not None: album_info_page = self._download_webpage( - 'http://www.kuwo.cn/album/%s/' % album_id, song_id, + f'http://www.kuwo.cn/album/{album_id}/', song_id, note='Download album detail info', errnote='Unable to get album detail info') @@ -133,6 +134,7 @@ def _real_extract(self, url): class KuwoAlbumIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/' @@ -158,7 +160,7 @@ def _real_extract(self, url): 'album name') album_intro = remove_start( clean_html(get_element_by_id('intro', webpage)), - '%s简介:' % album_name) + f'{album_name}简介:') entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( @@ -169,6 +171,7 @@ def _real_extract(self, url): class KuwoChartIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' @@ -194,6 +197,7 @@ def _real_extract(self, url): class KuwoSingerIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' _VALID_URL = 
r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)' @@ -234,12 +238,12 @@ def _real_extract(self, url): def page_func(page_num): webpage = self._download_webpage( 'http://www.kuwo.cn/artist/contentMusicsAjax', - singer_id, note='Download song list page #%d' % (page_num + 1), - errnote='Unable to get song list page #%d' % (page_num + 1), + singer_id, note=f'Download song list page #{page_num + 1}', + errnote=f'Unable to get song list page #{page_num + 1}', query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) return [ - self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') + self.url_result(urllib.parse.urljoin(url, song_url), 'Kuwo') for song_url in re.findall( r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', webpage) @@ -251,6 +255,7 @@ def page_func(page_num): class KuwoCategoryIE(InfoExtractor): + _WORKING = False IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' @@ -275,7 +280,7 @@ def _real_extract(self, url): category_desc = remove_start( get_element_by_id('intro', webpage).strip(), - '%s简介:' % category_name) + f'{category_name}简介:') if category_desc == '暂无': category_desc = None @@ -283,13 +288,14 @@ def _real_extract(self, url): r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) entries = [ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo') + self.url_result('http://www.kuwo.cn/yinyue/{}/'.format(song['musicrid']), 'Kuwo') for song in jsonm['musiclist'] ] return self.playlist_result(entries, category_id, category_name, category_desc) class KuwoMvIE(KuwoBaseIE): + _WORKING = False IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/' @@ -308,16 +314,16 @@ class KuwoMvIE(KuwoBaseIE): 'format': 'mv', }, } - _FORMATS = KuwoBaseIE._FORMATS + [ + _FORMATS = [ + *KuwoBaseIE._FORMATS, {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, - {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, - ] + {'format': 'mp4', 'ext': 'mp4', 'preference': 200}] def _real_extract(self, url): song_id = self._match_id(url) webpage = self._download_webpage( - url, song_id, note='Download mv detail info: %s' % song_id, - errnote='Unable to get mv detail info: %s' % song_id) + url, song_id, note=f'Download mv detail info: {song_id}', + errnote=f'Unable to get mv detail info: {song_id}') mobj = re.search( r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"', @@ -331,8 +337,8 @@ def _real_extract(self, url): formats = self._get_formats(song_id, tolerate_ip_deny=True) mv_url = self._download_webpage( - 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, - song_id, note='Download %s MV URL' % song_id) + f'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_{song_id}', + song_id, note=f'Download {song_id} MV URL') formats.append({ 'url': mv_url, 'format_id': 'mv', diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py index 36bfaf5c30..20a5235f22 100644 --- a/yt_dlp/extractor/la7.py +++ b/yt_dlp/extractor/la7.py @@ -1,13 +1,8 @@ import re from .common import InfoExtractor -from ..utils import ( - float_or_none, - HEADRequest, - int_or_none, - parse_duration, - unified_strdate, -) +from ..networking import HEADRequest +from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate class LA7IE(InfoExtractor): @@ -98,7 +93,7 @@ def _real_extract(self, url): 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 
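> Note: the la7.py import hunk reflects a tree-wide move: `HEADRequest` now lives in `yt_dlp.networking` rather than `yt_dlp.utils` (the old location survives only as a deprecated alias). A quick check of the new import path, assuming yt-dlp is installed:

```python
from yt_dlp.networking import HEADRequest

# HEADRequest is a Request subclass whose method is pinned to HEAD; the
# extractor uses it to resolve redirects without downloading the body.
req = HEADRequest('https://example.com/stream')  # hypothetical URL
print(req.method)  # 'HEAD'
```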
'formats': formats, - 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)) + 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)), } @@ -213,9 +208,9 @@ class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete I 'url': 'https://www.la7.it/propagandalive/podcast', 'info_dict': { 'id': 'propagandalive', - 'title': "Propaganda Live", + 'title': 'Propaganda Live', }, - 'playlist_count_min': 10, + 'playlist_mincount': 10, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/laola1tv.py b/yt_dlp/extractor/laola1tv.py deleted file mode 100644 index 416dd7eb4e..0000000000 --- a/yt_dlp/extractor/laola1tv.py +++ /dev/null @@ -1,261 +0,0 @@ -import json -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate, - urlencode_postdata, - xpath_element, - xpath_text, - update_url_query, - js_to_json, -) - - -class Laola1TvEmbedIE(InfoExtractor): - IE_NAME = 'laola1tv:embed' - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' - _TESTS = [{ - # flashvars.premium = "false"; - 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', - 'info_dict': { - 'id': '708065', - 'ext': 'mp4', - 'title': 'MA Long CHN - FAN Zhendong CHN', - 'uploader': 'ITTF - International Table Tennis Federation', - 'upload_date': '20161211', - }, - }] - - def _extract_token_url(self, stream_access_url, video_id, data): - return self._download_json( - self._proto_relative_url(stream_access_url, 'https:'), video_id, - headers={ - 'Content-Type': 'application/json', - }, data=json.dumps(data).encode())['data']['stream-access'][0] - - def _extract_formats(self, token_url, video_id): - token_doc = self._download_xml( - token_url, video_id, 'Downloading token', - headers=self.geo_verification_headers()) - - token_attrib = xpath_element(token_doc, './/token').attrib - - if token_attrib['status'] != '0': - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) - - formats = self._extract_akamai_formats( - '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), - video_id) - return formats - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - flash_vars = self._search_regex( - r'(?s)flashvars\s*=\s*({.+?});', webpage, 'flash vars') - - def get_flashvar(x, *args, **kwargs): - flash_var = self._search_regex( - r'%s\s*:\s*"([^"]+)"' % x, - flash_vars, x, default=None) - if not flash_var: - flash_var = self._search_regex([ - r'flashvars\.%s\s*=\s*"([^"]+)"' % x, - r'%s\s*=\s*"([^"]+)"' % x], - webpage, x, *args, **kwargs) - return flash_var - - hd_doc = self._download_xml( - 'http://www.laola1.tv/server/hd_video.php', video_id, query={ - 'play': get_flashvar('streamid'), - 'partner': get_flashvar('partnerid'), - 'portal': get_flashvar('portalid'), - 'lang': get_flashvar('sprache'), - 'v5ident': '', - }) - - _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) - title = _v('title', fatal=True) - - token_url = None - premium = get_flashvar('premium', default=None) - if premium: - token_url = update_url_query( - _v('url', fatal=True), { - 'timestamp': get_flashvar('timestamp'), - 'auth': get_flashvar('auth'), - }) - else: - data_abo = urlencode_postdata( - dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) - stream_access_url = update_url_query( - 
'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', { - 'videoId': _v('id'), - 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), - 'label': _v('label'), - 'area': _v('area'), - }) - token_url = self._extract_token_url(stream_access_url, video_id, data_abo) - - formats = self._extract_formats(token_url, video_id) - - categories_str = _v('meta_sports') - categories = categories_str.split(',') if categories_str else [] - is_live = _v('islive') == 'true' - - return { - 'id': video_id, - 'title': title, - 'upload_date': unified_strdate(_v('time_date')), - 'uploader': _v('meta_organisation'), - 'categories': categories, - 'is_live': is_live, - 'formats': formats, - } - - -class Laola1TvBaseIE(Laola1TvEmbedIE): # XXX: Do not subclass from concrete IE - def _extract_video(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - if 'Dieser Livestream ist bereits beendet.' in webpage: - raise ExtractorError('This live stream has already finished.', expected=True) - - conf = self._parse_json(self._search_regex( - r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), - display_id, - transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s))) - video_id = conf['videoid'] - - config = self._download_json(conf['configUrl'], video_id, query={ - 'videoid': video_id, - 'partnerid': conf['partnerid'], - 'language': conf.get('language', ''), - 'portal': conf.get('portalid', ''), - }) - error = config.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_data = config['video'] - title = video_data['title'] - is_live = video_data.get('isLivestream') and video_data.get('isLive') - meta = video_data.get('metaInformation') - sports = meta.get('sports') - categories = sports.split(',') if sports else [] - - token_url = self._extract_token_url( - video_data['streamAccess'], video_id, - video_data['abo']['required']) - - formats = self._extract_formats(token_url, video_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('image'), - 'categories': categories, - 'formats': formats, - 'is_live': is_live, - } - - -class Laola1TvIE(Laola1TvBaseIE): - IE_NAME = 'laola1tv' - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', - 'info_dict': { - 'id': '227883', - 'display_id': 'straubing-tigers-koelner-haie', - 'ext': 'flv', - 'title': 'Straubing Tigers - Kölner Haie', - 'upload_date': '20140912', - 'is_live': False, - 'categories': ['Eishockey'], - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', - 'info_dict': { - 'id': '464602', - 'display_id': 'straubing-tigers-koelner-haie', - 'ext': 'flv', - 'title': 'Straubing Tigers - Kölner Haie', - 'upload_date': '20160129', - 'is_live': False, - 'categories': ['Eishockey'], - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', - 'info_dict': { - 'id': '487850', - 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', - 'ext': 'flv', - 'title': 'Belogorie BELGOROD - TRENTINO Diatec', - 'upload_date': '20160322', - 'uploader': 'CEV - Europäischer Volleyball Verband', - 'is_live': True, - 
'categories': ['Volleyball'], - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This live stream has already finished.', - }] - - def _real_extract(self, url): - return self._extract_video(url) - - -class EHFTVIE(Laola1TvBaseIE): - IE_NAME = 'ehftv' - _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', - 'info_dict': { - 'id': '1166761', - 'display_id': 'paris-saint-germain-handball-pge-vive-kielce', - 'ext': 'mp4', - 'title': 'Paris Saint-Germain Handball - PGE Vive Kielce', - 'is_live': False, - 'categories': ['Handball'], - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - return self._extract_video(url) - - -class ITTFIE(InfoExtractor): - _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - update_url_query('https://www.laola1.tv/titanplayer.php', { - 'videoid': self._match_id(url), - 'type': 'V', - 'lang': 'en', - 'portal': 'int', - 'customer': 1024, - }), Laola1TvEmbedIE.ie_key()) diff --git a/yt_dlp/extractor/laracasts.py b/yt_dlp/extractor/laracasts.py new file mode 100644 index 0000000000..4494c4b79a --- /dev/null +++ b/yt_dlp/extractor/laracasts.py @@ -0,0 +1,114 @@ +import json + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + clean_html, + extract_attributes, + get_element_html_by_id, + int_or_none, + parse_duration, + str_or_none, + unified_strdate, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj + + +class LaracastsBaseIE(InfoExtractor): + def _get_prop_data(self, url, display_id): + webpage = self._download_webpage(url, display_id) + return traverse_obj( + get_element_html_by_id('app', webpage), + ({extract_attributes}, 'data-page', {json.loads}, 'props')) + + def _parse_episode(self, episode): + if not traverse_obj(episode, 'vimeoId'): + self.raise_login_required('This video is only available for subscribers.') + return self.url_result( + VimeoIE._smuggle_referrer( + f'https://player.vimeo.com/video/{episode["vimeoId"]}', 'https://laracasts.com/'), + VimeoIE, url_transparent=True, + **traverse_obj(episode, { + 'id': ('id', {int}, {str_or_none}), + 'webpage_url': ('path', {lambda x: urljoin('https://laracasts.com', x)}), + 'title': ('title', {clean_html}), + 'season_number': ('chapter', {int_or_none}), + 'episode_number': ('position', {int_or_none}), + 'description': ('body', {clean_html}), + 'thumbnail': ('largeThumbnail', {url_or_none}), + 'duration': ('length', {int_or_none}), + 'date': ('dateSegments', 'published', {unified_strdate}), + })) + + +class LaracastsIE(LaracastsBaseIE): + IE_NAME = 'laracasts' + _VALID_URL = r'https?://(?:www\.)?laracasts\.com/series/(?P<id>[\w-]+/episodes/\d+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://laracasts.com/series/30-days-to-learn-laravel-11/episodes/1', + 'md5': 'c8f5e7b02ad0e438ef9280a08c8493dc', + 'info_dict': { + 'id': '922040563', + 'title': 'Hello, Laravel', + 'ext': 'mp4', + 'duration': 519, + 'date': '20240312', + 'thumbnail': 'https://laracasts.s3.amazonaws.com/videos/thumbnails/youtube/30-days-to-learn-laravel-11-1.png', + 'description': 'md5:ddd658bb241975871d236555657e1dd1', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'Episode 1', + 
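> Note: Laracasts bootstraps its pages Inertia-style: the props are embedded as JSON in the `#app` element's `data-page` attribute, which is what `_get_prop_data` unpacks. A self-contained sketch with made-up markup mirroring that structure:

```python
import json

from yt_dlp.utils import extract_attributes, get_element_html_by_id
from yt_dlp.utils.traversal import traverse_obj

# Hypothetical markup shaped like the page the extractor expects.
webpage = '<div id="app" data-page=\'{"props": {"lesson": {"title": "Hello, Laravel"}}}\'></div>'

props = traverse_obj(
    get_element_html_by_id('app', webpage),        # element HTML -> str
    ({extract_attributes}, 'data-page', {json.loads}, 'props'))
print(props['lesson']['title'])  # Hello, Laravel
```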
'uploader': 'Laracasts', + 'uploader_id': 'user20182673', + 'uploader_url': 'https://vimeo.com/user20182673', + }, + 'expected_warnings': ['Failed to parse XML'], # TODO: Remove when vimeo extractor is fixed + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._parse_episode(self._get_prop_data(url, display_id)['lesson']) + + +class LaracastsPlaylistIE(LaracastsBaseIE): + IE_NAME = 'laracasts:series' + _VALID_URL = r'https?://(?:www\.)?laracasts\.com/series/(?P<id>[\w-]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://laracasts.com/series/30-days-to-learn-laravel-11', + 'info_dict': { + 'title': '30 Days to Learn Laravel', + 'id': '210', + 'thumbnail': 'https://laracasts.s3.amazonaws.com/series/thumbnails/social-cards/30-days-to-learn-laravel-11.png?v=2', + 'duration': 30600.0, + 'modified_date': '20240511', + 'description': 'md5:27c260a1668a450984e8f901579912dd', + 'categories': ['Frameworks'], + 'tags': ['Laravel'], + 'display_id': '30-days-to-learn-laravel-11', + }, + 'playlist_count': 30, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + series = self._get_prop_data(url, display_id)['series'] + + metadata = { + 'display_id': display_id, + **traverse_obj(series, { + 'title': ('title', {str}), + 'id': ('id', {int}, {str_or_none}), + 'description': ('body', {clean_html}), + 'thumbnail': (('large_thumbnail', 'thumbnail'), {url_or_none}, any), + 'duration': ('runTime', {parse_duration}), + 'categories': ('taxonomy', 'name', {str}, {lambda x: x and [x]}), + 'tags': ('topics', ..., 'name', {str}), + 'modified_date': ('lastUpdated', {unified_strdate}), + }), + } + + return self.playlist_result(traverse_obj( + series, ('chapters', ..., 'episodes', lambda _, v: v['vimeoId'], {self._parse_episode})), **metadata) diff --git a/yt_dlp/extractor/lastfm.py b/yt_dlp/extractor/lastfm.py index f14198cfdd..67103352e7 100644 --- a/yt_dlp/extractor/lastfm.py +++ b/yt_dlp/extractor/lastfm.py @@ -1,33 +1,24 @@ +import itertools import re from .common import InfoExtractor -from ..utils import int_or_none, format_field +from ..utils import int_or_none, parse_qs, traverse_obj class LastFMPlaylistBaseIE(InfoExtractor): def _entries(self, url, playlist_id): - webpage = self._download_webpage(url, playlist_id) - start_page_number = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) or 1 - last_page_number = int_or_none(self._search_regex( - r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None)) - - for page_number in range(start_page_number, (last_page_number or start_page_number) + 1): + single_page = traverse_obj(parse_qs(url), ('page', -1, {int_or_none})) + for page in itertools.count(single_page or 1): webpage = self._download_webpage( - url, playlist_id, - note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')), - query={'page': page_number}) - page_entries = [ - self.url_result(player_url, 'Youtube') - for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage)) - ] - - for e in page_entries: - yield e + url, playlist_id, f'Downloading page {page}', query={'page': page}) + videos = re.findall(r'data-youtube-url="([^"]+)"', webpage) + yield from videos + if single_page or not videos: + return def _real_extract(self, url): playlist_id = self._match_id(url) - return self.playlist_result(self._entries(url, playlist_id), playlist_id) + return self.playlist_from_matches(self._entries(url, playlist_id), playlist_id, ie='Youtube') class 
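> Note: the Last.fm rewrite drops the fragile last-page regex in favor of open-ended paging: start from the page pinned by `?page=N` (and stop after it) or from page 1, and keep going until a page comes back empty. The control flow in isolation, with `fetch_page` standing in for the webpage download and regex scrape:

```python
import itertools


def entries(fetch_page, single_page=None):
    for page in itertools.count(single_page or 1):
        videos = fetch_page(page)
        yield from videos
        if single_page or not videos:
            return


pages = {1: ['v1', 'v2'], 2: ['v3']}  # hypothetical scrape results
print(list(entries(lambda p: pages.get(p, []))))                 # ['v1', 'v2', 'v3']
print(list(entries(lambda p: pages.get(p, []), single_page=2)))  # ['v3']
```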
LastFMPlaylistIE(LastFMPlaylistBaseIE): @@ -37,7 +28,7 @@ class LastFMPlaylistIE(LastFMPlaylistBaseIE): 'info_dict': { 'id': 'Oasis', }, - 'playlist_count': 11, + 'playlist_mincount': 11, }, { 'url': 'https://www.last.fm/music/Oasis', 'only_matching': True, @@ -73,6 +64,18 @@ class LastFMUserIE(LastFMPlaylistBaseIE): 'id': '12319471', }, 'playlist_count': 30, + }, { + 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760', + 'info_dict': { + 'id': '12543760', + }, + 'playlist_mincount': 80, + }, { + 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760?page=3', + 'info_dict': { + 'id': '12543760', + }, + 'playlist_count': 32, }] diff --git a/yt_dlp/extractor/laxarxames.py b/yt_dlp/extractor/laxarxames.py new file mode 100644 index 0000000000..f6d515b218 --- /dev/null +++ b/yt_dlp/extractor/laxarxames.py @@ -0,0 +1,73 @@ +import json + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils import ExtractorError +from ..utils.traversal import traverse_obj + + +class LaXarxaMesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?laxarxames\.cat/(?:[^/?#]+/)*?(player|movie-details)/(?P<id>\d+)' + _NETRC_MACHINE = 'laxarxames' + _TOKEN = None + _TESTS = [{ + 'url': 'https://www.laxarxames.cat/player/3459421', + 'md5': '0966f46c34275934c19af78f3df6e2bc', + 'info_dict': { + 'id': '6339612436112', + 'ext': 'mp4', + 'title': 'Resum | UA Horta — UD Viladecans', + 'timestamp': 1697905186, + 'thumbnail': r're:https?://.*\.jpg', + 'description': '', + 'upload_date': '20231021', + 'duration': 129.44, + 'tags': ['ott', 'esports', '23-24', ' futbol', ' futbol-partits', 'elit', 'resum'], + 'uploader_id': '5779379807001', + }, + 'skip': 'Requires login', + }] + + def _perform_login(self, username, password): + if self._TOKEN: + return + + login = self._download_json( + 'https://api.laxarxames.cat/Authorization/SignIn', None, note='Logging in', headers={ + 'X-Tenantorigin': 'https://laxarxames.cat', + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'Username': username, + 'Password': password, + 'Device': { + 'PlatformCode': 'WEB', + 'Name': 'Mac OS ()', + }, + }).encode(), expected_status=401) + + self._TOKEN = traverse_obj(login, ('AuthorizationToken', 'Token', {str})) + if not self._TOKEN: + raise ExtractorError('Login failed', expected=True) + + def _real_extract(self, url): + video_id = self._match_id(url) + if not self._TOKEN: + self.raise_login_required() + + media_play_info = self._download_json( + 'https://api.laxarxames.cat/Media/GetMediaPlayInfo', video_id, + data=json.dumps({ + 'MediaId': int(video_id), + 'StreamType': 'MAIN', + }).encode(), headers={ + 'Authorization': f'Bearer {self._TOKEN}', + 'X-Tenantorigin': 'https://laxarxames.cat', + 'Content-Type': 'application/json', + }) + + if not traverse_obj(media_play_info, ('ContentUrl', {str})): + self.raise_no_formats('No video found', expected=True) + + return self.url_result( + f'https://players.brightcove.net/5779379807001/default_default/index.html?videoId={media_play_info["ContentUrl"]}', + BrightcoveNewIE, video_id, media_play_info.get('Title')) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index b5def1e071..c764d49611 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -1,27 +1,32 @@ import functools import json +import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, 
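> Note on the La Xarxa Més login above: passing `expected_status=401` keeps `_download_json` from treating a rejected login as a transport error, so the extractor can inspect the body and raise its own message. Distilled into a sketch (endpoint and payload shape are taken from the hunk; error handling is simplified, and `ie` is any `InfoExtractor` instance):

```python
import json

from yt_dlp.utils import ExtractorError
from yt_dlp.utils.traversal import traverse_obj


def perform_login(ie, username, password):
    login = ie._download_json(
        'https://api.laxarxames.cat/Authorization/SignIn', None, 'Logging in',
        headers={'Content-Type': 'application/json'},
        data=json.dumps({'Username': username, 'Password': password}).encode(),
        expected_status=401)  # a 401 response is parsed, not raised
    token = traverse_obj(login, ('AuthorizationToken', 'Token', {str}))
    if not token:
        raise ExtractorError('Login failed', expected=True)
    return token
```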
OnDemandPagedList, UnsupportedError, determine_ext, int_or_none, mimetype2ext, parse_qs, + traverse_obj, try_get, + url_or_none, + urlhandle_detect_ext, urljoin, ) class LBRYBaseIE(InfoExtractor): - _BASE_URL_REGEX = r'(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' + _BASE_URL_REGEX = r'(?x)(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' - _OPT_CLAIM_ID = '[^:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX + _OPT_CLAIM_ID = f'[^$@:/?#&]+(?:[:#]{_CLAIM_ID_REGEX})?' _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + _PAGE_SIZE = 50 def _call_api_proxy(self, method, display_id, params, resource): headers = {'Content-Type': 'application/json-rpc'} @@ -30,7 +35,7 @@ def _call_api_proxy(self, method, display_id, params, resource): headers['x-lbry-auth-token'] = token response = self._download_json( 'https://api.lbry.tv/api/v1/proxy', - display_id, 'Downloading %s JSON metadata' % resource, + display_id, f'Downloading {resource} JSON metadata', headers=headers, data=json.dumps({ 'method': method, @@ -49,51 +54,98 @@ def _resolve_url(self, url, display_id, resource): def _permanent_url(self, url, claim_name, claim_id): return urljoin( url.replace('lbry://', 'https://lbry.tv/'), - '/%s:%s' % (claim_name, claim_id)) + f'/{claim_name}:{claim_id}') def _parse_stream(self, stream, url): - stream_value = stream.get('value') or {} - stream_type = stream_value.get('stream_type') - source = stream_value.get('source') or {} - media = stream_value.get(stream_type) or {} - signing_channel = stream.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = self._permanent_url(url, channel_name, channel_claim_id) + stream_type = traverse_obj(stream, ('value', 'stream_type', {str})) + + info = traverse_obj(stream, { + 'title': ('value', 'title', {str}), + 'thumbnail': ('value', 'thumbnail', 'url', {url_or_none}), + 'description': ('value', 'description', {str}), + 'license': ('value', 'license', {str}), + 'timestamp': ('timestamp', {int_or_none}), + 'release_timestamp': ('value', 'release_time', {int_or_none}), + 'tags': ('value', 'tags', ..., {lambda x: x or None}), + 'duration': ('value', stream_type, 'duration', {int_or_none}), + 'channel': ('signing_channel', 'value', 'title', {str}), + 'channel_id': ('signing_channel', 'claim_id', {str}), + 'uploader_id': ('signing_channel', 'name', {str}), + }) + + if info.get('uploader_id') and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, info['uploader_id'], info['channel_id']) - info = { - 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': stream_value.get('description'), - 'license': stream_value.get('license'), - 'timestamp': int_or_none(stream.get('timestamp')), - 'release_timestamp': int_or_none(stream_value.get('release_time')), - 'tags': stream_value.get('tags'), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), - } - if stream_type == 'audio': - info['vcodec'] = 'none' - else: - info.update({ - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - }) return info + def _fetch_page(self, display_id, url, 
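> Note: the rewritten `_parse_stream` shows `traverse_obj`'s other major mode: pass a dict and it maps many fields in one expression, silently dropping keys whose paths resolve to None. A runnable reduction, assuming yt-dlp is installed and a hypothetical claim object:

```python
from yt_dlp.utils import int_or_none
from yt_dlp.utils.traversal import traverse_obj

stream = {  # hypothetical claim, shaped like the LBRY API response
    'timestamp': '1672531200',
    'value': {'title': 'Demo'},
    'signing_channel': {'name': '@demo', 'claim_id': 'abc123'},
}
info = traverse_obj(stream, {
    'title': ('value', 'title', {str}),          # {str} filters by type
    'timestamp': ('timestamp', {int_or_none}),   # transform to int
    'uploader_id': ('signing_channel', 'name', {str}),
    'channel_id': ('signing_channel', 'claim_id', {str}),
})
print(info['timestamp'], info['uploader_id'])  # 1672531200 @demo
```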
params, page): + page += 1 + page_params = { + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + **params, + } + result = self._call_api_proxy( + 'claim_search', display_id, page_params, f'page {page}') + for item in traverse_obj(result, ('items', lambda _, v: v['name'] and v['claim_id'])): + yield { + **self._parse_stream(item, url), + '_type': 'url', + 'id': item['claim_id'], + 'url': self._permanent_url(url, item['name'], item['claim_id']), + } + + def _playlist_entries(self, url, display_id, claim_param, metadata): + qs = parse_qs(url) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'claim_type': 'stream', + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + **claim_param, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, display_id, url, params), + self._PAGE_SIZE) + + return self.playlist_result( + entries, display_id, **traverse_obj(metadata, ('value', { + 'title': 'title', + 'description': 'description', + }))) + class LBRYIE(LBRYBaseIE): IE_NAME = 'lbry' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf''' + (?:\$/(?:download|embed)/)? 
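> Note: the shared `_playlist_entries` turns the page URL's query string into `claim_search` parameters. The mapping logic on its own, using yt-dlp's `parse_qs` (which parses the query component of a full URL):

```python
from yt_dlp.utils import parse_qs


def search_params(url):
    qs = parse_qs(url)
    params = {
        'order_by': {
            'new': ['release_time'],
            'top': ['effective_amount'],
            'trending': ['trending_group', 'trending_mixed'],
        }[qs.get('order', ['new'])[0]],
        'claim_type': 'stream',
    }
    duration = qs.get('duration', [None])[0]
    if duration:
        params['duration'] = {'long': '>=1200', 'short': '<=240'}[duration]
    return params


print(search_params('https://odysee.com/@channel:1?order=top&duration=short'))
# {'order_by': ['effective_amount'], 'claim_type': 'stream', 'duration': '<=240'}
```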
+ (?P<id> + [^$@:/?#]+/{LBRYBaseIE._CLAIM_ID_REGEX} + |(?:@{LBRYBaseIE._OPT_CLAIM_ID}/)?{LBRYBaseIE._OPT_CLAIM_ID} + )''' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', - 'md5': 'fffd15d76062e9a985c22c7c7f2f4805', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', 'info_dict': { 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', 'ext': 'mp4', @@ -107,6 +159,7 @@ class LBRYIE(LBRYBaseIE): 'height': 720, 'thumbnail': 'https://spee.ch/7/67f2d809c263288c.png', 'license': 'None', + 'uploader_id': '@Mantega', 'duration': 346, 'channel': 'LBRY/Odysee rats united!!!', 'channel_id': '1c8ad6a2ab4e889a71146ae4deeb23bb92dab627', @@ -116,9 +169,9 @@ class LBRYIE(LBRYBaseIE): 'lbc', 'lbry', 'start', - 'tutorial' + 'tutorial', ], - } + }, }, { # Audio 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', @@ -140,11 +193,11 @@ class LBRYIE(LBRYBaseIE): 'vcodec': 'none', 'thumbnail': 'https://spee.ch/d/0bc63b0e6bf1492d.png', 'license': 'None', - } + 'uploader_id': '@LBRYFoundation', + }, }, { - # HLS 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', - 'md5': '25049011f3c8bc2f8b60ad88a031837e', + 'md5': 'c35fac796f62a14274b4dc2addb5d0ba', 'info_dict': { 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', 'ext': 'mp4', @@ -159,10 +212,11 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'uploader_id': '@gardeningincanada', 'formats': 'mincount:3', 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', - } + }, }, { # HLS live stream (might expire) 'url': 'https://odysee.com/@RT:fd/livestream_RT:d', @@ -177,15 +231,38 @@ class LBRYIE(LBRYBaseIE): 'release_timestamp': int, 'release_date': str, 'tags': list, - 'duration': None, 'channel': 'RT', 'channel_id': 'fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', 'channel_url': 'https://odysee.com/@RT:fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', 'formats': 'mincount:1', 'thumbnail': 'startswith:https://thumb', 'license': 'None', + 'uploader_id': '@RT', + }, + 'params': {'skip_download': True}, + }, { + # original quality format w/higher resolution than HLS formats + 'url': 'https://odysee.com/@wickedtruths:2/Biotechnological-Invasion-of-Skin-(April-2023):4', + 'md5': '305b0b3b369bde1b984961f005b67193', + 'info_dict': { + 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', + 'ext': 'mp4', + 'title': 'Biotechnological Invasion of Skin (April 2023)', + 'description': 'md5:fe28689db2cb7ba3436d819ac3ffc378', + 'channel': 'Wicked Truths', + 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'uploader_id': '@wickedtruths', + 'timestamp': 1695114347, + 'upload_date': '20230919', + 'release_timestamp': 1685617473, + 'release_date': '20230601', + 'duration': 1063, + 'thumbnail': 'https://thumbs.odycdn.com/4e6d39da4df0cfdad45f64e253a15959.webp', + 'tags': ['smart skin surveillance', 'biotechnology invasion of skin', 'morgellons'], + 'license': 'None', + 'protocol': 'https', # test for direct mp4 download }, - 'params': {'skip_download': True} }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -217,45 +294,69 @@ class LBRYIE(LBRYBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - if display_id.startswith('$/'): - display_id = 
display_id.split('/', 2)[-1].replace('/', ':') - else: + if display_id.startswith('@'): display_id = display_id.replace(':', '#') - display_id = compat_urllib_parse_unquote(display_id) + else: + display_id = display_id.replace('/', ':') + display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') headers = {'Referer': 'https://odysee.com/'} - if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + + formats = [] + stream_type = traverse_obj(result, ('value', 'stream_type', {str})) + + if stream_type in self._SUPPORTED_STREAM_TYPES: claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + + # GET request to v3 API returns original video/audio file if available + direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url) + urlh = self._request_webpage( + direct_url, display_id, 'Checking for original quality', headers=headers, fatal=False) + if urlh and urlhandle_detect_ext(urlh) != 'm3u8': + formats.append({ + 'url': direct_url, + 'format_id': 'original', + 'quality': 1, + **traverse_obj(result, ('value', { + 'ext': ('source', (('name', {determine_ext}), ('media_type', {mimetype2ext}))), + 'filesize': ('source', 'size', {int_or_none}), + 'width': ('video', 'width', {int_or_none}), + 'height': ('video', 'height', {int_or_none}), + }), get_all=False), + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + # HEAD request returns redirect response to m3u8 URL if available final_url = self._request_webpage( HEADRequest(streaming_url), display_id, headers=headers, - note='Downloading streaming redirect url info').geturl() + note='Downloading streaming redirect url info').url + elif result.get('value_type') == 'stream': claim_id, is_live = result['signing_channel']['claim_id'], True live_data = self._download_json( 'https://api.odysee.live/livestream/is_live', claim_id, query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('VideoURL') + final_url = live_data.get('VideoURL') # Upcoming videos may still give VideoURL if not live_data.get('Live'): - streaming_url = final_url = None + final_url = None self.raise_no_formats('This stream is not live', True, claim_id) + else: raise UnsupportedError(url) - info = self._parse_stream(result, url) if determine_ext(final_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) - else: - info['url'] = streaming_url + formats.extend(self._extract_m3u8_formats( + final_url, display_id, 'mp4', m3u8_id='hls', live=is_live, headers=headers)) + return { - **info, + **self._parse_stream(result, url), 'id': claim_id, - 'title': result['value']['title'], + 'formats': formats, 'is_live': is_live, 'http_headers': headers, } @@ -263,7 +364,7 @@ def _real_extract(self, url): class LBRYChannelIE(LBRYBaseIE): IE_NAME = 'lbry:channel' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?&]|$)' % LBRYBaseIE._OPT_CLAIM_ID + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P<id>@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)' _TESTS = [{ 'url': 'https://lbry.tv/@LBRYFoundation:0', 'info_dict': { @@ -279,67 +380,50 @@ class LBRYChannelIE(LBRYBaseIE): 'url': 'lbry://@lbry#3f', 'only_matching': True, }] - _PAGE_SIZE = 50 - - def _fetch_page(self, claim_id, url, params, page): - page += 1 - page_params = { - 
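> Note: the new `original` format branch works by probing: a GET against the `/api/v3/` streaming URL serves the original upload when one exists, while HLS-only claims answer with an m3u8 playlist, so the sniffed extension decides whether to add a direct format alongside HLS. The probe as a standalone helper (a sketch; `ie` is any `InfoExtractor` instance):

```python
import re

from yt_dlp.utils import urlhandle_detect_ext


def probe_original(ie, streaming_url, video_id):
    direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url)
    urlh = ie._request_webpage(
        direct_url, video_id, 'Checking for original quality', fatal=False)
    if urlh and urlhandle_detect_ext(urlh) != 'm3u8':
        # Original file is served directly; prefer it over HLS remuxes
        return {'url': direct_url, 'format_id': 'original', 'quality': 1}
    return None
```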
'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - } - page_params.update(params) - result = self._call_api_proxy( - 'claim_search', claim_id, page_params, 'page %d' % page) - for item in (result.get('items') or []): - stream_claim_name = item.get('name') - stream_claim_id = item.get('claim_id') - if not (stream_claim_name and stream_claim_id): - continue - - info = self._parse_stream(item, url) - info.update({ - '_type': 'url', - 'id': stream_claim_id, - 'title': try_get(item, lambda x: x['value']['title']), - 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - }) - yield info def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') - result = self._resolve_url( - 'lbry://' + display_id, display_id, 'channel') + result = self._resolve_url(f'lbry://{display_id}', display_id, 'channel') claim_id = result['claim_id'] - qs = parse_qs(url) - content = qs.get('content', [None])[0] - params = { - 'fee_amount': qs.get('fee_amount', ['>=0'])[0], - 'order_by': { - 'new': ['release_time'], - 'top': ['effective_amount'], - 'trending': ['trending_group', 'trending_mixed'], - }[qs.get('order', ['new'])[0]], - 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, - } - duration = qs.get('duration', [None])[0] - if duration: - params['duration'] = { - 'long': '>=1200', - 'short': '<=240', - }[duration] - language = qs.get('language', ['all'])[0] - if language != 'all': - languages = [language] - if language == 'en': - languages.append('none') - params['any_languages'] = languages - entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url, params), - self._PAGE_SIZE) - result_value = result.get('value') or {} - return self.playlist_result( - entries, claim_id, result_value.get('title'), - result_value.get('description')) + + return self._playlist_entries(url, claim_id, {'channel_ids': [claim_id]}, result) + + +class LBRYPlaylistIE(LBRYBaseIE): + IE_NAME = 'lbry:playlist' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P<id>[0-9a-f-]+)' + _TESTS = [{ + 'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2', + 'info_dict': { + 'id': 'ffef782f27486f0ac138bde8777f72ebdd0548c2', + 'title': 'Théâtre Classique', + 'description': 'Théâtre Classique', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://odysee.com/$/list/9c6658b3dd21e4f2a0602d523a13150e2b48b770', + 'info_dict': { + 'id': '9c6658b3dd21e4f2a0602d523a13150e2b48b770', + 'title': 'Social Media Exposed', + 'description': 'md5:98af97317aacd5b85d595775ea37d80e', + }, + 'playlist_mincount': 34, + }, { + 'url': 'https://odysee.com/$/playlist/938fb11d-215f-4d1c-ad64-723954df2184', + 'info_dict': { + 'id': '938fb11d-215f-4d1c-ad64-723954df2184', + }, + 'playlist_mincount': 1000, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + result = traverse_obj(self._call_api_proxy('claim_search', display_id, { + 'claim_ids': [display_id], + 'no_totals': True, + 'page': 1, + 'page_size': self._PAGE_SIZE, + }, 'playlist'), ('items', 0)) + claim_param = {'claim_ids': traverse_obj(result, ('value', 'claims', ..., {str}))} + + return self._playlist_entries(url, display_id, claim_param, result) diff --git a/yt_dlp/extractor/lci.py b/yt_dlp/extractor/lci.py index e7d2f8a24c..708cb548d8 100644 --- a/yt_dlp/extractor/lci.py +++ b/yt_dlp/extractor/lci.py @@ -1,9 +1,25 @@ from .common import InfoExtractor +from .wat import WatIE +from 
..utils import ExtractorError, int_or_none +from ..utils.traversal import traverse_obj class LCIIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:lci|tf1info)\.fr/[^/]+/[\w-]+-(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:lci|tf1info)\.fr/(?:[^/?#]+/)+[\w-]+-(?P<id>\d+)\.html' _TESTS = [{ + 'url': 'https://www.tf1info.fr/replay-lci/videos/video-24h-pujadas-du-vendredi-24-mai-6708-2300831.html', + 'info_dict': { + 'id': '14113788', + 'ext': 'mp4', + 'title': '24H Pujadas du vendredi 24 mai 2024', + 'thumbnail': 'https://photos.tf1.fr/1280/720/24h-pujadas-du-24-mai-2024-55bf2d-0@1x.jpg', + 'upload_date': '20240524', + 'duration': 6158, + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'https://www.tf1info.fr/politique/election-presidentielle-2022-second-tour-j-2-marine-le-pen-et-emmanuel-macron-en-interview-de-lci-vendredi-soir-2217486.html', 'info_dict': { 'id': '13875948', @@ -24,5 +40,10 @@ class LCIIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - wat_id = self._search_regex(r'watId["\']?\s*:\s*["\']?(\d+)', webpage, 'wat id') - return self.url_result('wat:' + wat_id, 'Wat', wat_id) + next_data = self._search_nextjs_data(webpage, video_id) + wat_id = traverse_obj(next_data, ( + 'props', 'pageProps', 'page', 'tms', 'videos', {dict.keys}, ..., {int_or_none}, any)) + if wat_id is None: + raise ExtractorError('Could not find wat_id') + + return self.url_result(f'wat:{wat_id}', WatIE, str(wat_id)) diff --git a/yt_dlp/extractor/lcp.py b/yt_dlp/extractor/lcp.py index 9846319e0c..69148be222 100644 --- a/yt_dlp/extractor/lcp.py +++ b/yt_dlp/extractor/lcp.py @@ -1,5 +1,5 @@ -from .common import InfoExtractor from .arkena import ArkenaIE +from .common import InfoExtractor class LcpPlayIE(ArkenaIE): # XXX: Do not subclass from concrete IE @@ -66,7 +66,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) play_url = self._search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL, + rf'<iframe[^>]+src=(["\'])(?P<url>{LcpPlayIE._VALID_URL}?(?:(?!\1).)*)\1', webpage, 'play iframe', default=None, group='url') if not play_url: diff --git a/yt_dlp/extractor/learningonscreen.py b/yt_dlp/extractor/learningonscreen.py new file mode 100644 index 0000000000..dcf83144c8 --- /dev/null +++ b/yt_dlp/extractor/learningonscreen.py @@ -0,0 +1,78 @@ +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + join_nonempty, + parse_duration, + unified_timestamp, +) +from ..utils.traversal import traverse_obj + + +class LearningOnScreenIE(InfoExtractor): + _VALID_URL = r'https?://learningonscreen\.ac\.uk/ondemand/index\.php/prog/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://learningonscreen.ac.uk/ondemand/index.php/prog/005D81B2?bcast=22757013', + 'info_dict': { + 'id': '005D81B2', + 'ext': 'mp4', + 'title': 'Planet Earth', + 'duration': 3600.0, + 'timestamp': 1164567600.0, + 'upload_date': '20061126', + 'thumbnail': 'https://stream.learningonscreen.ac.uk/trilt-cover-images/005D81B2-Planet-Earth-2006-11-26T190000Z-BBC4.jpg', + }, + }] + + def _real_initialize(self): + if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'): + self.raise_login_required( + 'Use --cookies for authentication. 
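> Note: the LCI fix pulls the WAT id out of the Next.js payload, where the video ids are the *keys* of a dict: `{dict.keys}` converts the mapping to its keys, `...` branches over them, `{int_or_none}` keeps the numeric ones, and `any` takes the first survivor. The same path on toy data, assuming yt-dlp is installed:

```python
from yt_dlp.utils import int_or_none
from yt_dlp.utils.traversal import traverse_obj

# Hypothetical Next.js payload with the structure the extractor walks.
next_data = {'props': {'pageProps': {'page': {'tms': {'videos': {'14113788': {}}}}}}}
wat_id = traverse_obj(next_data, (
    'props', 'pageProps', 'page', 'tms', 'videos',
    {dict.keys}, ..., {int_or_none}, any))
print(wat_id)  # 14113788
```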
See ' + ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp ' + 'for how to manually pass cookies', method=None) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + details = traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'programme-details')}, { + 'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}), + 'timestamp': ( + {functools.partial(get_element_by_class, 'broadcast-date')}, + {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}), + 'duration': ( + {functools.partial(get_element_by_class, 'prog-running-time')}, + {clean_html}, {parse_duration}), + })) + + title = details.pop('title', None) or traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')}, + {extract_attributes}, 'data-record-title', {clean_html})) + + entries = self._parse_html5_media_entries( + 'https://stream.learningonscreen.ac.uk', webpage, video_id, m3u8_id='hls', mpd_id='dash', + _headers={'Origin': 'https://learningonscreen.ac.uk', 'Referer': 'https://learningonscreen.ac.uk/'}) + if not entries: + raise ExtractorError('No video found') + + if len(entries) > 1: + duration = details.pop('duration', None) + for idx, entry in enumerate(entries, start=1): + entry.update(details) + entry['id'] = join_nonempty(video_id, idx) + entry['title'] = join_nonempty(title, idx) + return self.playlist_result(entries, video_id, title, duration=duration) + + return { + **entries[0], + **details, + 'id': video_id, + 'title': title, + } diff --git a/yt_dlp/extractor/lecture2go.py b/yt_dlp/extractor/lecture2go.py index 3a9b30a3c2..6157f3da34 100644 --- a/yt_dlp/extractor/lecture2go.py +++ b/yt_dlp/extractor/lecture2go.py @@ -4,12 +4,13 @@ from ..utils import ( determine_ext, determine_protocol, - parse_duration, int_or_none, + parse_duration, ) class Lecture2GoIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P<id>\d+)' _TEST = { 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', @@ -24,7 +25,7 @@ class Lecture2GoIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/lecturio.py b/yt_dlp/extractor/lecturio.py index 973764c63f..4cfb872241 100644 --- a/yt_dlp/extractor/lecturio.py +++ b/yt_dlp/extractor/lecturio.py @@ -2,9 +2,9 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, clean_html, determine_ext, - ExtractorError, float_or_none, int_or_none, str_or_none, @@ -25,7 +25,7 @@ def _perform_login(self, username, password): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(url_handle): - return self._LOGIN_URL not in url_handle.geturl() + return self._LOGIN_URL not in url_handle.url # Already logged in if is_logged(urlh): @@ -49,7 +49,7 @@ def is_logged(url_handle): r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, 'errors', default=None) if errors: - raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError(f'Unable to login: {errors}', expected=True) raise ExtractorError('Unable to log in') @@ -57,8 +57,8 @@ class LecturioIE(LecturioBaseIE): _VALID_URL = r'''(?x) https:// (?: - app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| - (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag + 
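> Note: when a Learning on Screen programme resolves to several HTML5 sources, the code above derives per-entry ids and titles with `join_nonempty`, which stringifies and joins only the truthy parts:

```python
from yt_dlp.utils import join_nonempty

print(join_nonempty('005D81B2', 1))         # 005D81B2-1
print(join_nonempty('Planet Earth', None))  # Planet Earth (None is skipped)
print(join_nonempty(None, 2))               # 2
```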
app\.lecturio\.com/([^/?#]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| + (?:www\.)?lecturio\.de/(?:[^/?#]+/)+(?P<nt_de>[^/?#&]+)\.vortrag ) ''' _TESTS = [{ @@ -73,6 +73,9 @@ class LecturioIE(LecturioBaseIE): }, { 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', 'only_matching': True, + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-at-1-staatsexamen/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, }, { 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', 'only_matching': True, @@ -127,7 +130,7 @@ def _real_extract(self, url): f = { 'url': file_url, 'format_id': label, - 'filesize': float_or_none(filesize, invscale=1000) + 'filesize': float_or_none(filesize, invscale=1000), } if label: mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label) @@ -169,7 +172,7 @@ def _real_extract(self, url): class LecturioCourseIE(LecturioBaseIE): - _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))' + _VALID_URL = r'https?://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))' _TESTS = [{ 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', 'info_dict': { @@ -197,7 +200,7 @@ def _real_extract(self, url): if lecture_url: lecture_url = urljoin(url, lecture_url) else: - lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id) + lecture_url = f'https://app.lecturio.com/#/lecture/c/{course_id}/{lecture_id}' entries.append(self.url_result( lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) return self.playlist_result( @@ -206,7 +209,7 @@ def _real_extract(self, url): class LecturioDeCourseIE(LecturioBaseIE): - _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' + _VALID_URL = r'https?://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs' _TEST = { 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', 'only_matching': True, diff --git a/yt_dlp/extractor/leeco.py b/yt_dlp/extractor/leeco.py index 85033b8f8b..58baa3fea7 100644 --- a/yt_dlp/extractor/leeco.py +++ b/yt_dlp/extractor/leeco.py @@ -1,19 +1,16 @@ -import datetime +import base64 +import datetime as dt import hashlib import re import time +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, - compat_str, - compat_urllib_parse_urlencode, -) +from ..compat import compat_ord from ..utils import ( + ExtractorError, determine_ext, encode_data_uri, - ExtractorError, int_or_none, orderedSet, parse_iso8601, @@ -140,7 +137,7 @@ def _real_extract(self, url): def get_flash_urls(media_url, format_id): nodes_data = self._download_json( media_url, media_id, - 'Download JSON metadata for format %s' % format_id, + f'Download JSON metadata for format {format_id}', query={ 'm3v': 1, 'format': 1, @@ -150,7 +147,7 @@ def get_flash_urls(media_url, format_id): req = self._request_webpage( nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) + note=f'Downloading m3u8 information for format {format_id}') m3u8_data = self.decrypt_m3u8(req.read()) @@ -173,7 +170,7 @@ def get_flash_urls(media_url, format_id): f = { 'url': format_url, 'ext': determine_ext(format_data[1]), - 'format_id': '%s-%s' % (protocol, format_id), + 'format_id': f'{protocol}-{format_id}', 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', 'quality': int_or_none(format_id), } @@ -185,7 +182,7 @@ def get_flash_urls(media_url, 
format_id): publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), - delimiter=' ', timezone=datetime.timedelta(hours=8)) + delimiter=' ', timezone=dt.timedelta(hours=8)) description = self._html_search_meta('description', page, fatal=False) return { @@ -207,18 +204,18 @@ class LePlaylistIE(InfoExtractor): 'info_dict': { 'id': '46177', 'title': '美人天下', - 'description': 'md5:395666ff41b44080396e59570dbac01c' + 'description': 'md5:395666ff41b44080396e59570dbac01c', }, - 'playlist_count': 35 + 'playlist_count': 35, }, { 'url': 'http://tv.le.com/izt/wuzetian/index.html', 'info_dict': { 'id': 'wuzetian', 'title': '武媚娘传奇', - 'description': 'md5:e12499475ab3d50219e5bba00b3cb248' + 'description': 'md5:e12499475ab3d50219e5bba00b3cb248', }, # This playlist contains some extra videos other than the drama itself - 'playlist_mincount': 96 + 'playlist_mincount': 96, }, { 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', # This series is moved to http://www.le.com/tv/10005297.html @@ -233,7 +230,7 @@ class LePlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url) + return False if LeIE.suitable(url) else super().suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) @@ -294,7 +291,7 @@ def sign_data(obj): salt = 'fbeh5player12c43eccf2bec3300344' items = ['cf', 'ran', 'uu', 'bver', 'vu'] input_data = ''.join([item + obj[item] for item in items]) + salt - obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() + obj['sign'] = hashlib.md5(input_data.encode()).hexdigest() def _get_formats(self, cf, uu, vu, media_id): def get_play_json(cf, timestamp): @@ -305,12 +302,12 @@ def get_play_json(cf, timestamp): 'format': 'json', 'uu': uu, 'vu': vu, - 'ran': compat_str(timestamp), + 'ran': str(timestamp), } self.sign_data(data) return self._download_json( - 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data), - media_id, 'Downloading playJson data for type %s' % cf) + 'http://api.letvcloud.com/gpc.php?' 
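> Note: the Letv cloud request signing this hunk modernizes is simple enough to show with the standard library alone: concatenate `key + value` pairs in a fixed field order, append the salt, and take the MD5 hex digest of the UTF-8 bytes.

```python
import hashlib


def sign(obj, salt='fbeh5player12c43eccf2bec3300344'):
    items = ['cf', 'ran', 'uu', 'bver', 'vu']
    payload = ''.join(item + obj[item] for item in items) + salt
    return hashlib.md5(payload.encode()).hexdigest()


params = {'cf': 'flash', 'ran': '0', 'uu': 'u', 'bver': '', 'vu': 'v'}  # hypothetical values
params['sign'] = sign(params)
print(len(params['sign']))  # 32 (hex digest)
```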
+ urllib.parse.urlencode(data), + media_id, f'Downloading playJson data for type {cf}') play_json = get_play_json(cf, time.time()) # The server time may be different from local time @@ -319,14 +316,14 @@ def get_play_json(cf, timestamp): if not play_json.get('data'): if play_json.get('message'): - raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True) + raise ExtractorError('Letv cloud said: {}'.format(play_json['message']), expected=True) elif play_json.get('code'): raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) else: raise ExtractorError('Letv cloud returned an unknown error') def b64decode(s): - return compat_b64decode(s).decode('utf-8') + return base64.b64decode(s).decode('utf-8') formats = [] for media in play_json['data']['video_info']['media'].values(): @@ -349,7 +346,7 @@ def _real_extract(self, url): vu_mobj = re.search(r'vu=([\w]+)', url) if not uu_mobj or not vu_mobj: - raise ExtractorError('Invalid URL: %s' % url, expected=True) + raise ExtractorError(f'Invalid URL: {url}', expected=True) uu = uu_mobj.group(1) vu = vu_mobj.group(1) @@ -359,6 +356,6 @@ def _real_extract(self, url): return { 'id': media_id, - 'title': 'Video %s' % media_id, + 'title': f'Video {media_id}', 'formats': formats, } diff --git a/yt_dlp/extractor/lefigaro.py b/yt_dlp/extractor/lefigaro.py new file mode 100644 index 0000000000..a452d87062 --- /dev/null +++ b/yt_dlp/extractor/lefigaro.py @@ -0,0 +1,136 @@ +import json +import math + +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + traverse_obj, +) + + +class LeFigaroVideoEmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.lefigaro\.fr/embed/[^?#]+/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://video.lefigaro.fr/embed/figaro/video/les-francais-ne-veulent-ils-plus-travailler-suivez-en-direct-le-club-le-figaro-idees/', + 'md5': 'a0c3069b7e4c4526abf0053a7713f56f', + 'info_dict': { + 'id': 'g9j7Eovo', + 'title': 'Les Français ne veulent-ils plus travailler ? Retrouvez Le Club Le Figaro Idées', + 'description': 'md5:862b8813148ba4bf10763a65a69dfe41', + 'upload_date': '20230216', + 'timestamp': 1676581615, + 'duration': 3076, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://video.lefigaro.fr/embed/figaro/video/intelligence-artificielle-faut-il-sen-mefier/', + 'md5': '319c662943dd777bab835cae1e2d73a5', + 'info_dict': { + 'id': 'LeAgybyc', + 'title': 'Intelligence artificielle : faut-il s’en méfier ?', + 'description': 'md5:249d136e3e5934a67c8cb704f8abf4d2', + 'upload_date': '20230124', + 'timestamp': 1674584477, + 'duration': 860, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'ext': 'mp4', + }, + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://video.lefigaro.fr/figaro/video/suivez-en-direct-le-club-le-figaro-international-avec-philippe-gelie-9/', + 'md5': '6289f9489efb969e38245f31721596fe', + 'info_dict': { + 'id': 'QChnbPYA', + 'title': 'Où en est le couple franco-allemand ? 
Retrouvez Le Club Le Figaro International', + 'description': 'md5:6f47235b7e7c93b366fd8ebfa10572ac', + 'upload_date': '20230123', + 'timestamp': 1674503575, + 'duration': 3153, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'age_limit': 0, + 'ext': 'mp4', + }, + }, { + 'url': 'https://video.lefigaro.fr/figaro/video/la-philosophe-nathalie-sarthou-lajus-est-linvitee-du-figaro-live/', + 'md5': 'f6df814cae53e85937621599d2967520', + 'info_dict': { + 'id': 'QJzqoNbf', + 'title': 'La philosophe Nathalie Sarthou-Lajus est l’invitée du Figaro Live', + 'description': 'md5:c586793bb72e726c83aa257f99a8c8c4', + 'upload_date': '20230217', + 'timestamp': 1676661986, + 'duration': 1558, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'age_limit': 0, + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_data = self._search_nextjs_data( + webpage, display_id)['props']['pageProps']['initialProps']['pageData']['playerData'] + + return self.url_result( + f'jwplatform:{player_data["videoId"]}', title=player_data.get('title'), + description=player_data.get('description'), thumbnail=player_data.get('poster')) + + +class LeFigaroVideoSectionIE(InfoExtractor): + _VALID_URL = r'https?://video\.lefigaro\.fr/figaro/(?P<id>[\w-]+)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://video.lefigaro.fr/figaro/le-club-le-figaro-idees/', + 'info_dict': { + 'id': 'le-club-le-figaro-idees', + 'title': 'Le Club Le Figaro Idées', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://video.lefigaro.fr/figaro/factu/', + 'info_dict': { + 'id': 'factu', + 'title': 'Factu', + }, + 'playlist_mincount': 519, + }] + + _PAGE_SIZE = 20 + + def _get_api_response(self, display_id, page_num, note=None): + return self._download_json( + 'https://api-graphql.lefigaro.fr/graphql', display_id, note=note, + query={ + 'id': 'flive-website_UpdateListPage_1fb260f996bca2d78960805ac382544186b3225f5bedb43ad08b9b8abef79af6', + 'variables': json.dumps({ + 'slug': display_id, + 'videosLimit': self._PAGE_SIZE, + 'sort': 'DESC', + 'order': 'PUBLISHED_AT', + 'page': page_num, + }).encode(), + }) + + def _real_extract(self, url): + display_id = self._match_id(url) + initial_response = self._get_api_response(display_id, page_num=1)['data']['playlist'] + + def page_func(page_num): + api_response = self._get_api_response(display_id, page_num + 1, note=f'Downloading page {page_num + 1}') + + return [self.url_result( + video['embedUrl'], LeFigaroVideoEmbedIE, **traverse_obj(video, { + 'title': 'name', + 'description': 'description', + 'thumbnail': 'thumbnailUrl', + })) for video in api_response['data']['playlist']['jsonLd'][0]['itemListElement']] + + entries = InAdvancePagedList( + page_func, math.ceil(initial_response['videoCount'] / self._PAGE_SIZE), self._PAGE_SIZE) + + return self.playlist_result(entries, playlist_id=display_id, playlist_title=initial_response.get('title')) diff --git a/yt_dlp/extractor/lego.py b/yt_dlp/extractor/lego.py index 811b447587..5a98cc7497 100644 --- a/yt_dlp/extractor/lego.py +++ b/yt_dlp/extractor/lego.py @@ -1,7 +1,7 @@ import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -72,10 +72,10 @@ def _real_extract(self, url): # https://contentfeed.services.lego.com/api/v2/item/[VIDEO_ID]?culture=[LOCALE]&contentType=Video 'https://services.slingshot.lego.com/mediaplayer/v2', video_id, query={ 
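> Note: because the first GraphQL response reports `videoCount`, the Le Figaro section extractor knows the page count up front and can use `InAdvancePagedList` for lazy but exactly-bounded paging. A self-contained sketch with a fake `page_func` standing in for the API request:

```python
import math

from yt_dlp.utils import InAdvancePagedList

PAGE_SIZE = 20
video_count = 519  # as reported by the initial API response


def page_func(page_num):  # stand-in for the GraphQL request; 0-indexed
    start = page_num * PAGE_SIZE
    return [f'video-{i}' for i in range(start, min(start + PAGE_SIZE, video_count))]


entries = InAdvancePagedList(page_func, math.ceil(video_count / PAGE_SIZE), PAGE_SIZE)
print(len(list(entries)))  # 519
```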
- 'videoId': '%s_%s' % (uuid.UUID(video_id), locale), + 'videoId': f'{uuid.UUID(video_id)}_{locale}', }, headers=self.geo_verification_headers()) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451: + if isinstance(e.cause, HTTPError) and e.cause.status == 451: self.raise_geo_restricted(countries=countries) raise @@ -111,7 +111,7 @@ def _real_extract(self, url): 'abr': quality[0], 'height': quality[1], 'width': quality[2], - }), + }) formats.append(f) subtitles = {} @@ -123,7 +123,7 @@ def _real_extract(self, url): video_version = video.get('VideoVersion') if net_storage_path and invariant_id and video_file_id and video_version: subtitles.setdefault(locale[:2], []).append({ - 'url': 'https://lc-mediaplayerns-live-s.legocdn.com/public/%s/%s_%s_%s_%s_sub.srt' % (net_storage_path, invariant_id, video_file_id, locale, video_version), + 'url': f'https://lc-mediaplayerns-live-s.legocdn.com/public/{net_storage_path}/{invariant_id}_{video_file_id}_{locale}_{video_version}_sub.srt', }) return { diff --git a/yt_dlp/extractor/lenta.py b/yt_dlp/extractor/lenta.py index 10aac984e4..105ec371e0 100644 --- a/yt_dlp/extractor/lenta.py +++ b/yt_dlp/extractor/lenta.py @@ -2,6 +2,7 @@ class LentaIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/', @@ -44,7 +45,7 @@ def _real_extract(self, url): default=None) if video_id: return self.url_result( - 'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id, + f'eagleplatform:lentaru.media.eagleplatform.com:{video_id}', ie='EaglePlatform', video_id=video_id) return self.url_result(url, ie='Generic') diff --git a/yt_dlp/extractor/libraryofcongress.py b/yt_dlp/extractor/libraryofcongress.py index b76ca09081..6185605744 100644 --- a/yt_dlp/extractor/libraryofcongress.py +++ b/yt_dlp/extractor/libraryofcongress.py @@ -1,7 +1,6 @@ import re from .common import InfoExtractor - from ..utils import ( determine_ext, float_or_none, @@ -74,7 +73,7 @@ def _real_extract(self, url): webpage, 'media id', group='id') data = self._download_json( - 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, + f'https://media.loc.gov/services/v1/media?id={media_id}&context=json', media_id)['mediaObject'] derivative = data['derivatives'][0] diff --git a/yt_dlp/extractor/libsyn.py b/yt_dlp/extractor/libsyn.py index 29bbb03def..4ca521a106 100644 --- a/yt_dlp/extractor/libsyn.py +++ b/yt_dlp/extractor/libsyn.py @@ -18,7 +18,7 @@ class LibsynIE(InfoExtractor): 'info_dict': { 'id': '6385796', 'ext': 'mp3', - 'title': "Champion Minded - Developing a Growth Mindset", + 'title': 'Champion Minded - Developing a Growth Mindset', # description fetched using another request: # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796 # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', @@ -34,7 +34,7 @@ class LibsynIE(InfoExtractor): 'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career', 'upload_date': '20150818', 'thumbnail': 're:^https?://.*', - } + }, }] def _real_extract(self, url): @@ -56,7 +56,7 @@ def _real_extract(self, url): r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None) or get_element_by_class('podcast-title', webpage))) - title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title + title = f'{podcast_title} - 
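> Note: the lego.py change belongs to the same networking migration as the earlier `HEADRequest` move: HTTP failures now surface as `yt_dlp.networking.exceptions.HTTPError`, which exposes the status code as `.status` where the old `compat_HTTPError` used `.code`. The geo-block check in isolation (a sketch):

```python
from yt_dlp.networking.exceptions import HTTPError


def is_geo_blocked(exc):
    # exc is the ExtractorError raised by a download helper; its cause is
    # the underlying HTTPError when the server answered 451.
    return isinstance(exc.cause, HTTPError) and exc.cause.status == 451
```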
{episode_title}' if podcast_title else episode_title formats = [] for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')): diff --git a/yt_dlp/extractor/lifenews.py b/yt_dlp/extractor/lifenews.py index 919cfcb374..60d50b1d19 100644 --- a/yt_dlp/extractor/lifenews.py +++ b/yt_dlp/extractor/lifenews.py @@ -1,13 +1,10 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, parse_iso8601, remove_end, @@ -31,7 +28,7 @@ class LifeNewsIE(InfoExtractor): 'timestamp': 1344154740, 'upload_date': '20120805', 'view_count': int, - } + }, }, { # single video embedded via iframe 'url': 'https://life.ru/t/новости/152125', @@ -44,7 +41,7 @@ class LifeNewsIE(InfoExtractor): 'timestamp': 1427961840, 'upload_date': '20150402', 'view_count': int, - } + }, }, { # two videos embedded via iframe 'url': 'https://life.ru/t/новости/153461', @@ -100,7 +97,7 @@ def _real_extract(self, url): webpage) if not video_urls and not iframe_links: - raise ExtractorError('No media links available for %s' % video_id) + raise ExtractorError(f'No media links available for {video_id}') title = remove_end( self._og_search_title(webpage), @@ -125,14 +122,14 @@ def _real_extract(self, url): def make_entry(video_id, video_url, index=None): cur_info = dict(common_info) cur_info.update({ - 'id': video_id if not index else '%s-video%s' % (video_id, index), + 'id': video_id if not index else f'{video_id}-video{index}', 'url': video_url, - 'title': title if not index else '%s (Видео %s)' % (title, index), + 'title': title if not index else f'{title} (Видео {index})', }) return cur_info def make_video_entry(video_id, video_url, index=None): - video_url = compat_urlparse.urljoin(url, video_url) + video_url = urllib.parse.urljoin(url, video_url) return make_entry(video_id, video_url, index) def make_iframe_entry(video_id, video_url, index=None): @@ -174,7 +171,7 @@ class LifeEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': 'e50c2dec2867350528e2574c899b8291', 'thumbnail': r're:http://.*\.jpg', - } + }, }, { # with 1080p 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', @@ -207,17 +204,17 @@ def extract_original(original_url): video_id).get('playlist', {}) if playlist: master = playlist.get('master') - if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': - extract_m3u8(compat_urlparse.urljoin(url, master)) + if isinstance(master, str) and determine_ext(master) == 'm3u8': + extract_m3u8(urllib.parse.urljoin(url, master)) original = playlist.get('original') - if isinstance(original, compat_str): + if isinstance(original, str): extract_original(original) thumbnail = playlist.get('image') # Old rendition fallback if not formats: for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): - video_url = compat_urlparse.urljoin(url, video_url) + video_url = urllib.parse.urljoin(url, video_url) if determine_ext(video_url) == 'm3u8': extract_m3u8(video_url) else: diff --git a/yt_dlp/extractor/likee.py b/yt_dlp/extractor/likee.py index 74ee2bea9c..f6a51c8ee5 100644 --- a/yt_dlp/extractor/likee.py +++ b/yt_dlp/extractor/likee.py @@ -22,8 +22,6 @@ class LikeeIE(InfoExtractor): 'description': 'md5:9a7ebe816f0e78722ee5ed76f75983b4', 'thumbnail': r're:^https?://.+\.jpg', 'uploader': 'Huỳnh Hồng Quân ', - 'play_count': int, - 'download_count': int, 'artist': 'Huỳnh Hồng Quân ', 'timestamp': 1651571320, 'upload_date': 
'20220503', @@ -44,11 +42,9 @@ class LikeeIE(InfoExtractor): 'comment_count': int, 'like_count': int, 'uploader': 'Vương Phước Nhi', - 'download_count': int, 'timestamp': 1651506835, 'upload_date': '20220502', 'duration': 60024, - 'play_count': int, 'artist': 'Vương Phước Nhi', 'uploader_id': '649222262', 'view_count': int, @@ -65,9 +61,7 @@ class LikeeIE(InfoExtractor): 'duration': 9684, 'uploader_id': 'fernanda_rivasg', 'view_count': int, - 'play_count': int, 'artist': 'La Cami La✨', - 'download_count': int, 'like_count': int, 'uploader': 'Fernanda Rivas🎶', 'timestamp': 1614034308, @@ -83,13 +77,11 @@ class LikeeIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.jpg', 'comment_count': int, 'duration': 18014, - 'play_count': int, 'view_count': int, 'timestamp': 1611694774, 'like_count': int, 'uploader': 'Fernanda Rivas🎶', 'uploader_id': 'fernanda_rivasg', - 'download_count': int, 'artist': 'ʟᴇʀɪᴋ_ᴜɴɪᴄᴏʀɴ♡︎', 'upload_date': '20210126', }, @@ -128,8 +120,6 @@ def _real_extract(self, url): 'description': info.get('share_desc'), 'view_count': int_or_none(info.get('video_count')), 'like_count': int_or_none(info.get('likeCount')), - 'play_count': int_or_none(info.get('play_count')), - 'download_count': int_or_none(info.get('download_count')), 'comment_count': int_or_none(info.get('comment_count')), 'uploader': str_or_none(info.get('nick_name')), 'uploader_id': str_or_none(info.get('likeeId')), @@ -172,7 +162,7 @@ def _entries(self, user_name, user_id): 'count': self._PAGE_SIZE, 'lastPostId': last_post_id, 'tabType': 0, - }).encode('utf-8'), + }).encode(), headers={'content-type': 'application/json'}, note=f'Get user info with lastPostId #{last_post_id}') items = traverse_obj(user_videos, ('data', 'videoList')) diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py index e11ec43d66..763a01448c 100644 --- a/yt_dlp/extractor/limelight.py +++ b/yt_dlp/extractor/limelight.py @@ -1,15 +1,15 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, determine_ext, float_or_none, int_or_none, smuggle_url, try_get, unsmuggle_url, - ExtractorError, ) @@ -32,8 +32,8 @@ def smuggle(url): r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage): entries.append(cls.url_result( - smuggle('limelight:%s:%s' % (lm[kind], video_id)), - 'Limelight%s' % kind, video_id)) + smuggle(f'limelight:{lm[kind]}:{video_id}'), + f'Limelight{kind}', video_id)) for mobj in re.finditer( # As per [1] class attribute should be exactly equal to # LimelightEmbeddedPlayerFlash but numerous examples seen @@ -48,14 +48,14 @@ def smuggle(url): ''', webpage): kind, video_id = mobj.group('kind'), mobj.group('id') entries.append(cls.url_result( - smuggle('limelight:%s:%s' % (kind, video_id)), - 'Limelight%s' % kind.capitalize(), video_id)) + smuggle(f'limelight:{kind}:{video_id}'), + f'Limelight{kind.capitalize()}', video_id)) # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) for video_id in re.findall( r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})', webpage): entries.append(cls.url_result( - smuggle('limelight:media:%s' % video_id), + smuggle(f'limelight:media:{video_id}'), LimelightMediaIE.ie_key(), video_id)) return entries @@ -66,11 +66,11 @@ def _call_playlist_service(self, item_id, method, fatal=True, referer=None): try: 
return self._download_json( self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, + item_id, f'Downloading PlaylistService {method} JSON', fatal=fatal, headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + error = self._parse_json(e.cause.response.read().decode(), item_id)['detail']['contentAccessPermission'] if error == 'CountryDisabled': self.raise_geo_restricted() raise ExtractorError(error, expected=True) @@ -134,7 +134,7 @@ def _extract_info(self, pc, mobile, i, referer): for cdn_host, http_host in CDN_HOSTS: if cdn_host not in rtmp.group('host').lower(): continue - http_url = 'http://%s/%s' % (http_host, rtmp.group('playpath')[4:]) + http_url = 'http://{}/{}'.format(http_host, rtmp.group('playpath')[4:]) urls.append(http_url) if self._is_valid_url(http_url, video_id, http_format_id): http_fmt = fmt.copy() @@ -351,7 +351,7 @@ def _real_extract(self, url): channel_list_id, 'getMobileChannelListById') entries = [ - self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') + self.url_result('limelight:channel:{}'.format(channel['id']), 'LimelightChannel') for channel in channel_list['channelList']] return self.playlist_result( diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py deleted file mode 100644 index 3fab9c8a5d..0000000000 --- a/yt_dlp/extractor/line.py +++ /dev/null @@ -1,143 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - format_field, - int_or_none, - str_or_none, -) - - -class LineLiveBaseIE(InfoExtractor): - _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' - - def _parse_broadcast_item(self, item): - broadcast_id = compat_str(item['id']) - title = item['title'] - is_live = item.get('isBroadcastingNow') - - thumbnails = [] - for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - }) - - channel = item.get('channel') or {} - channel_id = str_or_none(channel.get('id')) - - return { - 'id': broadcast_id, - 'title': title, - 'thumbnails': thumbnails, - 'timestamp': int_or_none(item.get('createdAt')), - 'channel': channel.get('name'), - 'channel_id': channel_id, - 'channel_url': format_field(channel_id, None, 'https://live.line.me/channels/%s'), - 'duration': int_or_none(item.get('archiveDuration')), - 'view_count': int_or_none(item.get('viewerCount')), - 'comment_count': int_or_none(item.get('chatCount')), - 'is_live': is_live, - } - - -class LineLiveIE(LineLiveBaseIE): - _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)' - _TESTS = [{ - 'url': 'https://live.line.me/channels/5833718/broadcast/18373277', - 'md5': '2c15843b8cb3acd55009ddcb2db91f7c', - 'info_dict': { - 'id': '18373277', - 'title': '2021/12/05 (15分犬)定例譲渡会🐶', - 'ext': 'mp4', - 'timestamp': 1638674925, - 'upload_date': '20211205', - 'thumbnail': 'md5:e1f5817e60f4a72b7e43377cf308d7ef', - 'channel_url': 'https://live.line.me/channels/5833718', - 'channel': 'Yahooニュース掲載🗞プロフ見てね🐕🐕', - 'channel_id': '5833718', - 'duration': 937, - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - }, { - # archiveStatus == 'DELETED' - 'url': 
'https://live.line.me/channels/4778159/broadcast/16378488', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id, broadcast_id = self._match_valid_url(url).groups() - broadcast = self._download_json( - self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), - broadcast_id) - item = broadcast['item'] - info = self._parse_broadcast_item(item) - protocol = 'm3u8' if info['is_live'] else 'm3u8_native' - formats = [] - for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): - if not v: - continue - if k == 'abr': - formats.extend(self._extract_m3u8_formats( - v, broadcast_id, 'mp4', protocol, - m3u8_id='hls', fatal=False)) - continue - f = { - 'ext': 'mp4', - 'format_id': 'hls-' + k, - 'protocol': protocol, - 'url': v, - } - if not k.isdigit(): - f['vcodec'] = 'none' - formats.append(f) - if not formats: - archive_status = item.get('archiveStatus') - if archive_status != 'ARCHIVED': - self.raise_no_formats('this video has been ' + archive_status.lower(), expected=True) - info['formats'] = formats - return info - - -class LineLiveChannelIE(LineLiveBaseIE): - _VALID_URL = r'https?://live\.line\.me/channels/(?P<id>\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' - _TEST = { - 'url': 'https://live.line.me/channels/5893542', - 'info_dict': { - 'id': '5893542', - 'title': 'いくらちゃんだよぉ🦒', - 'description': 'md5:4d418087973ad081ceb1b3481f0b1816', - }, - 'playlist_mincount': 29 - } - - def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): - while True: - for row in (archived_broadcasts.get('rows') or []): - share_url = str_or_none(row.get('shareURL')) - if not share_url: - continue - info = self._parse_broadcast_item(row) - info.update({ - '_type': 'url', - 'url': share_url, - 'ie_key': LineLiveIE.ie_key(), - }) - yield info - if not archived_broadcasts.get('hasNextPage'): - return - archived_broadcasts = self._download_json( - self._API_BASE_URL + channel_id + '/archived_broadcasts', - channel_id, query={ - 'lastId': info['id'], - }) - - def _real_extract(self, url): - channel_id = self._match_id(url) - channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) - return self.playlist_result( - self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), - channel_id, channel.get('title'), channel.get('information')) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index 2bf2e9a117..c8c8ae52ad 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,18 +1,17 @@ -from itertools import zip_longest +import itertools import re from .common import InfoExtractor from ..utils import ( - clean_html, - extract_attributes, ExtractorError, + extract_attributes, float_or_none, - get_element_by_class, int_or_none, - srt_subtitles_timecode, - strip_or_none, mimetype2ext, + srt_subtitles_timecode, + traverse_obj, try_get, + url_or_none, urlencode_postdata, urljoin, ) @@ -60,14 +59,14 @@ def _call_api(self, course_slug, fields, video_slug=None, resolution=None): if video_slug: query.update({ 'videoSlug': video_slug, - 'resolution': '_%s' % resolution, + 'resolution': f'_{resolution}', }) sub = ' %dp' % resolution api_url = 'https://www.linkedin.com/learning-api/detailedCourses' if not self._get_cookies(api_url).get('JSESSIONID'): self.raise_login_required() return self._download_json( - api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + api_url, video_slug, f'Downloading{sub} JSON metadata', headers={ 'Csrf-Token': 
self._get_cookies(api_url)['JSESSIONID'].value, }, query=query)['elements'][0] @@ -79,19 +78,33 @@ def _get_urn_id(self, video_data): return mobj.group(1) def _get_video_id(self, video_data, course_slug, video_slug): - return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) + return self._get_urn_id(video_data) or f'{course_slug}/{video_slug}' class LinkedInIE(LinkedInBaseIE): - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P<id>\d+)-\w{4}/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', 'info_dict': { 'id': '6850898786781339649', 'ext': 'mp4', - 'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing', - 'description': 'md5:be125430bab1c574f16aeb186a4d5b19', - 'creator': 'Mishal K.' + 'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing #nowhiring #sendinblue…', + 'description': 'md5:2998a31f6f479376dd62831f53a80f71', + 'uploader': 'Mishal K.', + 'thumbnail': 're:^https?://media.licdn.com/dms/image/.*$', + 'like_count': int, + }, + }, { + 'url': 'https://www.linkedin.com/posts/the-mathworks_2_what-is-mathworks-cloud-center-activity-7151241570371948544-4Gu7', + 'info_dict': { + 'id': '7151241570371948544', + 'ext': 'mp4', + 'title': 'MathWorks on LinkedIn: What Is MathWorks Cloud Center?', + 'description': 'md5:95f9d4eeb6337882fb47eefe13d7a40c', + 'uploader': 'MathWorks', + 'thumbnail': 're:^https?://media.licdn.com/dms/image/.*$', + 'like_count': int, + 'subtitles': 'mincount:1', }, }] @@ -99,26 +112,30 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_extract_title(webpage) - description = clean_html(get_element_by_class('share-update-card__update-text', webpage)) - like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) - creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) - - sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id) + video_attrs = extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video')) + sources = self._parse_json(video_attrs['data-sources'], video_id) formats = [{ 'url': source['src'], 'ext': mimetype2ext(source.get('type')), 'tbr': float_or_none(source.get('data-bitrate'), scale=1000), } for source in sources] + subtitles = {'en': [{ + 'url': video_attrs['data-captions-url'], + 'ext': 'vtt', + }]} if url_or_none(video_attrs.get('data-captions-url')) else {} return { 'id': video_id, 'formats': formats, - 'title': title, - 'like_count': like_count, - 'creator': creator, + 'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage), + 'like_count': int_or_none(self._search_regex( + r'\bdata-num-reactions="(\d+)"', webpage, 'reactions', default=None)), + 'uploader': traverse_obj( + self._yield_json_ld(webpage, video_id), + (lambda _, v: v['@type'] == 'SocialMediaPosting', 'author', 'name', {str}), get_all=False), 'thumbnail': self._og_search_thumbnail(webpage), - 'description': description, + 'description': self._og_search_description(webpage, default=None), + 'subtitles': subtitles, } @@ -139,12 +156,13 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): def json2srt(self, transcript_lines, duration=None): srt_data = '' - 
for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])): + for line, (line_dict, next_dict) in enumerate(itertools.zip_longest(transcript_lines, transcript_lines[1:])): start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption'] end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1 - srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time), - srt_subtitles_timecode(end_time), - caption) + srt_data += ( + f'{line + 1}\n' + f'{srt_subtitles_timecode(start_time)} --> {srt_subtitles_timecode(end_time)}\n' + f'{caption}\n\n') return srt_data def _real_extract(self, url): @@ -159,7 +177,7 @@ def _real_extract(self, url): progressive_url = video_url_data.get('progressiveUrl') if progressive_url: formats.append({ - 'format_id': 'progressive-%dp' % height, + 'format_id': f'progressive-{height}p', 'url': progressive_url, 'ext': 'mp4', 'height': height, @@ -191,7 +209,7 @@ def _real_extract(self, url): if transcript_lines: subtitles['en'] = [{ 'ext': 'srt', - 'data': self.json2srt(transcript_lines, duration) + 'data': self.json2srt(transcript_lines, duration), }] return { @@ -205,7 +223,7 @@ def _real_extract(self, url): # It seems like this would be correctly handled by default # However, unless someone can confirm this, the old # behaviour is being kept as-is - '_format_sort_fields': ('res', 'source_preference') + '_format_sort_fields': ('res', 'source_preference'), } @@ -224,7 +242,7 @@ class LinkedInLearningCourseIE(LinkedInLearningBaseIE): @classmethod def suitable(cls, url): - return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) + return False if LinkedInLearningIE.suitable(url) else super().suitable(url) def _real_extract(self, url): course_slug = self._match_id(url) @@ -242,7 +260,7 @@ def _real_extract(self, url): '_type': 'url_transparent', 'id': self._get_video_id(video, course_slug, video_slug), 'title': video.get('title'), - 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), + 'url': f'https://www.linkedin.com/learning/{course_slug}/{video_slug}', 'chapter': chapter_title, 'chapter_number': chapter_number, 'chapter_id': chapter_id, diff --git a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py deleted file mode 100644 index 7bb64e17c4..0000000000 --- a/yt_dlp/extractor/linuxacademy.py +++ /dev/null @@ -1,241 +0,0 @@ -import json -import random - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_HTTPError, - compat_str, -) -from ..utils import ( - clean_html, - ExtractorError, - js_to_json, - parse_duration, - try_get, - unified_timestamp, - urlencode_postdata, - urljoin, -) - - -class LinuxAcademyIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:www\.)?linuxacademy\.com/cp/ - (?: - courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| - modules/view/id/(?P<course_id>\d+) - ) - ''' - _TESTS = [{ - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', - 'info_dict': { - 'id': '7971-2', - 'ext': 'mp4', - 'title': 'What Is Data Science', - 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', - 'timestamp': int, # The timestamp and upload date changes - 'upload_date': r're:\d+', - 'duration': 304, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires Linux Academy account credentials', - }, { - 'url': 
'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', - 'only_matching': True, - }, { - 'url': 'https://linuxacademy.com/cp/modules/view/id/154', - 'info_dict': { - 'id': '154', - 'title': 'AWS Certified Cloud Practitioner', - 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', - 'duration': 28835, - }, - 'playlist_count': 41, - 'skip': 'Requires Linux Academy account credentials', - }, { - 'url': 'https://linuxacademy.com/cp/modules/view/id/39', - 'info_dict': { - 'id': '39', - 'title': 'Red Hat Certified Systems Administrator - RHCSA (EX200) Exam Prep (legacy)', - 'description': 'md5:0f1d3369e90c3fb14a79813b863c902f', - 'duration': 89280, - }, - 'playlist_count': 73, - 'skip': 'Requires Linux Academy account credentials', - }] - - _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' - _ORIGIN_URL = 'https://linuxacademy.com' - _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' - _NETRC_MACHINE = 'linuxacademy' - - def _perform_login(self, username, password): - def random_string(): - return ''.join(random.choices( - '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32)) - - webpage, urlh = self._download_webpage_handle( - self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ - 'client_id': self._CLIENT_ID, - 'response_type': 'token id_token', - 'response_mode': 'web_message', - 'redirect_uri': self._ORIGIN_URL, - 'scope': 'openid email user_impersonation profile', - 'audience': self._ORIGIN_URL, - 'state': random_string(), - 'nonce': random_string(), - }) - - login_data = self._parse_json( - self._search_regex( - r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'login info', group='value'), None, - transform_source=lambda x: compat_b64decode(x).decode('utf-8') - )['extraParams'] - - login_data.update({ - 'client_id': self._CLIENT_ID, - 'redirect_uri': self._ORIGIN_URL, - 'tenant': 'lacausers', - 'connection': 'Username-Password-ACG-Proxy', - 'username': username, - 'password': password, - 'sso': 'true', - }) - - login_state_url = urlh.geturl() - - try: - login_page = self._download_webpage( - 'https://login.linuxacademy.com/usernamepassword/login', None, - 'Downloading login page', data=json.dumps(login_data).encode(), - headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://login.linuxacademy.com', - 'Referer': login_state_url, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read(), None) - message = error.get('description') or error['code'] - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message), expected=True) - raise - - callback_page, urlh = self._download_webpage_handle( - 'https://login.linuxacademy.com/login/callback', None, - 'Downloading callback page', - data=urlencode_postdata(self._hidden_inputs(login_page)), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Origin': 'https://login.linuxacademy.com', - 'Referer': login_state_url, - }) - - access_token = self._search_regex( - r'access_token=([^=&]+)', urlh.geturl(), - 'access token', default=None) - if not access_token: - access_token = self._parse_json( - self._search_regex( - r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, - 'authorization response'), None, - transform_source=js_to_json)['response']['access_token'] - - self._download_webpage( - 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' - % access_token, None, 'Downloading token validation page') - - def _real_extract(self, url): - mobj = 
self._match_valid_url(url) - chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') - item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) - - webpage = self._download_webpage(url, item_id) - - # course path - if course_id: - module = self._parse_json( - self._search_regex( - r'window\.module\s*=\s*({(?:(?!};)[^"]|"([^"]|\\")*")+})\s*;', webpage, 'module'), - item_id) - entries = [] - chapter_number = None - chapter = None - chapter_id = None - for item in module['items']: - if not isinstance(item, dict): - continue - - def type_field(key): - return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() - type_fields = (type_field('name'), type_field('slug')) - # Move to next module section - if 'section' in type_fields: - chapter = item.get('course_name') - chapter_id = item.get('course_module') - chapter_number = 1 if not chapter_number else chapter_number + 1 - continue - # Skip non-lessons - if 'lesson' not in type_fields: - continue - lesson_url = urljoin(url, item.get('url')) - if not lesson_url: - continue - title = item.get('title') or item.get('lesson_name') - description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) - entries.append({ - '_type': 'url_transparent', - 'url': lesson_url, - 'ie_key': LinuxAcademyIE.ie_key(), - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), - 'duration': parse_duration(item.get('duration')), - 'chapter': chapter, - 'chapter_id': chapter_id, - 'chapter_number': chapter_number, - }) - return { - '_type': 'playlist', - 'entries': entries, - 'id': course_id, - 'title': module.get('title'), - 'description': module.get('md_desc') or clean_html(module.get('desc')), - 'duration': parse_duration(module.get('duration')), - } - - # single video path - m3u8_url = self._parse_json( - self._search_regex( - r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), - item_id)[0]['file'] - formats = self._extract_m3u8_formats( - m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - info = { - 'id': item_id, - 'formats': formats, - } - lesson = self._parse_json( - self._search_regex( - (r'window\.lesson\s*=\s*({.+?})\s*;', - r'player\.lesson\s*=\s*({.+?})\s*;'), - webpage, 'lesson', default='{}'), item_id, fatal=False) - if lesson: - info.update({ - 'title': lesson.get('lesson_name'), - 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), - 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), - 'duration': parse_duration(lesson.get('duration')), - }) - if not info.get('title'): - info['title'] = self._search_regex( - (r'>Lecture\s*:\s*(?P<value>[^<]+)', - r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'title', group='value') - return info diff --git a/yt_dlp/extractor/liputan6.py b/yt_dlp/extractor/liputan6.py index c4477b93e0..a29234a3ad 100644 --- a/yt_dlp/extractor/liputan6.py +++ b/yt_dlp/extractor/liputan6.py @@ -25,8 +25,8 @@ class Liputan6IE(InfoExtractor): 'tags': ['perawat indonesia', 'rumah sakit', 'Medan', 'viral hari ini', 'viral', 'enamplus'], 'channel': 'Default Channel', 'dislike_count': int, - 'upload_date': '20220707' - } + 'upload_date': '20220707', + }, }, { 'url': 'https://www.liputan6.com/tv/read/5007719/video-program-minyakita-minyak-goreng-kemasan-sederhana-seharga-rp-14-ribu', 'info_dict': { @@ -49,7 +49,7 @@ class 
Liputan6IE(InfoExtractor): 'thumbnail': 'https://thumbor.prod.vidiocdn.com/AAIOjz-64hKojjdw5hr0oNNEeJg=/640x360/filters:quality(70)/vidio-web-prod-video/uploads/video/image/7082543/program-minyakita-minyak-goreng-kemasan-sederhana-seharga-rp14-ribu-_-liputan-6-7d9fbb.jpg', 'channel': 'Liputan 6 Pagi', 'view_count': int, - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/listennotes.py b/yt_dlp/extractor/listennotes.py index 4ebc9be4d1..61eae95edf 100644 --- a/yt_dlp/extractor/listennotes.py +++ b/yt_dlp/extractor/listennotes.py @@ -31,7 +31,7 @@ class ListenNotesIE(InfoExtractor): 'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg', 'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/', 'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'], - } + }, }, { 'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/', 'md5': '62fb4ffe7fc525632a1138bf72a5ce53', @@ -47,7 +47,7 @@ class ListenNotesIE(InfoExtractor): 'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/', 'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg', 'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'], - } + }, }] def _clean_description(self, description): @@ -82,5 +82,5 @@ def _real_extract(self, url): 'cast': ('nlp_entities', ..., 'name'), 'channel_url': 'channel_url', 'channel_id': 'channel_short_uuid', - }) + }), } diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 31826ac99e..93f926a9ff 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -4,8 +4,9 @@ from ..utils import ( ExtractorError, int_or_none, - traverse_obj, smuggle_url, + traverse_obj, + try_call, unsmuggle_url, ) @@ -13,7 +14,7 @@ class LiTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)' - _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s' _TESTS = [{ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', @@ -21,16 +22,18 @@ class LiTVIE(InfoExtractor): 'id': 'VOD00041606', 'title': '花千骨', }, - 'playlist_count': 50, + 'playlist_count': 51, # 50 episodes + 1 trailer }, { 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', - 'md5': '969e343d9244778cb29acec608e53640', + 'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', 'title': '花千骨第1集', 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。', + 'categories': ['奇幻', '愛情', '中國', '仙俠'], + 'episode': 'Episode 1', 'episode_number': 1, }, 'params': { @@ -46,20 +49,17 @@ class LiTVIE(InfoExtractor): 'title': '芈月傳第1集 霸星芈月降世楚國', 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', }, - 'skip': 'Georestricted to Taiwan', + 'skip': 'No longer exists', }] - def _extract_playlist(self, season_list, 
video_id, program_info, prompt=True): - episode_title = program_info['title'] - content_id = season_list['contentId'] - + def _extract_playlist(self, playlist_data, content_type): all_episodes = [ self.url_result(smuggle_url( - self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']), + self._URL_TEMPLATE % (content_type, episode['contentId']), {'force_noplaylist': True})) # To prevent infinite recursion - for episode in season_list['episode']] + for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))] - return self.playlist_result(all_episodes, content_id, episode_title) + return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title')) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -68,36 +68,52 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) + if self._search_regex( + r'(?i)<meta\s[^>]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"', + webpage, 'meta refresh redirect', default=False, group=0): + raise ExtractorError('No such content found', expected=True) + program_info = self._parse_json(self._search_regex( r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), video_id) - season_list = list(program_info.get('seasonList', {}).values()) - playlist_id = traverse_obj(season_list, 0, 'contentId') - if self._yes_playlist(playlist_id, video_id, smuggled_data): - return self._extract_playlist(season_list[0], video_id, program_info) - - # In browsers `getMainUrl` request is always issued. Usually this + # In browsers `getProgramInfo` request is always issued. Usually this # endpoint gives the same result as the data embedded in the webpage. - # If georestricted, there are no embedded data, so an extra request is - # necessary to get the error code + # If, for some reason, there are no embedded data, we do an extra request. 
if 'assetId' not in program_info: program_info = self._download_json( 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, query={'contentId': video_id}, headers={'Accept': 'application/json'}) + + series_id = program_info['seriesId'] + if self._yes_playlist(series_id, video_id, smuggled_data): + playlist_data = self._download_json( + 'https://www.litv.tv/vod/ajax/getSeriesTree', video_id, + query={'seriesId': series_id}, headers={'Accept': 'application/json'}) + return self._extract_playlist(playlist_data, program_info['contentType']) + video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) if not video_data: - payload = { - 'assetId': program_info['assetId'], - 'watchDevices': program_info['watchDevices'], - 'contentType': program_info['contentType'], - } + payload = {'assetId': program_info['assetId']} + puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value) + if puid: + payload.update({ + 'type': 'auth', + 'puid': puid, + }) + endpoint = 'getUrl' + else: + payload.update({ + 'watchDevices': program_info['watchDevices'], + 'contentType': program_info['contentType'], + }) + endpoint = 'getMainUrlNoAuth' video_data = self._download_json( - 'https://www.litv.tv/vod/getMainUrl', video_id, - data=json.dumps(payload).encode('utf-8'), + f'https://www.litv.tv/vod/ajax/{endpoint}', video_id, + data=json.dumps(payload).encode(), headers={'Content-Type': 'application/json'}) if not video_data.get('fullpath'): @@ -105,15 +121,15 @@ def _real_extract(self, url): if error_msg == 'vod.error.outsideregionerror': self.raise_geo_restricted('This video is available in Taiwan only') if error_msg: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) - raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + raise ExtractorError(f'{self.IE_NAME} said: {error_msg}', expected=True) + raise ExtractorError(f'Unexpected result from {self.IE_NAME}') formats = self._extract_m3u8_formats( video_data['fullpath'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity' title = program_info['title'] + program_info.get('secondaryMark', '') description = program_info.get('description') diff --git a/yt_dlp/extractor/livejournal.py b/yt_dlp/extractor/livejournal.py index 96bd8b2335..c61f9bec7a 100644 --- a/yt_dlp/extractor/livejournal.py +++ b/yt_dlp/extractor/livejournal.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import int_or_none @@ -14,7 +13,7 @@ class LiveJournalIE(InfoExtractor): 'title': 'Истребители против БПЛА', 'upload_date': '20190624', 'timestamp': 1561406715, - } + }, } def _real_extract(self, url): @@ -23,7 +22,7 @@ def _real_extract(self, url): record = self._parse_json(self._search_regex( r'Site\.page\s*=\s*({.+?});', webpage, 'page data'), video_id)['video']['record'] - storage_id = compat_str(record['storageid']) + storage_id = str(record['storageid']) title = record.get('name') if title: # remove filename extension(.mp4, .mov, etc...) 
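
[Editorial note, not part of the patch: nearly every hunk in this patch applies the same two mechanical migrations — `%`-style string formatting becomes f-strings, and the `compat_*` shims (`compat_str`, `compat_urlparse`, `compat_HTTPError`) are replaced by `str`, `urllib.parse` and `yt_dlp.networking.exceptions.HTTPError`. Below is a minimal sketch of the error-handling half of that migration; `ExampleIE`, its `_call_api` helper and the `message` key are hypothetical placeholders, while the import path, `e.cause.status` and `e.cause.response.read()` are exactly as used in the lego.py and limelight.py hunks above.]

from .common import InfoExtractor
from ..networking.exceptions import HTTPError  # replaces ..compat.compat_HTTPError
from ..utils import ExtractorError


class ExampleIE(InfoExtractor):  # hypothetical extractor, for illustration only
    def _call_api(self, url, video_id):
        try:
            return self._download_json(url, video_id, 'Downloading API JSON')
        except ExtractorError as e:
            # compat_HTTPError exposed the status as e.cause.code and the body on
            # e.cause itself; the networking-layer HTTPError exposes e.cause.status
            # and keeps the readable body on e.cause.response
            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                error = self._parse_json(e.cause.response.read().decode(), video_id)
                raise ExtractorError(
                    f'{self.IE_NAME} said: {error["message"]}', expected=True)
            raise

[For comparison, the deleted linuxacademy.py above still shows the old `compat_HTTPError`/`e.cause.code`/`e.cause.read()` shape, so both styles can be seen side by side within this patch.]
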
diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py index d883eafcff..7f7947ee7b 100644 --- a/yt_dlp/extractor/livestream.py +++ b/yt_dlp/extractor/livestream.py @@ -1,33 +1,36 @@ -import re import itertools +import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( - find_xpath_attr, - xpath_attr, - xpath_with_ns, - xpath_text, - orderedSet, - update_url_query, - int_or_none, - float_or_none, - parse_iso8601, determine_ext, + find_xpath_attr, + float_or_none, + int_or_none, + orderedSet, + parse_iso8601, + traverse_obj, + update_url_query, + xpath_attr, + xpath_text, + xpath_with_ns, ) class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' - _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' + _VALID_URL = r'''(?x) + https?://(?:new\.)?livestream\.com/ + (?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+)) + (?:/events/(?P<event_id>\d+)|/(?P<event_name>[^/]+))? + (?:/videos/(?P<id>\d+))? + ''' _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"'] _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', - 'md5': '53274c76ba7754fb0e8d072716f2292b', + 'md5': '7876c5f5dc3e711b6b73acce4aac1527', 'info_dict': { 'id': '4719370', 'ext': 'mp4', @@ -37,22 +40,37 @@ class LivestreamIE(InfoExtractor): 'duration': 5968.0, 'like_count': int, 'view_count': int, - 'thumbnail': r're:^http://.*\.jpg$' - } + 'comment_count': int, + 'thumbnail': r're:^http://.*\.jpg$', + }, }, { - 'url': 'http://new.livestream.com/tedx/cityenglish', + 'url': 'https://livestream.com/coheedandcambria/websterhall', 'info_dict': { - 'title': 'TEDCity2.0 (English)', - 'id': '2245590', + 'id': '1585861', + 'title': 'Live From Webster Hall', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://livestream.com/dayananda/events/7954027', + 'info_dict': { + 'title': 'Live from Mevo', + 'id': '7954027', }, 'playlist_mincount': 4, }, { - 'url': 'http://new.livestream.com/chess24/tatasteelchess', + 'url': 'https://livestream.com/accounts/82', 'info_dict': { - 'title': 'Tata Steel Chess', - 'id': '3705884', + 'id': '253978', + 'view_count': int, + 'title': 'trsr', + 'comment_count': int, + 'like_count': int, + 'upload_date': '20120306', + 'timestamp': 1331042383, + 'thumbnail': 'http://img.new.livestream.com/videos/0000000000000372/cacbeed6-fb68-4b5e-ad9c-e148124e68a9_640x427.jpg', + 'duration': 15.332, + 'ext': 'mp4', }, - 'playlist_mincount': 60, }, { 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', 'only_matching': True, @@ -62,7 +80,8 @@ class LivestreamIE(InfoExtractor): }] _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base_ele = find_xpath_attr( smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' @@ -73,7 +92,7 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para for vn in video_nodes: tbr = 
int_or_none(vn.attrib.get('system-bitrate'), 1000) furl = ( - update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), { + update_url_query(urllib.parse.urljoin(base, vn.attrib['src']), { 'v': '3.0.3', 'fp': 'WIN% 14,0,0,145', })) @@ -86,10 +105,10 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para 'tbr': tbr, 'preference': -1000, # Strictly inferior than all other formats? }) - return formats + return formats, {} def _extract_video_info(self, video_data): - video_id = compat_str(video_data['id']) + video_id = str(video_data['id']) FORMAT_KEYS = ( ('sd', 'progressive_url'), @@ -104,7 +123,7 @@ def _extract_video_info(self, video_data): if ext == 'm3u8': continue bitrate = int_or_none(self._search_regex( - r'(\d+)\.%s' % ext, video_url, 'bitrate', default=None)) + rf'(\d+)\.{ext}', video_url, 'bitrate', default=None)) formats.append({ 'url': video_url, 'format_id': format_id, @@ -150,7 +169,7 @@ def _extract_video_info(self, video_data): } def _extract_stream_info(self, stream_info): - broadcast_id = compat_str(stream_info['broadcast_id']) + broadcast_id = str(stream_info['broadcast_id']) is_live = stream_info.get('is_live') formats = [] @@ -179,9 +198,9 @@ def _extract_stream_info(self, stream_info): 'is_live': is_live, } - def _extract_event(self, event_data): - event_id = compat_str(event_data['id']) - account_id = compat_str(event_data['owner_account_id']) + def _generate_event_playlist(self, event_data): + event_id = str(event_data['id']) + account_id = str(event_data['owner_account_id']) feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' stream_info = event_data.get('stream_info') @@ -189,39 +208,44 @@ def _extract_event(self, event_data): return self._extract_stream_info(stream_info) last_video = None - entries = [] for i in itertools.count(1): if last_video is None: info_url = feed_root_url else: - info_url = '{root}?&id={id}&newer=-1&type=video'.format( - root=feed_root_url, id=last_video) + info_url = f'{feed_root_url}?&id={last_video}&newer=-1&type=video' videos_info = self._download_json( - info_url, event_id, 'Downloading page {0}'.format(i))['data'] + info_url, event_id, f'Downloading page {i}')['data'] videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] if not videos_info: break for v in videos_info: - v_id = compat_str(v['id']) - entries.append(self.url_result( - 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), - 'Livestream', v_id, v.get('caption'))) + v_id = str(v['id']) + yield self.url_result( + f'http://livestream.com/accounts/{account_id}/events/{event_id}/videos/{v_id}', + LivestreamIE, v_id, v.get('caption')) last_video = videos_info[-1]['id'] - return self.playlist_result(entries, event_id, event_data['full_name']) def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') event = mobj.group('event_id') or mobj.group('event_name') account = mobj.group('account_id') or mobj.group('account_name') - api_url = self._API_URL_TEMPLATE % (account, event) + api_url = f'http://livestream.com/api/accounts/{account}' + if video_id: video_data = self._download_json( - api_url + '/videos/%s' % video_id, video_id) + f'{api_url}/events/{event}/videos/{video_id}', video_id) return self._extract_video_info(video_data) - else: - event_data = self._download_json(api_url, video_id) - return self._extract_event(event_data) + elif event: + event_data = self._download_json(f'{api_url}/events/{event}', None) + return 
self.playlist_result( + self._generate_event_playlist(event_data), str(event_data['id']), event_data['full_name']) + + account_data = self._download_json(api_url, None) + items = traverse_obj(account_data, (('upcoming_events', 'past_events'), 'data', ...)) + return self.playlist_result( + itertools.chain.from_iterable(map(self._generate_event_playlist, items)), + account_data.get('id'), account_data.get('full_name')) # The original version of Livestream uses a different system @@ -253,7 +277,7 @@ class LivestreamOriginalIE(InfoExtractor): }] def _extract_video_info(self, user, video_id): - api_url = 'http://x%sx.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id=%s' % (user, video_id) + api_url = f'http://x{user}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={video_id}' info = self._download_xml(api_url, video_id) item = info.find('channel').find('item') @@ -310,7 +334,7 @@ def _extract_folder(self, url, folder_id): entries = [{ '_type': 'url', - 'url': compat_urlparse.urljoin(url, p), + 'url': urllib.parse.urljoin(url, p), } for p in paths] return self.playlist_result(entries, folder_id) @@ -324,10 +348,10 @@ def _real_extract(self, url): return self._extract_folder(url, content_id) else: # this url is used on mobile devices - stream_url = 'http://x%sx.api.channel.livestream.com/3.0/getstream.json' % user + stream_url = f'http://x{user}x.api.channel.livestream.com/3.0/getstream.json' info = {} if content_id: - stream_url += '?id=%s' % content_id + stream_url += f'?id={content_id}' info = self._extract_video_info(user, content_id) else: content_id = user @@ -356,8 +380,7 @@ class LivestreamShortenerIE(InfoExtractor): _VALID_URL = r'https?://livestre\.am/(?P<id>.+)' def _real_extract(self, url): - mobj = self._match_valid_url(url) - id = mobj.group('id') - webpage = self._download_webpage(url, id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) return self.url_result(self._og_search_url(webpage)) diff --git a/yt_dlp/extractor/livestreamfails.py b/yt_dlp/extractor/livestreamfails.py index 0df638422c..c18d05d500 100644 --- a/yt_dlp/extractor/livestreamfails.py +++ b/yt_dlp/extractor/livestreamfails.py @@ -16,7 +16,7 @@ class LivestreamfailsIE(InfoExtractor): 'thumbnail': r're:^https?://.+', 'timestamp': 1656271785, 'upload_date': '20220626', - } + }, }, { 'url': 'https://livestreamfails.com/post/139200', 'only_matching': True, @@ -33,5 +33,5 @@ def _real_extract(self, url): 'url': f'https://livestreamfails-video-prod.b-cdn.net/video/{api_response["videoId"]}', 'title': api_response.get('label'), 'creator': traverse_obj(api_response, ('streamer', 'label')), - 'thumbnail': format_field(api_response, 'imageId', 'https://livestreamfails-image-prod.b-cdn.net/image/%s') + 'thumbnail': format_field(api_response, 'imageId', 'https://livestreamfails-image-prod.b-cdn.net/image/%s'), } diff --git a/yt_dlp/extractor/lnkgo.py b/yt_dlp/extractor/lnkgo.py index 6282d2eaf3..31a7cefd82 100644 --- a/yt_dlp/extractor/lnkgo.py +++ b/yt_dlp/extractor/lnkgo.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, format_field, @@ -58,10 +57,10 @@ def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() video_info = self._download_json( - 'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'), + 'https://lnk.lt/api/main/video-page/{}/{}/false'.format(display_id, video_id or '0'), display_id)['videoConfig']['videoInfo'] - video_id 
= compat_str(video_info['id']) + video_id = str(video_info['id']) title = video_info['title'] prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4' formats = self._extract_m3u8_formats( @@ -98,9 +97,9 @@ class LnkIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 13431, 'series': 'Naujausi žinių reportažai', - 'episode': 'Episode 13431' + 'episode': 'Episode 13431', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://lnk.lt/istorijos-trumpai/152546', 'info_dict': { @@ -114,9 +113,9 @@ class LnkIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1036, 'series': 'Istorijos trumpai', - 'episode': 'Episode 1036' + 'episode': 'Episode 1036', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://lnk.lt/gyvunu-pasaulis/151549', 'info_dict': { @@ -130,26 +129,26 @@ class LnkIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 16, 'series': 'Gyvūnų pasaulis', - 'episode': 'Episode 16' + 'episode': 'Episode 16', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] def _real_extract(self, url): - id = self._match_id(url) - video_json = self._download_json(f'https://lnk.lt/api/video/video-config/{id}', id)['videoInfo'] + video_id = self._match_id(url) + video_json = self._download_json(f'https://lnk.lt/api/video/video-config/{video_id}', video_id)['videoInfo'] formats, subtitles = [], {} if video_json.get('videoUrl'): - fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoUrl'], id) + fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoUrl'], video_id) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) if video_json.get('videoFairplayUrl') and not video_json.get('drm'): - fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoFairplayUrl'], id) + fmts, subs = self._extract_m3u8_formats_and_subtitles(video_json['videoFairplayUrl'], video_id) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) return { - 'id': id, + 'id': video_id, 'title': video_json.get('title'), 'description': video_json.get('description'), 'view_count': video_json.get('viewsCount'), diff --git a/yt_dlp/extractor/localnews8.py b/yt_dlp/extractor/localnews8.py deleted file mode 100644 index 6f3f02c705..0000000000 --- a/yt_dlp/extractor/localnews8.py +++ /dev/null @@ -1,42 +0,0 @@ -from .common import InfoExtractor - - -class LocalNews8IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304', - 'md5': 'be4d48aea61aa2bde7be2ee47691ad20', - 'info_dict': { - 'id': '35183304', - 'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings', - 'ext': 'mp4', - 'title': 'Rexburg business turns carbon fiber scraps into wedding ring', - 'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.', - 'duration': 153, - 'timestamp': 1441844822, - 'upload_date': '20150910', - 'uploader_id': 'api', - } - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - partner_id = self._search_regex( - r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1', - webpage, 'partner 
id', group='id')
-        kaltura_id = self._search_regex(
-            r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
-            webpage, 'videl id', group='id')
-
-        return {
-            '_type': 'url_transparent',
-            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
-            'ie_key': 'Kaltura',
-            'id': video_id,
-            'display_id': display_id,
-        }
diff --git a/yt_dlp/extractor/loom.py b/yt_dlp/extractor/loom.py
new file mode 100644
index 0000000000..1191aa17ea
--- /dev/null
+++ b/yt_dlp/extractor/loom.py
@@ -0,0 +1,461 @@
+import json
+import textwrap
+import urllib.parse
+import uuid
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    filter_dict,
+    get_first,
+    int_or_none,
+    parse_iso8601,
+    update_url,
+    url_or_none,
+    variadic,
+)
+from ..utils.traversal import traverse_obj
+
+
+class LoomIE(InfoExtractor):
+    IE_NAME = 'loom'
+    _VALID_URL = r'https?://(?:www\.)?loom\.com/(?:share|embed)/(?P<id>[\da-f]{32})'
+    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
+    _TESTS = [{
+        # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, json subs only
+        'url': 'https://www.loom.com/share/43d05f362f734614a2e81b4694a3a523',
+        'md5': 'bfc2d7e9c2e0eb4813212230794b6f42',
+        'info_dict': {
+            'id': '43d05f362f734614a2e81b4694a3a523',
+            'ext': 'mp4',
+            'title': 'A Ruler for Windows - 28 March 2022',
+            'uploader': 'wILLIAM PIP',
+            'upload_date': '20220328',
+            'timestamp': 1648454238,
+            'duration': 27,
+        },
+    }, {
+        # webm raw-url, mp4 transcoded-url, cdn url == transcoded-url, no subs
+        'url': 'https://www.loom.com/share/c43a642f815f4378b6f80a889bb73d8d',
+        'md5': '70f529317be8cf880fcc2c649a531900',
+        'info_dict': {
+            'id': 'c43a642f815f4378b6f80a889bb73d8d',
+            'ext': 'webm',
+            'title': 'Lilah Nielsen Intro Video',
+            'uploader': 'Lilah Nielsen',
+            'upload_date': '20200826',
+            'timestamp': 1598480716,
+            'duration': 20,
+        },
+    }, {
+        # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, vtt sub and json subs
+        'url': 'https://www.loom.com/share/9458bcbf79784162aa62ffb8dd66201b',
+        'md5': '51737ec002969dd28344db4d60b9cbbb',
+        'info_dict': {
+            'id': '9458bcbf79784162aa62ffb8dd66201b',
+            'ext': 'mp4',
+            'title': 'Sharing screen with gpt-4',
+            'description': 'Sharing screen with GPT 4 vision model and asking questions to guide through blender.',
+            'uploader': 'Suneel Matham',
+            'chapters': 'count:3',
+            'upload_date': '20231109',
+            'timestamp': 1699518978,
+            'duration': 93,
+        },
+    }, {
+        # mpd raw-url, mp4 transcoded-url, cdn url == raw-url, no subs
+        'url': 'https://www.loom.com/share/24351eb8b317420289b158e4b7e96ff2',
+        'info_dict': {
+            'id': '24351eb8b317420289b158e4b7e96ff2',
+            'ext': 'webm',
+            'title': 'OMFG clown',
+            'description': 'md5:285c5ee9d62aa087b7e3271b08796815',
+            'uploader': 'MrPumkin B',
+            'upload_date': '20210924',
+            'timestamp': 1632519618,
+            'duration': 210,
+        },
+        'params': {'skip_download': 'dash'},
+    }, {
+        # password-protected
+        'url': 'https://www.loom.com/share/50e26e8aeb7940189dff5630f95ce1f4',
+        'md5': '5cc7655e7d55d281d203f8ffd14771f7',
+        'info_dict': {
+            'id': '50e26e8aeb7940189dff5630f95ce1f4',
+            'ext': 'mp4',
+            'title': 'iOS Mobile Upload',
+            'uploader': 'Simon Curran',
+            'upload_date': '20200520',
+            'timestamp': 1590000123,
+            'duration': 35,
+        },
+        'params': {'videopassword': 'seniorinfants2'},
+    }, {
+        # embed, transcoded-url endpoint sends empty JSON response
+        'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e',
+        'md5': '8488817242a0db1cb2ad0ea522553cf6',
+        'info_dict': {
+            'id': 'ddcf1c1ad21f451ea7468b1e33917e4e',
+            'ext': 'mp4',
+            'title': 'CF Reset User\'s Password',
+            'uploader': 'Aimee Heintz',
+            'upload_date': '20220707',
+            'timestamp': 1657216459,
+            'duration': 181,
+        },
+        'expected_warnings': ['Failed to parse JSON'],
+    }]
+    _WEBPAGE_TESTS = [{
+        'url': 'https://www.loom.com/community/e1229802a8694a09909e8ba0fbb6d073-pg',
+        'md5': 'ec838cd01b576cf0386f32e1ae424609',
+        'info_dict': {
+            'id': 'e1229802a8694a09909e8ba0fbb6d073',
+            'ext': 'mp4',
+            'title': 'Rexie Jane Cimafranca - Founder\'s Presentation',
+            'uploader': 'Rexie Cimafranca',
+            'upload_date': '20230213',
+            'duration': 247,
+            'timestamp': 1676274030,
+        },
+    }]
+
+    _GRAPHQL_VARIABLES = {
+        'GetVideoSource': {
+            'acceptableMimes': ['DASH', 'M3U8', 'MP4'],
+        },
+    }
+    _GRAPHQL_QUERIES = {
+        'GetVideoSSR': textwrap.dedent('''\
+            query GetVideoSSR($videoId: ID!, $password: String) {
+              getVideo(id: $videoId, password: $password) {
+                __typename
+                ... on PrivateVideo {
+                  id
+                  status
+                  message
+                  __typename
+                }
+                ... on VideoPasswordMissingOrIncorrect {
+                  id
+                  message
+                  __typename
+                }
+                ... on RegularUserVideo {
+                  id
+                  __typename
+                  createdAt
+                  description
+                  download_enabled
+                  folder_id
+                  is_protected
+                  needs_password
+                  owner {
+                    display_name
+                    __typename
+                  }
+                  privacy
+                  s3_id
+                  name
+                  video_properties {
+                    avgBitRate
+                    client
+                    camera_enabled
+                    client_version
+                    duration
+                    durationMs
+                    format
+                    height
+                    microphone_enabled
+                    os
+                    os_version
+                    recordingClient
+                    recording_type
+                    recording_version
+                    screen_type
+                    tab_audio
+                    trim_duration
+                    width
+                    __typename
+                  }
+                  playable_duration
+                  source_duration
+                  visibility
+                }
+              }
+            }\n'''),
+        'GetVideoSource': textwrap.dedent('''\
+            query GetVideoSource($videoId: ID!, $password: String, $acceptableMimes: [CloudfrontVideoAcceptableMime]) {
+              getVideo(id: $videoId, password: $password) {
+                ... on RegularUserVideo {
+                  id
+                  nullableRawCdnUrl(acceptableMimes: $acceptableMimes, password: $password) {
+                    url
+                    __typename
+                  }
+                  __typename
+                }
+                __typename
+              }
+            }\n'''),
+        'FetchVideoTranscript': textwrap.dedent('''\
+            query FetchVideoTranscript($videoId: ID!, $password: String) {
+              fetchVideoTranscript(videoId: $videoId, password: $password) {
+                ... on VideoTranscriptDetails {
+                  id
+                  video_id
+                  source_url
+                  captions_source_url
+                  __typename
+                }
+                ... on GenericError {
+                  message
+                  __typename
+                }
+                __typename
+              }
+            }\n'''),
+        'FetchChapters': textwrap.dedent('''\
+            query FetchChapters($videoId: ID!, $password: String) {
+              fetchVideoChapters(videoId: $videoId, password: $password) {
+                ... on VideoChapters {
+                  video_id
+                  content
+                  __typename
+                }
+                ... on EmptyChaptersPayload {
+                  content
+                  __typename
+                }
+                ... on InvalidRequestWarning {
+                  message
+                  __typename
+                }
+                ... on Error {
+                  message
+                  __typename
+                }
+                __typename
+              }
+            }\n'''),
+    }
+    _APOLLO_GRAPHQL_VERSION = '0a1856c'
+
+    def _call_graphql_api(self, operations, video_id, note=None, errnote=None):
+        password = self.get_param('videopassword')
+        return self._download_json(
+            'https://www.loom.com/graphql', video_id, note or 'Downloading GraphQL JSON',
+            errnote or 'Failed to download GraphQL JSON', headers={
+                'Accept': 'application/json',
+                'Content-Type': 'application/json',
+                'x-loom-request-source': f'loom_web_{self._APOLLO_GRAPHQL_VERSION}',
+                'apollographql-client-name': 'web',
+                'apollographql-client-version': self._APOLLO_GRAPHQL_VERSION,
+            }, data=json.dumps([{
+                'operationName': operation_name,
+                'variables': {
+                    'videoId': video_id,
+                    'password': password,
+                    **self._GRAPHQL_VARIABLES.get(operation_name, {}),
+                },
+                'query': self._GRAPHQL_QUERIES[operation_name],
+            } for operation_name in variadic(operations)], separators=(',', ':')).encode())
+
+    def _call_url_api(self, endpoint, video_id):
+        response = self._download_json(
+            f'https://www.loom.com/api/campaigns/sessions/{video_id}/{endpoint}', video_id,
+            f'Downloading {endpoint} JSON', f'Failed to download {endpoint} JSON', fatal=False,
+            headers={'Accept': 'application/json', 'Content-Type': 'application/json'},
+            data=json.dumps({
+                'anonID': str(uuid.uuid4()),
+                'deviceID': None,
+                'force_original': False,  # HTTP error 401 if True
+                'password': self.get_param('videopassword'),
+            }, separators=(',', ':')).encode())
+        return traverse_obj(response, ('url', {url_or_none}))
+
+    def _extract_formats(self, video_id, metadata, gql_data):
+        formats = []
+        video_properties = traverse_obj(metadata, ('video_properties', {
+            'width': ('width', {int_or_none}),
+            'height': ('height', {int_or_none}),
+            'acodec': ('microphone_enabled', {lambda x: 'none' if x is False else None}),
+        }))
+
+        def get_formats(format_url, format_id, quality):
+            if not format_url:
+                return
+            ext = determine_ext(format_url)
+            query = urllib.parse.urlparse(format_url).query
+
+            if ext == 'm3u8':
+                # Extract pre-merged HLS formats to avoid buggy parsing of metadata in split playlists
+                format_url = format_url.replace('-split.m3u8', '.m3u8')
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality)
+                for fmt in m3u8_formats:
+                    yield {
+                        **fmt,
+                        'url': update_url(fmt['url'], query=query),
+                        'extra_param_to_segment_url': query,
+                    }
+
+            elif ext == 'mpd':
+                dash_formats = self._extract_mpd_formats(
+                    format_url, video_id, mpd_id=f'dash-{format_id}', fatal=False)
+                for fmt in dash_formats:
+                    yield {
+                        **fmt,
+                        'extra_param_to_segment_url': query,
+                        'quality': quality,
+                    }
+
+            else:
+                yield {
+                    'url': format_url,
+                    'ext': ext,
+                    'format_id': f'http-{format_id}',
+                    'quality': quality,
+                    **video_properties,
+                }
+
+        raw_url = self._call_url_api('raw-url', video_id)
+        formats.extend(get_formats(raw_url, 'raw', quality=1))  # original quality
+
+        transcoded_url = self._call_url_api('transcoded-url', video_id)
+        formats.extend(get_formats(transcoded_url, 'transcoded', quality=-1))  # transcoded quality
+
+        cdn_url = get_first(gql_data, ('data', 'getVideo', 'nullableRawCdnUrl', 'url', {url_or_none}))
+        # cdn_url is usually a dupe, but the raw-url/transcoded-url endpoints could return errors
+        valid_urls = [update_url(url, query=None) for url in (raw_url, transcoded_url) if url]
+        if cdn_url and update_url(cdn_url, query=None) not in valid_urls:
+            formats.extend(get_formats(cdn_url, 'cdn', quality=0))  # could be original or transcoded
+
+        return formats
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        metadata = get_first(
+            self._call_graphql_api('GetVideoSSR', video_id, 'Downloading GraphQL metadata JSON'),
+            ('data', 'getVideo', {dict})) or {}
+
+        if metadata.get('__typename') == 'VideoPasswordMissingOrIncorrect':
+            if not self.get_param('videopassword'):
+                raise ExtractorError(
+                    'This video is password-protected, use the --video-password option', expected=True)
+            raise ExtractorError('Invalid video password', expected=True)
+
+        gql_data = self._call_graphql_api(['FetchChapters', 'FetchVideoTranscript', 'GetVideoSource'], video_id)
+        duration = traverse_obj(metadata, ('video_properties', 'duration', {int_or_none}))
+
+        return {
+            'id': video_id,
+            'duration': duration,
+            'chapters': self._extract_chapters_from_description(
+                get_first(gql_data, ('data', 'fetchVideoChapters', 'content', {str})), duration) or None,
+            'formats': self._extract_formats(video_id, metadata, gql_data),
+            'subtitles': filter_dict({
+                'en': traverse_obj(gql_data, (
+                    ..., 'data', 'fetchVideoTranscript',
+                    ('source_url', 'captions_source_url'), {
+                        'url': {url_or_none},
+                    })) or None,
+            }),
+            **traverse_obj(metadata, {
+                'title': ('name', {str}),
+                'description': ('description', {str}),
+                'uploader': ('owner', 'display_name', {str}),
+                'timestamp': ('createdAt', {parse_iso8601}),
+            }),
+        }
+
+
+class LoomFolderIE(InfoExtractor):
+    IE_NAME = 'loom:folder'
+    _VALID_URL = r'https?://(?:www\.)?loom\.com/share/folder/(?P<id>[\da-f]{32})'
+    _TESTS = [{
+        # 2 subfolders, no videos in root
+        'url': 'https://www.loom.com/share/folder/997db4db046f43e5912f10dc5f817b5c',
+        'playlist_mincount': 16,
+        'info_dict': {
+            'id': '997db4db046f43e5912f10dc5f817b5c',
+            'title': 'Blending Lessons',
+        },
+    }, {
+        # only videos, no subfolders
+        'url': 'https://www.loom.com/share/folder/9a8a87f6b6f546d9a400c8e7575ff7f2',
+        'playlist_mincount': 12,
+        'info_dict': {
+            'id': '9a8a87f6b6f546d9a400c8e7575ff7f2',
+            'title': 'List A- a, i, o',
+        },
+    }, {
+        # videos in root and empty subfolder
+        'url': 'https://www.loom.com/share/folder/886e534218c24fd292e97e9563078cc4',
+        'playlist_mincount': 21,
+        'info_dict': {
+            'id': '886e534218c24fd292e97e9563078cc4',
+            'title': 'Medicare Agent Training videos',
+        },
+    }, {
+        # videos in root and videos in subfolders
+        'url': 'https://www.loom.com/share/folder/b72c4ecdf04745da9403926d80a40c38',
+        'playlist_mincount': 21,
+        'info_dict': {
+            'id': 'b72c4ecdf04745da9403926d80a40c38',
+            'title': 'Quick Altos Q & A Tutorials',
+        },
+    }, {
+        # recursive folder extraction
+        'url': 'https://www.loom.com/share/folder/8b458a94e0e4449b8df9ea7a68fafc4e',
+        'playlist_count': 23,
+        'info_dict': {
+            'id': '8b458a94e0e4449b8df9ea7a68fafc4e',
+            'title': 'Sezer Texting Guide',
+        },
+    }, {
+        # more than 50 videos in 1 folder
+        'url': 'https://www.loom.com/share/folder/e056a91d290d47ca9b00c9d1df56c463',
+        'playlist_mincount': 61,
+        'info_dict': {
+            'id': 'e056a91d290d47ca9b00c9d1df56c463',
+            'title': 'User Videos',
+        },
+    }, {
+        # many subfolders
+        'url': 'https://www.loom.com/share/folder/c2dde8cc67454f0e99031677279d8954',
+        'playlist_mincount': 75,
+        'info_dict': {
+            'id': 'c2dde8cc67454f0e99031677279d8954',
+            'title': 'Honors 1',
+        },
+    }, {
+        'url': 'https://www.loom.com/share/folder/bae17109a68146c7803454f2893c8cf8/Edpuzzle',
+        'only_matching': True,
+    }]
+
+    def _extract_folder_data(self, folder_id):
+        return self._download_json(
+            f'https://www.loom.com/v1/folders/{folder_id}', folder_id,
+            'Downloading folder info JSON', query={'limit': '10000'})
+
+    def _extract_folder_entries(self, folder_id, initial_folder_data=None):
+        folder_data = initial_folder_data or self._extract_folder_data(folder_id)
+
+        for video in traverse_obj(folder_data, ('videos', lambda _, v: v['id'])):
+            video_id = video['id']
+            yield self.url_result(
+                f'https://www.loom.com/share/{video_id}', LoomIE, video_id, video.get('name'))
+
+        # Recurse into subfolders
+        for subfolder_id in traverse_obj(folder_data, (
+                'folders', lambda _, v: v['id'] != folder_id, 'id', {str})):
+            yield from self._extract_folder_entries(subfolder_id)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        playlist_data = self._extract_folder_data(playlist_id)
+
+        return self.playlist_result(
+            self._extract_folder_entries(playlist_id, playlist_data), playlist_id,
+            traverse_obj(playlist_data, ('folder', 'name', {str.strip})))
diff --git a/yt_dlp/extractor/lovehomeporn.py b/yt_dlp/extractor/lovehomeporn.py
index ba5a13acd8..63b75a3d37 100644
--- a/yt_dlp/extractor/lovehomeporn.py
+++ b/yt_dlp/extractor/lovehomeporn.py
@@ -15,7 +15,7 @@ class LoveHomePornIE(NuevoBaseIE):
         },
         'params': {
             'skip_download': True,
-        }
+        },
     }
 
     def _real_extract(self, url):
@@ -24,10 +24,10 @@ def _real_extract(self, url):
         display_id = mobj.group('display_id')
 
         info = self._extract_nuevo(
-            'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id,
+            f'http://lovehomeporn.com/media/nuevo/config.php?key={video_id}',
             video_id)
         info.update({
             'display_id': display_id,
-            'age_limit': 18
+            'age_limit': 18,
         })
         return info
diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py
index 80d4d1cdb3..1a0b6da230 100644
--- a/yt_dlp/extractor/lrt.py
+++ b/yt_dlp/extractor/lrt.py
@@ -22,8 +22,8 @@ class LRTStreamIE(LRTBaseIE):
             'id': 'lrt-opus',
             'live_status': 'is_live',
             'title': 're:^LRT Opus.+$',
-            'ext': 'mp4'
-        }
+            'ext': 'mp4',
+        },
     }]
 
     def _real_extract(self, url):
@@ -44,7 +44,7 @@ def _real_extract(self, url):
             'formats': formats,
             'subtitles': subtitles,
             'is_live': True,
-            'title': f'{self._og_search_title(webpage)} - {stream_title}'
+            'title': f'{self._og_search_title(webpage)} - {stream_title}',
         }
 
 
@@ -62,7 +62,7 @@ class LRTVODIE(LRTBaseIE):
             'timestamp': 1604079000,
             'upload_date': '20201030',
             'tags': ['LRT TELEVIZIJA', 'Beatos virtuvė', 'Beata Nicholson', 'Makaronai', 'Baklažanai', 'Vakarienė', 'Receptas'],
-            'thumbnail': 'https://www.lrt.lt/img/2020/10/30/764041-126478-1287x836.jpg'
+            'thumbnail': 'https://www.lrt.lt/img/2020/10/30/764041-126478-1287x836.jpg',
         },
     }, {
         # direct mp3 download
diff --git a/yt_dlp/extractor/lsm.py b/yt_dlp/extractor/lsm.py
new file mode 100644
index 0000000000..f5be08f97d
--- /dev/null
+++ b/yt_dlp/extractor/lsm.py
@@ -0,0 +1,282 @@
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+    parse_qs,
+    str_or_none,
+    url_or_none,
+    urljoin,
+)
+from ..utils.traversal import traverse_obj
+
+
+class LSMLREmbedIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://(?:
+            (?:latvijasradio|lr1|lr2|klasika|lr4|naba|radioteatris)\.lsm|
+            pieci
+        )\.lv/[^/?#]+/(?:
+            pleijeris|embed
+        )/?\?(?:[^#]+&)?(?:show|id)=(?P<id>\d+)'''
+    _TESTS = [{
+        'url': 'https://latvijasradio.lsm.lv/lv/embed/?theme=black&size=16x9&showCaptions=0&id=183522',
+        'md5': '719b33875cd1429846eeeaeec6df2830',
+        'info_dict': {
+            'id': 'a342781',
+            'ext': 'mp3',
+            'duration': 1823,
+            'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
+            'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/gallery_fd4675ac.jpg',
+        },
+    }, {
+        'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1270&theme=white&size=16x9',
+        'info_dict': {
+            'id': '1270',
+        },
+        'playlist_count': 3,
+        'playlist': [{
+            'md5': '2e61b6eceff00d14d57fdbbe6ab24cac',
+            'info_dict': {
+                'id': 'a297397',
+                'ext': 'mp3',
+                'title': 'Eriks Emanuels Šmits "Pilāta evaņģēlijs". 1. daļa',
+                'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f131ae81e3c.jpg',
+                'duration': 3300,
+            },
+        }],
+    }, {
+        'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1269&theme=white&size=16x9',
+        'md5': '24810d4a961da2295d9860afdcaf4f5a',
+        'info_dict': {
+            'id': 'a230690',
+            'ext': 'mp3',
+            'title': 'Jens Ahlboms "Spārni". Radioizrāde ar Mārtiņa Freimaņa mūziku',
+            'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f13023a457c.jpg',
+            'duration': 1788,
+        },
+    }, {
+        'url': 'https://lr1.lsm.lv/lv/embed/?id=166557&show=0&theme=white&size=16x9',
+        'info_dict': {
+            'id': '166557',
+        },
+        'playlist_count': 2,
+        'playlist': [{
+            'md5': '6a8b0927572f443f09c6e50a3ad65f2d',
+            'info_dict': {
+                'id': 'a303104',
+                'ext': 'mp3',
+                'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg',
+                'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits',
+                'duration': 3222,
+            },
+        }, {
+            'md5': '5d5e191e718b7644e5118b7b4e093a6d',
+            'info_dict': {
+                'id': 'v303104',
+                'ext': 'mp4',
+                'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg',
+                'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits - Video Version',
+                'duration': 3222,
+            },
+        }],
+    }, {
+        'url': 'https://lr1.lsm.lv/lv/embed/?id=183522&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://lr2.lsm.lv/lv/embed/?id=182126&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://klasika.lsm.lv/lv/embed/?id=110806&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://lr4.lsm.lv/lv/embed/?id=184282&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://pieci.lv/lv/embed/?id=168896&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://naba.lsm.lv/lv/embed/?id=182901&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://radioteatris.lsm.lv/lv/embed/?id=176439&show=0&theme=white&size=16x9',
+        'only_matching': True,
+    }, {
+        'url': 'https://lr1.lsm.lv/lv/pleijeris/?embed=0&id=48205&time=00%3A00&idx=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        query = parse_qs(url)
+        video_id = traverse_obj(query, (
+            ('show', 'id'), 0, {int_or_none}, {lambda x: x or None}, {str_or_none}), get_all=False)
+        webpage = self._download_webpage(url, video_id)
+
+        player_data, media_data = self._search_regex(
+            r'LR\.audio\.Player\s*\([^{]*(?P<player>\{.*?\}),(?P<media>\{.*\})\);',
+            webpage, 'player json', group=('player', 'media'))
+
+        player_json = self._parse_json(
+            player_data, video_id, transform_source=js_to_json, fatal=False) or {}
+        media_json = self._parse_json(media_data, video_id, transform_source=js_to_json)
+
+        entries = []
+        for item in traverse_obj(media_json, (('audio', 'video'), lambda _, v: v['id'])):
+            formats = []
+            for source_url in traverse_obj(item, ('sources', ..., 'file', {url_or_none})):
+                if determine_ext(source_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(source_url, video_id, fatal=False))
+                else:
+                    formats.append({'url': source_url})
+
+            id_ = item['id']
+            title = item.get('title')
+            if id_.startswith('v') and not title:
+                title = traverse_obj(
+                    media_json, ('audio', lambda _, v: v['id'][1:] == id_[1:], 'title',
+                                 {lambda x: x and f'{x} - Video Version'}), get_all=False)
+
+            entries.append({
+                'formats': formats,
+                'thumbnail': urljoin(url, player_json.get('poster')),
+                'id': id_,
+                'title': title,
+                'duration': traverse_obj(item, ('duration', {int_or_none})),
+            })
+
+        if len(entries) == 1:
+            return entries[0]
+
+        return self.playlist_result(entries, video_id)
+
+
+class LSMLTVEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://ltv\.lsm\.lv/embed\?(?:[^#]+&)?c=(?P<id>[^#&]+)'
+    _TESTS = [{
+        'url': 'https://ltv.lsm.lv/embed?c=eyJpdiI6IjQzbHVUeHAyaDJiamFjcjdSUUFKdnc9PSIsInZhbHVlIjoiMHl3SnJNRmd2TmFIdnZwOGtGUUpzODFzUEZ4SVVsN2xoRjliSW9vckUyMWZIWG8vbWVzaFFkY0lhNmRjbjRpaCIsIm1hYyI6ImMzNjdhMzFhNTFhZmY1ZmE0NWI5YmFjZGI1YmJiNGEyNjgzNDM4MjUzMWEwM2FmMDMyZDMwYWM1MDFjZmM5MGIiLCJ0YWciOiIifQ==',
+        'md5': '64f72a360ca530d5ed89c77646c9eee5',
+        'info_dict': {
+            'id': '46k_d23-6000-105',
+            'ext': 'mp4',
+            'timestamp': 1700589151,
+            'duration': 1442,
+            'upload_date': '20231121',
+            'title': 'D23-6000-105_cetstud',
+            'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
+        },
+    }, {
+        'url': 'https://ltv.lsm.lv/embed?enablesdkjs=1&c=eyJpdiI6IncwVzZmUFk2MU12enVWK1I3SUcwQ1E9PSIsInZhbHVlIjoid3FhV29vamc3T2sxL1RaRmJ5Rm1GTXozU0o2dVczdUtLK0cwZEZJMDQ2a3ZIRG5DK2pneGlnbktBQy9uazVleHN6VXhxdWIweWNvcHRDSnlISlNYOHlVZ1lpcTUrcWZSTUZPQW14TVdkMW9aOUtRWVNDcFF4eWpHNGcrT0VZbUNFQStKQk91cGpndW9FVjJIa0lpbkh3PT0iLCJtYWMiOiIyZGI1NDJlMWRlM2QyMGNhOGEwYTM2MmNlN2JlOGRhY2QyYjdkMmEzN2RlOTEzYTVkNzI1ODlhZDlhZjU4MjQ2IiwidGFnIjoiIn0=',
+        'md5': 'a1711e190fe680fdb68fd8413b378e87',
+        'info_dict': {
+            'id': 'wUnFArIPDSY',
+            'ext': 'mp4',
+            'uploader': 'LTV_16plus',
+            'release_date': '20220514',
+            'channel_url': 'https://www.youtube.com/channel/UCNMrnafwXD2XKeeQOyfkFCw',
+            'view_count': int,
+            'availability': 'public',
+            'thumbnail': 'https://i.ytimg.com/vi/wUnFArIPDSY/maxresdefault.jpg',
+            'release_timestamp': 1652544074,
+            'title': 'EIROVĪZIJA SALĀTOS',
+            'live_status': 'was_live',
+            'uploader_id': '@LTV16plus',
+            'comment_count': int,
+            'channel_id': 'UCNMrnafwXD2XKeeQOyfkFCw',
+            'channel_follower_count': int,
+            'categories': ['Entertainment'],
+            'duration': 5269,
+            'upload_date': '20220514',
+            'age_limit': 0,
+            'channel': 'LTV_16plus',
+            'playable_in_embed': True,
+            'tags': [],
+            'uploader_url': 'https://www.youtube.com/@LTV16plus',
+            'like_count': int,
+            'description': 'md5:7ff0c42ba971e3c13e4b8a2ff03b70b5',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = urllib.parse.unquote(self._match_id(url))
+        webpage = self._download_webpage(url, video_id)
+        data = self._search_json(
+            r'window\.ltvEmbedPayload\s*=', webpage, 'embed json', video_id)
+        embed_type = traverse_obj(data, ('source', 'name', {str}))
+
+        if embed_type == 'telia':
+            ie_key = 'CloudyCDN'
+            embed_url = traverse_obj(data, ('source', 'embed_url', {url_or_none}))
+        elif embed_type == 'youtube':
+            ie_key = 'Youtube'
+            embed_url = traverse_obj(data, ('source', 'id', {str}))
+        else:
+            raise ExtractorError(f'Unsupported embed type {embed_type!r}')
+
+        return self.url_result(
+            embed_url, ie_key, video_id, **traverse_obj(data, {
+                'title': ('parentInfo', 'title'),
+                'duration': ('parentInfo', 'duration', {int_or_none}),
+                'thumbnail': ('source', 'poster', {url_or_none}),
+            }))
+
+
+class LSMReplayIE(InfoExtractor):
+    _VALID_URL = r'https?://replay\.lsm\.lv/[^/?#]+/(?:ieraksts|statja)/[^/?#]+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://replay.lsm.lv/lv/ieraksts/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
+        'md5': '64f72a360ca530d5ed89c77646c9eee5',
+        'info_dict': {
+            'id': '46k_d23-6000-105',
+            'ext': 'mp4',
+            'timestamp': 1700586300,
+            'description': 'md5:0f1b14798cc39e1ae578bd0eb268f759',
+            'duration': 1442,
+            'upload_date': '20231121',
+            'title': '4. studija. Zolitūdes traģēdija un Inčupes stacija',
+            'thumbnail': 'https://ltv.lsm.lv/storage/media/8/7/large/5/1f9604e1.jpg',
+        },
+    }, {
+        'url': 'https://replay.lsm.lv/lv/ieraksts/lr/183522/138-nepilniga-kompensejamo-zalu-sistema-pat-menesiem-dzena-pacientus-pa-aptiekam',
+        'md5': '719b33875cd1429846eeeaeec6df2830',
+        'info_dict': {
+            'id': 'a342781',
+            'ext': 'mp3',
+            'duration': 1823,
+            'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
+            'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/large_fd4675ac.jpg',
+            'upload_date': '20231102',
+            'timestamp': 1698921060,
+            'description': 'md5:7bac3b2dd41e44325032943251c357b1',
+        },
+    }, {
+        'url': 'https://replay.lsm.lv/ru/statja/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
+        'only_matching': True,
+    }]
+
+    def _fix_nuxt_data(self, webpage):
+        return re.sub(r'Object\.create\(null(?:,(\{.+\}))?\)', lambda m: m.group(1) or 'null', webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        data = self._search_nuxt_data(
+            self._fix_nuxt_data(webpage), video_id, context_name='__REPLAY__')
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            **traverse_obj(data, {
+                'url': ('playback', 'service', 'url', {url_or_none}),
+                'title': ('mediaItem', 'title'),
+                'description': ('mediaItem', ('lead', 'body')),
+                'duration': ('mediaItem', 'duration', {int_or_none}),
+                'timestamp': ('mediaItem', 'aired_at', {parse_iso8601}),
+                'thumbnail': ('mediaItem', 'largeThumbnail', {url_or_none}),
+            }, get_all=False),
+        }
diff --git a/yt_dlp/extractor/lumni.py b/yt_dlp/extractor/lumni.py
new file mode 100644
index 0000000000..8c26f5b472
--- /dev/null
+++ b/yt_dlp/extractor/lumni.py
@@ -0,0 +1,23 @@
+from .francetv import FranceTVBaseInfoExtractor
+
+
+class LumniIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lumni\.fr/video/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.lumni.fr/video/l-homme-et-son-environnement-dans-la-revolution-industrielle',
+        'md5': '960e8240c4f2c7a20854503a71e52f5e',
+        'info_dict': {
+            'id': 'd2b9a4e5-a526-495b-866c-ab72737e3645',
+            'ext': 'mp4',
+            'title': "L'homme et son environnement dans la révolution industrielle - L'ère de l'homme",
+            'thumbnail': 'https://assets.webservices.francetelevisions.fr/v1/assets/images/a7/17/9f/a7179f5f-63a5-4e11-8d4d-012ab942d905.jpg',
+            'duration': 230,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._html_search_regex(
+            r'<div[^>]+data-factoryid\s*=\s*["\']([^"\']+)', webpage, 'video id')
+        return self._make_url_result(video_id, url=url)
diff --git a/yt_dlp/extractor/lynda.py b/yt_dlp/extractor/lynda.py
index 768ce913ee..bfd4619337 100644
--- a/yt_dlp/extractor/lynda.py
+++ b/yt_dlp/extractor/lynda.py
@@ -1,10 +1,7 @@
 import re
+import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urlparse,
-)
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -21,11 +18,11 @@ class LyndaBaseIE(InfoExtractor):
 
     @staticmethod
    def _check_error(json_string, key_or_keys):
-        keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
+        keys = [key_or_keys] if isinstance(key_or_keys, str) else key_or_keys
         for key in keys:
             error = json_string.get(key)
             if error:
-                raise ExtractorError('Unable to login: %s' % error, expected=True)
+                raise ExtractorError(f'Unable to login: {error}', expected=True)
 
     def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
         action_url = self._search_regex(
@@ -33,7 +30,7 @@ def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, n
             'post url', default=fallback_action_url, group='url')
 
         if not action_url.startswith('http'):
-            action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url)
+            action_url = urllib.parse.urljoin(self._SIGNIN_URL, action_url)
 
         form_data = self._hidden_inputs(form_html)
         form_data.update(extra_form_data)
@@ -44,7 +41,7 @@ def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, n
             headers={
                 'Referer': referrer_url,
                 'X-Requested-With': 'XMLHttpRequest',
-            }, expected_status=(418, 500, ))
+            }, expected_status=(418, 500))
 
         self._check_error(response, ('email', 'password', 'ErrorMessage'))
 
@@ -97,8 +94,8 @@ class LyndaIE(LyndaBaseIE):
             'id': '114408',
             'ext': 'mp4',
             'title': 'Using the exercise files',
-            'duration': 68
-        }
+            'duration': 68,
+        },
     }, {
         'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0',
         'only_matching': True,
@@ -116,7 +113,7 @@ class LyndaIE(LyndaBaseIE):
 
     def _raise_unavailable(self, video_id):
         self.raise_login_required(
-            'Video %s is only available for members' % video_id)
+            f'Video {video_id} is only available for members')
 
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
@@ -137,8 +134,7 @@ def _real_extract(self, url):
                 query['courseId'] = course_id
 
             play = self._download_json(
-                'https://www.lynda.com/ajax/course/%s/%s/play'
-                % (course_id, video_id), video_id, 'Downloading play JSON')
+                f'https://www.lynda.com/ajax/course/{course_id}/{video_id}/play', video_id, 'Downloading play JSON')
 
             if not play:
                 self._raise_unavailable(video_id)
@@ -154,7 +150,7 @@ def _real_extract(self, url):
                     continue
                 formats.append({
                     'url': format_url,
-                    'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id,
+                    'format_id': f'{cdn}-{format_id}' if cdn else format_id,
                     'height': int_or_none(format_id),
                 })
 
@@ -174,12 +170,12 @@ def _real_extract(self, url):
 
         if 'Status' in video:
             raise ExtractorError(
-                'lynda returned error: %s' % video['Message'], expected=True)
+                'lynda returned error: {}'.format(video['Message']), expected=True)
 
         if video.get('HasAccess') is False:
             self._raise_unavailable(video_id)
 
-        video_id = compat_str(video.get('ID') or video_id)
+        video_id = str(video.get('ID') or video_id)
         duration = int_or_none(video.get('DurationInSeconds'))
         title = video['Title']
 
@@ -193,7 +189,7 @@ def _real_extract(self, url):
                 'width': int_or_none(f.get('Width')),
                 'height': int_or_none(f.get('Height')),
                 'filesize': int_or_none(f.get('FileSize')),
-                'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None,
+                'format_id': str(f.get('Resolution')) if f.get('Resolution') else None,
             } for f in fmts if f.get('Url')])
 
         prioritized_streams = video.get('PrioritizedStreams')
@@ -202,7 +198,7 @@ def _real_extract(self, url):
                 formats.extend([{
                     'url': video_url,
                     'height': int_or_none(format_id),
-                    'format_id': '%s-%s' % (prioritized_stream_id, format_id),
+                    'format_id': f'{prioritized_stream_id}-{format_id}',
                 } for format_id, video_url in prioritized_stream.items()])
 
         self._check_formats(formats, video_id)
@@ -214,18 +210,16 @@ def _real_extract(self, url):
             'title': title,
             'duration': duration,
             'subtitles': subtitles,
-            'formats': formats
+            'formats': formats,
         }
 
     def _fix_subtitles(self, subs):
         srt = ''
         seq_counter = 0
-        for pos in range(0, len(subs) - 1):
-            seq_current = subs[pos]
+        for seq_current, seq_next in zip(subs, subs[1:]):
             m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
             if m_current is None:
                 continue
-            seq_next = subs[pos + 1]
             m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
             if m_next is None:
                 continue
@@ -234,12 +228,12 @@ def _fix_subtitles(self, subs):
             text = seq_current['Caption'].strip()
             if text:
                 seq_counter += 1
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
+                srt += f'{seq_counter}\r\n{appear_time} --> {disappear_time}\r\n{text}\r\n\r\n'
         if srt:
             return srt
 
     def _get_subtitles(self, video_id):
-        url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
+        url = f'https://www.lynda.com/ajax/player?videoId={video_id}&type=transcript'
         subs = self._download_webpage(
             url, video_id, 'Downloading subtitles JSON', fatal=False)
         if not subs or 'Status="NotFound"' in subs:
@@ -274,10 +268,10 @@ def _real_extract(self, url):
         course_path = mobj.group('coursepath')
         course_id = mobj.group('courseid')
 
-        item_template = 'https://www.lynda.com/%s/%%s-4.html' % course_path
+        item_template = f'https://www.lynda.com/{course_path}/%s-4.html'
 
         course = self._download_json(
-            'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
+            f'https://www.lynda.com/ajax/player?courseId={course_id}&type=course',
             course_id, 'Downloading course JSON', fatal=False)
 
         if not course:
@@ -295,7 +289,7 @@ def _real_extract(self, url):
 
         if course.get('Status') == 'NotFound':
             raise ExtractorError(
-                'Course %s does not exist' % course_id, expected=True)
+                f'Course {course_id} does not exist', expected=True)
 
         unaccessible_videos = 0
         entries = []
@@ -316,13 +310,13 @@ def _real_extract(self, url):
                         'ie_key': LyndaIE.ie_key(),
                         'chapter': chapter.get('Title'),
                         'chapter_number': int_or_none(chapter.get('ChapterIndex')),
-                        'chapter_id': compat_str(chapter.get('ID')),
+                        'chapter_id': str(chapter.get('ID')),
                     })
 
         if unaccessible_videos > 0:
             self.report_warning(
-                '%s videos are only available for members (or paid members) and will not be downloaded. '
-                % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
+                f'{unaccessible_videos} videos are only available for members (or paid members) '
+                f'and will not be downloaded. {self._ACCOUNT_CREDENTIALS_HINT}')
 
         course_title = course.get('Title')
         course_description = course.get('Description')
diff --git a/yt_dlp/extractor/m6.py b/yt_dlp/extractor/m6.py
deleted file mode 100644
index 9dcc601642..0000000000
--- a/yt_dlp/extractor/m6.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from .common import InfoExtractor
-
-
-class M6IE(InfoExtractor):
-    IE_NAME = 'm6'
-    _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
-
-    _TEST = {
-        'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
-        'md5': '242994a87de2c316891428e0176bcb77',
-        'info_dict': {
-            'id': '11323908',
-            'ext': 'mp4',
-            'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête d’anniversaire ! »',
-            'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2',
-            'duration': 100,
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        return self.url_result('6play:%s' % video_id, 'SixPlay', video_id)
diff --git a/yt_dlp/extractor/maariv.py b/yt_dlp/extractor/maariv.py
new file mode 100644
index 0000000000..425a8b3b4a
--- /dev/null
+++ b/yt_dlp/extractor/maariv.py
@@ -0,0 +1,62 @@
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_resolution,
+    unified_timestamp,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class MaarivIE(InfoExtractor):
+    IE_NAME = 'maariv.co.il'
+    _VALID_URL = r'https?://player\.maariv\.co\.il/public/player\.html\?(?:[^#]+&)?media=(?P<id>\d+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
+    _TESTS = [{
+        'url': 'https://player.maariv.co.il/public/player.html?player=maariv-desktop&media=3611585',
+        'info_dict': {
+            'id': '3611585',
+            'duration': 75,
+            'ext': 'mp4',
+            'upload_date': '20231009',
+            'title': 'מבצע חרבות ברזל',
+            'timestamp': 1696851301,
+        },
+    }]
+    _WEBPAGE_TESTS = [{
+        'url': 'https://www.maariv.co.il/news/law/Article-1044008',
+        'info_dict': {
+            'id': '3611585',
+            'duration': 75,
+            'ext': 'mp4',
+            'upload_date': '20231009',
+            'title': 'מבצע חרבות ברזל',
+            'timestamp': 1696851301,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        data = self._download_json(
+            f'https://dal.walla.co.il/media/{video_id}?origin=player.maariv.co.il', video_id)['data']
+
+        formats = []
+        if hls_url := traverse_obj(data, ('video', 'url', {url_or_none})):
+            formats.extend(self._extract_m3u8_formats(hls_url, video_id, m3u8_id='hls', fatal=False))
+
+        for http_format in traverse_obj(data, ('video', 'stream_urls', ..., 'stream_url', {url_or_none})):
+            formats.append({
+                'url': http_format,
+                'format_id': 'http',
+                **parse_resolution(http_format),
+            })
+
+        return {
+            'id': video_id,
+            **traverse_obj(data, {
+                'title': 'title',
+                'duration': ('video', 'duration', {int_or_none}),
+                'timestamp': ('upload_date', {unified_timestamp}),
+            }),
+            'formats': formats,
+        }
diff --git a/yt_dlp/extractor/magellantv.py b/yt_dlp/extractor/magellantv.py
new file mode 100644
index 0000000000..6f2524ba22
--- /dev/null
+++ b/yt_dlp/extractor/magellantv.py
@@ -0,0 +1,62 @@
+from .common import InfoExtractor
+from ..utils import parse_age_limit, parse_duration, traverse_obj
+
+
+class MagellanTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?magellantv\.com/(?:watch|video)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.magellantv.com/watch/my-dads-on-death-row?type=v',
+        'info_dict': {
+            'id': 'my-dads-on-death-row',
+            'ext': 'mp4',
+            'title': 'My Dad\'s On Death Row',
+            'description': 'md5:33ba23b9f0651fc4537ed19b1d5b0d7a',
+            'duration': 3780.0,
+            'age_limit': 14,
+            'tags': ['Justice', 'Reality', 'United States', 'True Crime'],
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.magellantv.com/video/james-bulger-the-new-revelations',
+        'info_dict': {
+            'id': 'james-bulger-the-new-revelations',
+            'ext': 'mp4',
+            'title': 'James Bulger: The New Revelations',
+            'description': 'md5:7b97922038bad1d0fe8d0470d8a189f2',
+            'duration': 2640.0,
+            'age_limit': 0,
+            'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'],
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.magellantv.com/watch/celebration-nation',
+        'info_dict': {
+            'id': 'celebration-nation',
+            'ext': 'mp4',
+            'tags': ['Art & Culture', 'Human Interest', 'Anthropology', 'China', 'History'],
+            'duration': 2640.0,
+            'title': 'Ancestors',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        data = traverse_obj(self._search_nextjs_data(webpage, video_id), (
+            'props', 'pageProps', 'reactContext',
+            (('video', 'detail'), ('series', 'currentEpisode')), {dict}), get_all=False)
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(data, {
+                'title': ('title', {str}),
+                'description': ('metadata', 'description', {str}),
+                'duration': ('duration', {parse_duration}),
+                'age_limit': ('ratingCategory', {parse_age_limit}),
+                'tags': ('tags', ..., {str}),
+            }),
+        }
diff --git a/yt_dlp/extractor/magentamusik.py b/yt_dlp/extractor/magentamusik.py
new file mode 100644
index 0000000000..5bfc0a1545
--- /dev/null
+++ b/yt_dlp/extractor/magentamusik.py
@@ -0,0 +1,62 @@
+from .common import InfoExtractor
+from ..utils import ExtractorError, int_or_none, join_nonempty, url_or_none
+from ..utils.traversal import traverse_obj
+
+
+class MagentaMusikIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?magentamusik\.de/(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        'url': 'https://www.magentamusik.de/marty-friedman-woa-2023-9208205928595409235',
+        'md5': 'd82dd4748f55fc91957094546aaf8584',
+        'info_dict': {
+            'id': '9208205928595409235',
+            'display_id': 'marty-friedman-woa-2023-9208205928595409235',
+            'ext': 'mp4',
+            'title': 'Marty Friedman: W:O:A 2023',
+            'alt_title': 'Konzert vom: 05.08.2023 13:00',
+            'duration': 2760,
+            'categories': ['Musikkonzert'],
+            'release_year': 2023,
+            'location': 'Deutschland',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        player_config = self._search_json(
+            r'data-js-element="o-video-player__config">', webpage, 'player config', display_id, fatal=False)
+        if not player_config:
+            raise ExtractorError('No video found', expected=True)
+
+        asset_id = player_config['assetId']
+        asset_details = self._download_json(
+            f'https://wcps.t-online.de/cvss/magentamusic/vodclient/v2/assetdetails/58938/{asset_id}',
+            display_id, note='Downloading asset details')
+
+        video_id = traverse_obj(
+            asset_details, ('content', 'partnerInformation', ..., 'reference', {str}), get_all=False)
+        if not video_id:
+            raise ExtractorError('Unable to extract video id')
+
+        vod_data = self._download_json(
+            f'https://wcps.t-online.de/cvss/magentamusic/vodclient/v2/player/58935/{video_id}/Main%20Movie', video_id)
+        smil_url = traverse_obj(
+            vod_data, ('content', 'feature', 'representations', ...,
+                       'contentPackages', ..., 'media', 'href', {url_or_none}), get_all=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': self._extract_smil_formats(smil_url, video_id),
+            **traverse_obj(vod_data, ('content', 'feature', 'metadata', {
+                'title': 'title',
+                'alt_title': 'originalTitle',
+                'description': 'longDescription',
+                'duration': ('runtimeInSeconds', {int_or_none}),
+                'location': ('countriesOfProduction', {list}, {lambda x: join_nonempty(*x, delim=', ')}),
+                'release_year': ('yearOfProduction', {int_or_none}),
+                'categories': ('mainGenre', {str}, {lambda x: x and [x]}),
+            })),
+        }
diff --git a/yt_dlp/extractor/magentamusik360.py b/yt_dlp/extractor/magentamusik360.py
deleted file mode 100644
index 5d0cb3bfb5..0000000000
--- a/yt_dlp/extractor/magentamusik360.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from .common import InfoExtractor
-
-
-class MagentaMusik360IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?magenta-musik-360\.de/([a-z0-9-]+-(?P<id>[0-9]+)|festivals/.+)'
-    _TESTS = [{
-        'url': 'https://www.magenta-musik-360.de/within-temptation-wacken-2019-1-9208205928595185932',
-        'md5': '65b6f060b40d90276ec6fb9b992c1216',
-        'info_dict': {
-            'id': '9208205928595185932',
-            'ext': 'm3u8',
-            'title': 'WITHIN TEMPTATION',
-            'description': 'Robert Westerholt und Sharon Janny den Adel gründeten die Symphonic Metal-Band. Privat sind die Niederländer ein Paar und haben zwei Kinder. Die Single Ice Queen brachte ihnen Platin und Gold und verhalf 2002 zum internationalen Durchbruch. Charakteristisch für die Band war Anfangs der hohe Gesang von Frontfrau Sharon. Stilistisch fing die Band im Gothic Metal an. Mit neuem Sound, schnellen Gitarrenriffs und Gitarrensoli, avancierte Within Temptation zur erfolgreichen Rockband. Auch dieses Jahr wird die Band ihre Fangemeinde wieder mitreißen.',
-        }
-    }, {
-        'url': 'https://www.magenta-musik-360.de/festivals/wacken-world-wide-2020-body-count-feat-ice-t',
-        'md5': '81010d27d7cab3f7da0b0f681b983b7e',
-        'info_dict': {
-            'id': '9208205928595231363',
-            'ext': 'm3u8',
-            'title': 'Body Count feat. Ice-T',
-            'description': 'Body Count feat. Ice-T konnten bereits im vergangenen Jahr auf dem „Holy Ground“ in Wacken überzeugen. 2020 gehen die Crossover-Metaller aus einem Club in Los Angeles auf Sendung und bringen mit ihrer Mischung aus Metal und Hip-Hop Abwechslung und ordentlich Alarm zum WWW. Bereits seit 1990 stehen die beiden Gründer Ice-T (Gesang) und Ernie C (Gitarre) auf der Bühne. Sieben Studioalben hat die Gruppe bis jetzt veröffentlicht, darunter das Debüt „Body Count“ (1992) mit dem kontroversen Track „Cop Killer“.',
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        # _match_id casts to string, but since "None" is not a valid video_id for magenta
-        # there is no risk for confusion
-        if video_id == "None":
-            webpage = self._download_webpage(url, video_id)
-            video_id = self._html_search_regex(r'data-asset-id="([^"]+)"', webpage, 'video_id')
-        json = self._download_json("https://wcps.t-online.de/cvss/magentamusic/vodplayer/v3/player/58935/%s/Main%%20Movie" % video_id, video_id)
-        xml_url = json['content']['feature']['representations'][0]['contentPackages'][0]['media']['href']
-        metadata = json['content']['feature'].get('metadata')
-        title = None
-        description = None
-        duration = None
-        thumbnails = []
-        if metadata:
-            title = metadata.get('title')
-            description = metadata.get('fullDescription')
-            duration = metadata.get('runtimeInSeconds')
-            for img_key in ('teaserImageWide', 'smallCoverImage'):
-                if img_key in metadata:
-                    thumbnails.append({'url': metadata[img_key].get('href')})
-
-        xml = self._download_xml(xml_url, video_id)
-        final_url = xml[0][0][0].attrib['src']
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'url': final_url,
-            'duration': duration,
-            'thumbnails': thumbnails
-        }
diff --git a/yt_dlp/extractor/mailru.py b/yt_dlp/extractor/mailru.py
index 387d211fe1..cca678f14a 100644
--- a/yt_dlp/extractor/mailru.py
+++ b/yt_dlp/extractor/mailru.py
@@ -1,9 +1,9 @@
 import itertools
 import json
 import re
+import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
     parse_duration,
@@ -99,7 +99,7 @@ class MailRuIE(InfoExtractor):
         {
             'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html',
             'only_matching': True,
-        }
+        },
     ]
 
     def _real_extract(self, url):
@@ -108,7 +108,7 @@ def _real_extract(self, url):
 
         video_id = None
         if meta_id:
-            meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id
+            meta_url = f'https://my.mail.ru/+/video/meta/{meta_id}'
         else:
             video_id = mobj.group('idv1')
             if not video_id:
@@ -137,20 +137,18 @@ def _real_extract(self, url):
         # Fallback old approach
         if not video_data:
             video_data = self._download_json(
-                'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
+                f'http://api.video.mail.ru/videos/{video_id}.json?new=1',
                 video_id, 'Downloading video JSON')
 
-        headers = {}
-
         video_key = self._get_cookies('https://my.mail.ru').get('video_key')
-        if video_key:
-            headers['Cookie'] = 'video_key=%s' % video_key.value
 
         formats = []
         for f in video_data['videos']:
             video_url = f.get('url')
             if not video_url:
                 continue
+            if video_key:
+                self._set_cookie(urllib.parse.urlparse(video_url).hostname, 'video_key', video_key.value)
             format_id = f.get('key')
             height = int_or_none(self._search_regex(
                 r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
@@ -158,7 +156,6 @@ def _real_extract(self, url):
                 'url': video_url,
                 'format_id': format_id,
                 'height': height,
-                'http_headers': headers,
             })
 
         meta_data = video_data['meta']
@@ -171,7 +168,7 @@ def _real_extract(self, url):
 
         acc_id = meta_data.get('accId')
         item_id = meta_data.get('itemId')
-        content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id
+        content_id = f'{acc_id}_{item_id}' if acc_id and item_id else video_id
 
         thumbnail = meta_data.get('poster')
         duration = int_or_none(meta_data.get('duration'))
@@ -194,7 +191,7 @@ class MailRuMusicSearchBaseIE(InfoExtractor):
     def _search(self, query, url, audio_id, limit=100, offset=0):
         search = self._download_json(
             'https://my.mail.ru/cgi-bin/my/ajax', audio_id,
-            'Downloading songs JSON page %d' % (offset // limit + 1),
+            f'Downloading songs JSON page {offset // limit + 1}',
             headers={
                 'Referer': url,
                 'X-Requested-With': 'XMLHttpRequest',
@@ -238,7 +235,7 @@ def _extract_track(t, fatal=True):
         artist = t.get('Author') or t.get('Author_Text_HTML')
 
         if track:
-            title = '%s - %s' % (artist, track) if artist else track
+            title = f'{artist} - {track}' if artist else track
         else:
             title = audio_id
 
@@ -309,7 +306,7 @@ class MailRuMusicSearchIE(MailRuMusicSearchBaseIE):
     }]
 
     def _real_extract(self, url):
-        query = compat_urllib_parse_unquote(self._match_id(url))
+        query = urllib.parse.unquote(self._match_id(url))
 
         entries = []
 
diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py
index fe5589d598..fb9350584a 100644
--- a/yt_dlp/extractor/mainstreaming.py
+++ b/yt_dlp/extractor/mainstreaming.py
@@ -1,19 +1,18 @@
 import re
 
 from .common import InfoExtractor
-
 from ..utils import (
     int_or_none,
     js_to_json,
     parse_duration,
     traverse_obj,
     try_get,
-    urljoin
+    urljoin,
 )
 
 
 class MainStreamingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
+    _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
     _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?']
     IE_DESC = 'MainStreaming Player'
@@ -31,9 +30,9 @@ class MainStreamingIE(InfoExtractor):
             },
             'expected_warnings': [
                 'Ignoring alternative content ID: WDAF1KOWUpH3',
-                'MainStreaming said: Live event is OFFLINE'
+                'MainStreaming said: Live event is OFFLINE',
             ],
-            'skip': 'live stream offline'
+            'skip': 'live stream offline',
         }, {
            # playlist
            'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
@@ -41,7 +40,7 @@ class MainStreamingIE(InfoExtractor):
                 'id': 'WDAF1KOWUpH3',
                 'title': 'Playlist homepage',
             },
-            'playlist_mincount': 2
+            'playlist_mincount': 2,
         }, {
             # livestream
             'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
@@ -52,7 +51,7 @@ class MainStreamingIE(InfoExtractor):
                 'ext': 'mp4',
                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
             },
-            'skip': 'live stream'
+            'skip': 'live stream',
         }, {
             'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
             'info_dict': {
@@ -62,8 +61,8 @@ class MainStreamingIE(InfoExtractor):
                 'ext': 'mp4',
                 'live_status': 'not_live',
                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
-                'duration': 1512
-            }
+                'duration': 1512,
+            },
         }, {
             # video without webtools- prefix
             'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
@@ -74,8 +73,8 @@ class MainStreamingIE(InfoExtractor):
                 'ext': 'mp4',
                 'live_status': 'not_live',
                 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
-                'duration': 789.04
-            }
+                'duration': 789.04,
+            },
         }, {
             # always-on livestream with DVR
             'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
@@ -93,14 +92,14 @@ class MainStreamingIE(InfoExtractor):
         }, {
             # no host
             'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
-            'only_matching': True
+            'only_matching': True,
         }, {
             'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
-            'only_matching': True
+            'only_matching': True,
         }, {
             'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
-            'only_matching': True
-        }
+            'only_matching': True,
+        },
     ]
 
     def _playlist_entries(self, host, playlist_content):
@@ -112,7 +111,7 @@ def _playlist_entries(self, host, playlist_content):
                 'id': content_id,
                 'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
                 'title': entry.get('title'),
-                'url': f'https://{host}/embed/{content_id}'
+                'url': f'https://{host}/embed/{content_id}',
             }
 
     @staticmethod
@@ -206,5 +205,5 @@ def _real_extract(self, url):
             'duration': parse_duration(content_info.get('duration')),
             'tags': content_info.get('tags'),
             'subtitles': subtitles,
-            'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster')
+            'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster'),
         }
diff --git a/yt_dlp/extractor/malltv.py b/yt_dlp/extractor/malltv.py
deleted file mode 100644
index e1031d8da4..0000000000
--- a/yt_dlp/extractor/malltv.py
+++ /dev/null
@@ -1,107 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    clean_html,
-    dict_get,
-    float_or_none,
-    int_or_none,
-    merge_dicts,
-    parse_duration,
-    try_get,
-)
-
-
-class MallTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
-        'md5': 'cd69ce29176f6533b65bff69ed9a5f2a',
-        'info_dict': {
-            'id': 't0zzt0',
-            'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
-            'ext': 'mp4',
-            'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
-            'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
-            'duration': 216,
-            'timestamp': 1538870400,
-            'upload_date': '20181007',
-            'view_count': int,
-            'comment_count': int,
-            'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnigfq/thumbnails/retina.jpg',
-            'average_rating': 9.060869565217391,
-            'dislike_count': int,
-            'like_count': int,
-        }
-    }, {
-        'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
-        'only_matching': True,
-    }, {
-        'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.mall.tv/zivoty-slavnych/nadeje-vychodu-i-zapadu-jak-michail-gorbacov-zmenil-politickou-mapu-sveta-a-ziskal-za-to-nobelovu-cenu-miru',
-        'info_dict': {
-            'id': 'yx010y',
-            'ext': 'mp4',
-            'dislike_count': int,
-            'description': 'md5:aee02bee5a8d072c6a8207b91d1905a9',
-            'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnjdeu/thumbnails/retina.jpg',
-            'comment_count': int,
-            'display_id': 'md5:0ec2afa94d2e2b7091c019cef2a43a9b',
-            'like_count': int,
-            'duration': 752,
-            'timestamp': 1646956800,
-            'title': 'md5:fe79385daaf16d74c12c1ec4a26687af',
-            'view_count': int,
-            'upload_date': '20220311',
-            'average_rating': 9.685714285714285,
-        }
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(
-            url, display_id, headers=self.geo_verification_headers())
-
-        video = self._parse_json(self._search_regex(
-            r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
-            webpage, 'video object'), display_id)
-
-        video_id = self._search_regex(
-            r'<input\s*id\s*=\s*player-id-name\s*[^>]+value\s*=\s*(\w+)', webpage, 'video id')
-
-        formats = self._extract_m3u8_formats(
-            video['VideoSource'], video_id, 'mp4', 'm3u8_native')
-
-        subtitles = {}
-        for s in (video.get('Subtitles') or {}):
-            s_url = s.get('Url')
-            if not s_url:
-                continue
-            subtitles.setdefault(s.get('Language') or 'cz', []).append({
-                'url': s_url,
-            })
-
-        entity_counts = video.get('EntityCounts') or {}
-
-        def get_count(k):
-            v = entity_counts.get(k + 's') or {}
-            return int_or_none(dict_get(v, ('Count', 'StrCount')))
-
-        info = self._search_json_ld(webpage, video_id, default={})
-
-        return merge_dicts({
-            'id': str(video_id),
-            'display_id': display_id,
-            'title': video.get('Title'),
-            'description': clean_html(video.get('Description')),
-            'thumbnail': video.get('ThumbnailUrl'),
-            'formats': formats,
-            'subtitles': subtitles,
-            'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
-            'view_count': get_count('View'),
-            'like_count': get_count('Like'),
-            'dislike_count': get_count('Dislike'),
-            'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
-            'comment_count': get_count('Comment'),
-        }, info)
diff --git a/yt_dlp/extractor/mangomolo.py b/yt_dlp/extractor/mangomolo.py
index efaf66fa20..2231f71e8f 100644
--- a/yt_dlp/extractor/mangomolo.py
+++ b/yt_dlp/extractor/mangomolo.py
@@ -1,8 +1,7 @@
+import base64
+import urllib.parse
+
 from .common import InfoExtractor
-from ..compat import (
-    compat_b64decode,
-    compat_urllib_parse_unquote,
-)
 from ..utils import classproperty, int_or_none
 
 
@@ -33,14 +32,14 @@ def _get_real_id(self, page_id):
     def _real_extract(self, url):
         page_id = self._get_real_id(self._match_id(url))
         webpage = self._download_webpage(
-            'https://player.mangomolo.com/v1/%s?%s' % (self._TYPE, url.split('?')[1]), page_id)
+            'https://player.mangomolo.com/v1/{}?{}'.format(self._TYPE, url.split('?')[1]), page_id)
         hidden_inputs = self._hidden_inputs(webpage)
         m3u8_entry_protocol = 'm3u8' if self._IS_LIVE else 'm3u8_native'
 
         format_url = self._html_search_regex(
             [
                 r'(?:file|src)\s*:\s*"(https?://[^"]+?/playlist\.m3u8)',
-                r'<a[^>]+href="(rtsp://[^"]+)"'
+                r'<a[^>]+href="(rtsp://[^"]+)"',
             ], webpage, 'format url')
         formats = self._extract_wowza_formats(
             format_url, page_id, m3u8_entry_protocol, ['smil'])
@@ -70,4 +69,4 @@ class MangomoloLiveIE(MangomoloBaseIE):
     _IS_LIVE = True
 
     def _get_real_id(self, page_id):
-        return compat_b64decode(compat_urllib_parse_unquote(page_id)).decode()
+        return base64.b64decode(urllib.parse.unquote(page_id)).decode()
diff --git a/yt_dlp/extractor/manoto.py b/yt_dlp/extractor/manoto.py
index 2792e6e707..1dd0b15c1e 100644
--- a/yt_dlp/extractor/manoto.py
+++ b/yt_dlp/extractor/manoto.py
@@ -1,10 +1,5 @@
 from .common import InfoExtractor
-from ..utils import (
-    clean_html,
-    int_or_none,
-    traverse_obj
-)
-
+from ..utils import clean_html, int_or_none, traverse_obj
 
 _API_URL = 'https://dak1vd5vmi7x6.cloudfront.net/api/v1/publicrole/{}/{}?id={}'
 
@@ -25,11 +20,11 @@ class ManotoTVIE(InfoExtractor):
             'title': 'کارول و جان',
             'description': 'md5:d0fff1f8ba5c6775d312a00165d1a97e',
             'thumbnail': r're:^https?://.*\.(jpeg|png|jpg)$',
-            'ext': 'mp4'
+            'ext': 'mp4',
         },
         'params': {
             'skip_download': 'm3u8',
-        }
+        },
     }, {
         'url': 'https://www.manototv.com/episode/12576',
         'info_dict': {
@@ -42,11 +37,11 @@ class ManotoTVIE(InfoExtractor):
             'title': 'سه ماه تعطیلی',
             'description': 'سه ماه تعطیلی فیلمی به کارگردانی و نویسندگی شاپور قریب ساختهٔ سال ۱۳۵۶ است.',
             'thumbnail': r're:^https?://.*\.(jpeg|png|jpg)$',
-            'ext': 'mp4'
+            'ext': 'mp4',
         },
         'params': {
             'skip_download': 'm3u8',
-        }
+        },
     }]
 
     def _real_extract(self, url):
@@ -98,7 +93,7 @@ def _real_extract(self, url):
 
         entries = [
             self.url_result(
-                'https://www.manototv.com/episode/%s' % item['slideID'], ie=ManotoTVIE.ie_key(), video_id=item['slideID'])
+                'https://www.manototv.com/episode/{}'.format(item['slideID']), ie=ManotoTVIE.ie_key(), video_id=item['slideID'])
             for item in playlist]
         return self.playlist_result(entries, show_id, title, description)
 
@@ -116,7 +111,7 @@ class ManotoTVLiveIE(InfoExtractor):
         },
         'params': {
             'skip_download': 'm3u8',
-        }
+        },
     }
 
     def _real_extract(self, url):
diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py
index 741745378b..8caa8f87fe 100644
--- a/yt_dlp/extractor/manyvids.py
+++ b/yt_dlp/extractor/manyvids.py
@@ -12,6 +12,7 @@
 
 
 class ManyVidsIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)'
     _TESTS = [{
         # preview video
@@ -43,7 +44,7 @@ class ManyVidsIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        real_url = 'https://www.manyvids.com/video/%s/gtm.js' % (video_id, )
+        real_url = f'https://www.manyvids.com/video/{video_id}/gtm.js'
         try:
             webpage = self._download_webpage(real_url, video_id)
         except Exception:
@@ -74,7 +75,7 @@ def txt_or_none(s, default=None):
 
         def mung_title(s):
             if uploader:
-                s = re.sub(r'^\s*%s\s+[|-]' % (re.escape(uploader), ), '', s)
+                s = re.sub(rf'^\s*{re.escape(uploader)}\s+[|-]', '', s)
             return txt_or_none(s)
 
         title = (
@@ -105,7 +106,7 @@ def mung_title(s):
                 'vid': video_id,
             }), headers={
                 'Referer': url,
-                'X-Requested-With': 'XMLHttpRequest'
+                'X-Requested-With': 'XMLHttpRequest',
             })
 
         formats = []
@@ -137,7 +138,7 @@ def mung_title(s):
         def get_likes():
             likes = self._search_regex(
-                r'''(<a\b[^>]*\bdata-id\s*=\s*(['"])%s\2[^>]*>)''' % (video_id, ),
+                rf'''(<a\b[^>]*\bdata-id\s*=\s*(['"]){video_id}\2[^>]*>)''',
                 webpage, 'likes', default='')
             likes = extract_attributes(likes)
             return int_or_none(likes.get('data-likes'))
diff --git a/yt_dlp/extractor/markiza.py b/yt_dlp/extractor/markiza.py
index 53ed79158f..088b60d554 100644
--- a/yt_dlp/extractor/markiza.py
+++ b/yt_dlp/extractor/markiza.py
@@ -1,7 +1,6 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     orderedSet,
     parse_duration,
@@ -10,6 +9,7 @@
 
 
 class MarkizaIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P<id>\d+)(?:[_/]|$)'
     _TESTS = [{
         'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109',
@@ -59,15 +59,16 @@ def _real_extract(self, url):
             info.update({
                 'id': video_id,
                 'title': try_get(
-                    data, lambda x: x['details']['name'], compat_str),
+                    data, lambda x: x['details']['name'], str),
             })
         else:
             info['duration'] = parse_duration(
-                try_get(data, lambda x: x['details']['duration'], compat_str))
+                try_get(data, lambda x: x['details']['duration'], str))
         return info
 
 
 class MarkizaPageIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P<id>\d+)_'
     _TESTS = [{
         'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni',
@@ -102,7 +103,7 @@ class MarkizaPageIE(InfoExtractor):
 
     @classmethod
     def suitable(cls, url):
-        return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url)
+        return False if MarkizaIE.suitable(url) else super().suitable(url)
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
@@ -114,7 +115,7 @@ def _real_extract(self, url):
             url, playlist_id, expected_status=500)
 
         entries = [
-            self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id)
+            self.url_result(f'http://videoarchiv.markiza.sk/video/{video_id}')
             for video_id in orderedSet(re.findall(
                 r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)',
                 webpage))]
diff --git a/yt_dlp/extractor/massengeschmacktv.py b/yt_dlp/extractor/massengeschmacktv.py
index 7dacb43e02..43c0873347 100644
--- a/yt_dlp/extractor/massengeschmacktv.py
+++ b/yt_dlp/extractor/massengeschmacktv.py
@@ -17,11 +17,12 @@ class MassengeschmackTVIE(InfoExtractor):
 
     _TEST = {
         'url': 'https://massengeschmack.tv/play/fktv202',
-        'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3',
+        'md5': '9996f314994a49fefe5f39aa1b07ae21',
         'info_dict': {
             'id': 'fktv202',
             'ext': 'mp4',
-            'title': 'Fernsehkritik-TV - Folge 202',
+            'title': 'Fernsehkritik-TV #202',
+            'thumbnail': 'https://cache.massengeschmack.tv/img/mag/fktv202.jpg',
         },
     }
 
@@ -29,9 +30,6 @@ def _real_extract(self, url):
         episode = self._match_id(url)
 
         webpage = self._download_webpage(url, episode)
-        title = clean_html(self._html_search_regex(
-            '<h3>([^<]+)</h3>', webpage, 'title'))
-        thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False)
         sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json)
 
         formats = []
@@ -67,7 +65,8 @@ def _real_extract(self, url):
 
         return {
             'id': episode,
-            'title': title,
+            'title': clean_html(self._html_search_regex(
+                r'<span[^>]+\bid=["\']clip-title["\'][^>]*>([^<]+)', webpage, 'title', fatal=False)),
             'formats': formats,
-            'thumbnail': thumbnail,
+            'thumbnail': self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False),
         }
diff --git a/yt_dlp/extractor/masters.py b/yt_dlp/extractor/masters.py
index 716f1c9615..4aa2c989a1 100644
--- a/yt_dlp/extractor/masters.py
+++ b/yt_dlp/extractor/masters.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
     traverse_obj,
@@ -16,7 +15,7 @@ class MastersIE(InfoExtractor):
         'title': 'Sungjae Im: Thursday Interview 2022',
         'upload_date': '20220407',
         'thumbnail': r're:^https?://.*\.jpg$',
-    }
+    },
 }]
 
    def _real_extract(self, url):
diff --git a/yt_dlp/extractor/matchtv.py b/yt_dlp/extractor/matchtv.py
index a67fa9fe4c..93799fe859 100644
--- a/yt_dlp/extractor/matchtv.py
+++ b/yt_dlp/extractor/matchtv.py
@@ -1,51 +1,35 @@
-import random
-
 from .common import InfoExtractor
-from ..utils import xpath_text
 
 
 class MatchTVIE(InfoExtractor):
-    _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)'
+    _VALID_URL = [
+        r'https?://matchtv\.ru/on-air/?(?:$|[?#])',
+        r'https?://video\.matchtv\.ru/iframe/channel/106/?(?:$|[?#])',
+    ]
     _TESTS = [{
-        'url': 'http://matchtv.ru/#live-player',
+        'url': 'http://matchtv.ru/on-air/',
         'info_dict': {
             'id': 'matchtv-live',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
-            'is_live': True,
+            'live_status': 'is_live',
         },
         'params': {
             'skip_download': True,
         },
     }, {
-        'url': 'http://matchtv.ru/on-air/',
+        'url': 'https://video.matchtv.ru/iframe/channel/106',
         'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = 'matchtv-live'
-        video_url = self._download_json(
-            'http://player.matchtv.ntvplus.tv/player/smil', video_id,
-            query={
-                'ts': '',
-                'quality': 'SD',
-                'contentId': '561d2c0df7159b37178b4567',
-                'sign': '',
-                'includeHighlights': '0',
-                'userId': '',
-                'sessionId': random.randint(1, 1000000000),
-                'contentType': 'channel',
-                'timeShift': '0',
-                'platform': 'portal',
-            },
-            headers={
-                'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf',
-            })['data']['videoUrl']
-        f4m_url = xpath_text(self._download_xml(video_url, video_id), './to')
-        formats = self._extract_f4m_formats(f4m_url, video_id)
+        webpage = self._download_webpage('https://video.matchtv.ru/iframe/channel/106', video_id)
+        video_url = self._html_search_regex(
+            r'data-config="config=(https?://[^?"]+)[?"]', webpage, 'video URL').replace('/feed/', '/media/') + '.m3u8'
         return {
             'id': video_id,
             'title': 'Матч ТВ - Прямой эфир',
             'is_live': True,
-            'formats': formats,
+            'formats': self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True),
         }
diff --git a/yt_dlp/extractor/mbn.py b/yt_dlp/extractor/mbn.py
new file mode 100644
index 0000000000..4917c4698e
--- /dev/null
+++ b/yt_dlp/extractor/mbn.py
@@ -0,0 +1,89 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class MBNIE(InfoExtractor):
+    IE_DESC = 'mbn.co.kr (매일방송)'
+    _VALID_URL = r'https?://(?:www\.)?mbn\.co\.kr/vod/programContents/preview(?:list)?/\d+/\d+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://mbn.co.kr/vod/programContents/previewlist/861/5433/1276155',
+        'md5': '85e1694e5b247c04d1386b7e3c90fd76',
+        'info_dict': {
+            'id': '1276155',
+            'ext': 'mp4',
+            'title': '결국 사로잡힌 권유리, 그녀를 목숨 걸고 구하려는 정일우!',
+            'duration': 3891,
+            'release_date': '20210703',
+            'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/861/2021/07/03/20210703230811_20_861_1276155_360_7_0.jpg',
+            'series': '보쌈 - 운명을 훔치다',
+            'episode': 'Episode 19',
+            'episode_number': 19,
+        },
+    }, {
+        'url': 'https://www.mbn.co.kr/vod/programContents/previewlist/835/5294/1084744',
+        'md5': 'fc65d3aac85e85e0b5056f4ef99cde4a',
+        'info_dict': {
+            'id': '1084744',
+            'ext': 'mp4',
+            'title': '김정은♥최원영, 제자리를 찾은 위험한 부부! "결혼은 투쟁이면서, 어려운 방식이야.."',
+            'duration': 93,
+            'release_date': '20201124',
+            'thumbnail': 'http://img.vod.mbn.co.kr/mbnvod2img/835/2020/11/25/20201125000221_21_835_1084744_360_7_0.jpg',
+            'series': '나의 위험한 아내',
+        },
+    }, {
+        'url': 'https://www.mbn.co.kr/vod/programContents/preview/952/6088/1054797?next=1',
+        'md5': 'c711103c72aeac8323a5cf1751f10097',
+        'info_dict': {
+            'id': '1054797',
+            'ext': 'mp4',
+            'title': '[2차 티저] MBN 주말 미니시리즈 <완벽한 결혼의 정석> l 그녀에게 주어진 두 번째 인생',
+            'duration': 65,
+            'release_date': '20231028',
+            'thumbnail': 'http://img.vod.mbn.co.kr/vod2/952/2023/09/11/20230911130223_22_952_1054797_1080_7.jpg',
+            'series': '완벽한 결혼의 정석',
+        },
+    }]
+
+    def _real_extract(self, url):
+        content_id = self._match_id(url)
+        webpage = self._download_webpage(url, content_id)
+
+        content_cls_cd = self._search_regex(
+            r'"\?content_cls_cd=(\d+)&', webpage, 'content cls cd', fatal=False) or '20'
+        media_info = self._download_json(
+            'https://www.mbn.co.kr/player/mbnVodPlayer_2020.mbn', content_id,
+            note='Fetching playback data', query={
+                'content_cls_cd': content_cls_cd,
+                'content_id': content_id,
+                'relay_type': '1',
+            })
+
+        formats = []
+        for stream_url in traverse_obj(media_info, ('movie_list', ..., 'url', {url_or_none})):
+            stream_url = re.sub(r'/(?:chunk|play)list(?:_pd\d+)?\.m3u8', '/manifest.m3u8', stream_url)
+            final_url = url_or_none(self._download_webpage(
+                f'https://www.mbn.co.kr/player/mbnStreamAuth_new_vod.mbn?vod_url={stream_url}',
+                content_id, note='Fetching authenticated m3u8 url'))
+
+            formats.extend(self._extract_m3u8_formats(final_url, content_id, fatal=False))
+
+        return {
+            'id': content_id,
+            **traverse_obj(media_info, {
+                'title': ('movie_title', {str}),
+                'duration': ('play_sec', {int_or_none}),
+                'release_date': ('bcast_date', {lambda x: x.replace('.', '')}, {unified_strdate}),
+                'thumbnail': ('movie_start_Img', {url_or_none}),
+                'series': ('prog_nm', {str}),
+                'episode_number': ('ad_contentnumber', {int_or_none}),
+            }),
+            'formats': formats,
+        }
diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py
index 49f5b49a40..46097fa20e 100644
--- a/yt_dlp/extractor/mdr.py
+++ b/yt_dlp/extractor/mdr.py
@@ -1,5 +1,6 @@
+import urllib.parse
+
 from .common import InfoExtractor
-from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
     int_or_none,
@@ -104,7 +105,7 @@ def _real_extract(self, url):
             webpage, 'data url', group='url').replace(r'\/', '/')
 
         doc = self._download_xml(
-            compat_urlparse.urljoin(url, data_url), video_id)
+            urllib.parse.urljoin(url, data_url), video_id)
 
         title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True)
 
@@ -118,7 +119,7 @@ def _real_extract(self, url):
                     'progressiveDownload',
                     'dynamicHttpStreamingRedirector',
                     'adaptiveHttpStreamingRedirector'):
-                url_el = asset.find('./%sUrl' % source)
+                url_el = asset.find(f'./{source}Url')
                 if url_el is None:
                     continue
diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py
index 82be823b8a..d64dbfe638 100644
--- a/yt_dlp/extractor/medaltv.py
+++ b/yt_dlp/extractor/medaltv.py
@@ -1,11 +1,10 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
-    format_field,
     float_or_none,
+    format_field,
     int_or_none,
     str_or_none,
     traverse_obj,
@@ -13,10 +12,10 @@
 
 
 class MedalTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?medal\.tv/(?P<path>games/[^/?#&]+/clips)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K',
-        'md5': '6930f8972914b6b9fdc2bb3918098ba0',
+        'md5': '03e4911fdcf7fce563090705c2e79267',
         'info_dict': {
             'id': 'jTBFnLKdLy15K',
             'ext': 'mp4',
@@ -31,10 +30,10 @@ class MedalTVIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
             'duration': 13,
-        }
+        },
     }, {
-        'url': 'https://medal.tv/games/cod%20cold%20war/clips/2mA60jWAGQCBH',
-        'md5': '3d19d426fe0b2d91c26e412684e66a06',
+        'url': 'https://medal.tv/games/cod-cold-war/clips/2mA60jWAGQCBH',
+        'md5': 'fc7a3e4552ae8993c1c4006db46be447',
         'info_dict': {
             'id': '2mA60jWAGQCBH',
             'ext': 'mp4',
@@ -50,9 +49,9 @@ class MedalTVIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
             'duration': 23,
-        }
+        },
     }, {
-        'url': 'https://medal.tv/games/cod%20cold%20war/clips/2um24TWdty0NA',
+        'url': 'https://medal.tv/games/cod-cold-war/clips/2um24TWdty0NA',
         'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148',
         'info_dict': {
             'id': '2um24TWdty0NA',
@@ -69,7 +68,7 @@ class MedalTVIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
             'duration': 9,
-        }
+        },
     }, {
         'url': 'https://medal.tv/games/valorant/clips/37rMeFpryCC-9',
         'only_matching': True,
@@ -80,25 +79,14 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
-        path = self._match_valid_url(url).group('path')
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, video_id, query={'mobilebypass': 'true'})
 
-        next_data = self._search_json(
-            '<script[^>]*__NEXT_DATA__[^>]*>', webpage,
+        hydration_data = self._search_json(
+            r'<script[^>]*>[^<]*\bhydrationData\s*=', webpage,
             'next data', video_id, end_pattern='</script>', fatal=False)
 
-        build_id = next_data.get('buildId')
-        if not build_id:
-            raise ExtractorError(
-                'Could not find build ID.', video_id=video_id)
-
-        locale = next_data.get('locale', 'en')
-
-        api_response = self._download_json(
-            f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id)
-
-        clip = traverse_obj(api_response, ('pageProps', 'clip')) or {}
+        clip = traverse_obj(hydration_data, ('clips', ...), get_all=False)
         if not clip:
             raise ExtractorError(
                 'Could not find video information.', video_id=video_id)
@@ -119,13 +107,13 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None):
             'url': item_url,
             id_key: item_id,
             'width': width,
-            'height': height
+            'height': height,
         })
 
         formats = []
         thumbnails = []
         for k, v in clip.items():
-            if not (v and isinstance(v, compat_str)):
+            if not (v and isinstance(v, str)):
                 continue
             mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k)
             if not mobj:
@@ -147,12 +135,12 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None):
                 expected=True, video_id=video_id)
             else:
                 self.raise_no_formats(
-                    'An unknown error occurred ({0}).'.format(error),
+                    f'An unknown error occurred ({error}).',
                    video_id=video_id)

        # Necessary because the id of the author is not known in advance.
        # Won't raise an issue if no profile can be found as this is optional.
- author = traverse_obj(api_response, ('pageProps', 'profile')) or {} + author = traverse_obj(hydration_data, ('profiles', ...), get_all=False) or {} author_id = str_or_none(author.get('userId')) author_url = format_field(author_id, None, 'https://medal.tv/users/%s') diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py index 0f9079b112..b3fa6a197f 100644 --- a/yt_dlp/extractor/mediaite.py +++ b/yt_dlp/extractor/mediaite.py @@ -2,7 +2,7 @@ class MediaiteIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mediaite.com(?!/category)(?:/[\w-]+){2}' + _VALID_URL = r'https?://(?:www\.)?mediaite\.com(?!/category)(?:/[\w-]+){2}' _TESTS = [{ 'url': 'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/', 'info_dict': { @@ -15,7 +15,7 @@ class MediaiteIE(InfoExtractor): 'timestamp': 1631630185, 'upload_date': '20210914', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.mediaite.com/tv/joe-scarborough-goes-off-on-tax-breaks-for-super-wealthy-largest-income-redistribution-scam-in-american-history/', 'info_dict': { @@ -28,7 +28,7 @@ class MediaiteIE(InfoExtractor): 'timestamp': 1631618057, 'upload_date': '20210914', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.mediaite.com/politics/watch-rudy-giuliani-impersonates-queen-elizabeth-calls-mark-milley-an-asshle-in-bizarre-9-11-speech/', 'info_dict': { @@ -41,7 +41,7 @@ class MediaiteIE(InfoExtractor): 'timestamp': 1631536476, 'upload_date': '20210913', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.mediaite.com/podcasts/clarissa-ward-says-she-decided-to-become-a-journalist-on-9-11/', 'info_dict': { @@ -54,7 +54,7 @@ class MediaiteIE(InfoExtractor): 'timestamp': 1631311188, 'upload_date': '20210910', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.mediaite.com/opinion/mainstream-media-ignores-rose-mcgowans-bombshell-allegation-that-newsoms-wife-tried-to-silence-her-on-weinstein/', 'info_dict': { @@ -67,7 +67,7 @@ class MediaiteIE(InfoExtractor): 'timestamp': 1631553328, 'upload_date': '20210913', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.mediaite.com/news/watch-cnbcs-jim-cramer-says-nobody-wants-to-die-getting-infected-by-unvaccinated-coworker-even-for-22-an-hour/', 'info_dict': { @@ -80,11 +80,25 @@ class MediaiteIE(InfoExtractor): 'timestamp': 1633014214, 'upload_date': '20210930', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, + }, { + 'url': 'https://www.mediaite.com/politics/i-cant-read-it-fast-enough-while-defending-trump-larry-kudlow-overwhelmed-by-volume-of-ex-presidents-legal-troubles/', + 'info_dict': { + 'id': 'E6EhDX5z', + 'ext': 'mp4', + 'title': 'Fox Business Network - 4:00 PM - 5:00 PM - 1:39:42 pm - 1:42:20 pm', + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/E6EhDX5z/poster.jpg?width=720', + 'duration': 157, + 'timestamp': 1691015535, + 'upload_date': '20230802', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): webpage = self._download_webpage(url, None) - id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id') - data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id) + video_id = self._search_regex( + 
[r'"https://cdn\.jwplayer\.com/players/(\w+)', r'data-video-id\s*=\s*\"([^\"]+)\"'], webpage, 'id') + data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{video_id}', video_id) return self._parse_jwplayer_data(data_json) diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index 46365081b7..f51342060b 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -1,10 +1,11 @@ -from ..utils import ( - unified_strdate -) +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_str +from ..utils import ( + ExtractorError, + traverse_obj, + unified_strdate, + url_or_none, ) @@ -15,53 +16,102 @@ class MediaKlikkIE(InfoExtractor): (?P<id>[^/#?_]+)''' _TESTS = [{ - # mediaklikk. date in html. + # (old) mediaklikk. date in html. 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', 'info_dict': { 'id': '4754129', 'title': 'Hazajáró, DÉLNYUGAT-BÁCSKA – A Duna mentén Palánkától Doroszlóig', 'ext': 'mp4', 'upload_date': '20210901', - 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg' - } + 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg', + }, + 'skip': 'Webpage redirects to 404 page', }, { - # m4sport + # mediaklikk. date in html. + 'url': 'https://mediaklikk.hu/video/hazajaro-fabova-hegyseg-kishont-koronaja/', + 'info_dict': { + 'id': '6696133', + 'title': 'Hazajáró, Fabova-hegység - Kishont koronája', + 'display_id': 'hazajaro-fabova-hegyseg-kishont-koronaja', + 'ext': 'mp4', + 'upload_date': '20230903', + 'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg', + }, + }, { + # (old) m4sport 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', 'info_dict': { 'id': '4754999', 'title': 'Gyémánt Liga, Párizs', 'ext': 'mp4', 'upload_date': '20210830', - 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg' - } + 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg', + }, + 'skip': 'Webpage redirects to 404 page', + }, { + # m4sport + 'url': 'https://m4sport.hu/sportkozvetitesek/video/2023/09/08/atletika-gyemant-liga-brusszel/', + 'info_dict': { + 'id': '6711136', + 'title': 'Atlétika – Gyémánt Liga, Brüsszel', + 'display_id': 'atletika-gyemant-liga-brusszel', + 'ext': 'mp4', + 'upload_date': '20230908', + 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg', + }, }, { # m4sport with *video/ url and no date 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/', 'info_dict': { 'id': '4492099', 'title': 'Real Madrid - Chelsea 1-1', + 'display_id': 'real-madrid-chelsea-1-1', 'ext': 'mp4', - 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png' - } + 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png', + }, }, { - # hirado + # (old) hirado 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', 'info_dict': { 'id': '4760120', 'title': 'Feltételeket szabott a főváros', 'ext': 'mp4', - 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg' - } + 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg', + }, + 'skip': 'Webpage redirects to 
video list page', }, { - # petofilive + # hirado + 'url': 'https://hirado.hu/belfold/video/2023/09/11/marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal', + 'info_dict': { + 'id': '6716068', + 'title': 'Marad az éves elszámolás a napelemekre beruházó családoknál', + 'display_id': 'marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal', + 'ext': 'mp4', + 'upload_date': '20230911', + 'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg', + }, + }, { + # (old) petofilive 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', 'info_dict': { 'id': '4571948', 'title': 'Tha Shudras az Akusztikban', 'ext': 'mp4', 'upload_date': '20210607', - 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg' - } + 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg', + }, + 'skip': 'Webpage redirects to empty page', + }, { + # petofilive + 'url': 'https://petofilive.hu/video/2023/09/09/futball-fesztival-a-margitszigeten/', + 'info_dict': { + 'id': '6713233', + 'title': 'Futball Fesztivál a Margitszigeten', + 'display_id': 'futball-fesztival-a-margitszigeten', + 'ext': 'mp4', + 'upload_date': '20230909', + 'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg', + }, }] def _real_extract(self, url): @@ -71,21 +121,27 @@ def _real_extract(self, url): player_data_str = self._html_search_regex( r'mtva_player_manager\.player\(document.getElementById\(.*\),\s?(\{.*\}).*\);', webpage, 'player data') - player_data = self._parse_json(player_data_str, display_id, compat_urllib_parse_unquote) - video_id = compat_str(player_data['contentId']) + player_data = self._parse_json(player_data_str, display_id, urllib.parse.unquote) + video_id = str(player_data['contentId']) title = player_data.get('title') or self._og_search_title(webpage, fatal=False) or \ self._html_search_regex(r'<h\d+\b[^>]+\bclass="article_title">([^<]+)<', webpage, 'title') upload_date = unified_strdate( - '%s-%s-%s' % (mobj.group('year'), mobj.group('month'), mobj.group('day'))) + '{}-{}-{}'.format(mobj.group('year'), mobj.group('month'), mobj.group('day'))) if not upload_date: upload_date = unified_strdate(self._html_search_regex( r'<p+\b[^>]+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None)) player_data['video'] = player_data.pop('token') - player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) - playlist_url = self._proto_relative_url(compat_urllib_parse_unquote( - self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/')) + player_page = self._download_webpage( + 'https://player.mediaklikk.hu/playernew/player.php', video_id, + query=player_data, headers={'Referer': url}) + player_json = self._search_json( + r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);') + playlist_url = traverse_obj( + player_json, ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}), get_all=False) + if not playlist_url: + raise ExtractorError('Unable to extract playlist url') formats = self._extract_wowza_formats( playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) @@ -96,5 +152,5 @@ def _real_extract(self, url): 'display_id': display_id, 'formats': formats, 'upload_date': upload_date, - 'thumbnail': player_data.get('bgImage') or 
self._og_search_thumbnail(webpage) + 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage), } diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index 1fa5299141..8cb18e6096 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -5,11 +5,11 @@ from ..utils import ( ExtractorError, GeoRestrictedError, - int_or_none, OnDemandPagedList, + int_or_none, try_get, - urljoin, update_url_query, + urljoin, ) @@ -29,7 +29,7 @@ class MediasetIE(ThePlatformBaseIE): ''' _EMBED_REGEX = [ - rf'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:\w+\.)+mediaset\.it/player/(?:v\d+/)?index\.html\?\S*?programGuid={_GUID_RE})[\'"&]' + rf'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:\w+\.)+mediaset\.it/player/(?:v\d+/)?index\.html\?\S*?programGuid={_GUID_RE})[\'"&]', ] _TESTS = [{ # full episode @@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE): 'season_number': 5, 'episode_number': 5, 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}], + 'categories': ['Informazione'], }, }, { # DRM @@ -127,7 +128,8 @@ class MediasetIE(ThePlatformBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Dead link', }, { # WittyTV embed 'url': 'https://www.wittytv.it/mauriziocostanzoshow/ultima-puntata-venerdi-25-novembre/', @@ -148,16 +150,19 @@ class MediasetIE(ThePlatformBaseIE): 'season_number': 12, 'episode': 'Episode 8', 'episode_number': 8, + 'categories': ['Intrattenimento'], }, 'params': { 'skip_download': True, - } + }, }] - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): for video in smil.findall(self._xpath_ns('.//video', namespace)): video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) - return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) + return super()._parse_smil_formats_and_subtitles( + smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) def _check_drm_formats(self, tp_formats, video_id): has_nondrm, drm_manifest = False, '' diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index fe549c49fb..ad7ab27e28 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -1,31 +1,28 @@ -import re import json +import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( ExtractorError, float_or_none, mimetype2ext, + smuggle_url, str_or_none, try_call, try_get, - smuggle_url, unsmuggle_url, url_or_none, urljoin, ) - +from ..utils.traversal import traverse_obj _ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' class MediasiteIE(InfoExtractor): - _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE - _EMBED_REGEX = [r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE] + _VALID_URL = rf'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>{_ID_RE})(?P<query>\?[^#]+|)' + _EMBED_REGEX = [rf'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/{_ID_RE}(?:\?.*?)?)\1'] _TESTS = [ { 'url': 
'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -87,7 +84,7 @@ class MediasiteIE(InfoExtractor): 'upload_date': '20120409', 'timestamp': 1333983600, 'duration': 7794, - } + }, }, { 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', @@ -101,7 +98,7 @@ class MediasiteIE(InfoExtractor): # dashed id 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', 'only_matching': True, - } + }, ] # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) @@ -118,16 +115,16 @@ def _extract_embed_urls(cls, url, webpage): for embed_url in super()._extract_embed_urls(url, webpage): yield smuggle_url(embed_url, {'UrlReferrer': url}) - def __extract_slides(self, *, stream_id, snum, Stream, duration, images): - slide_base_url = Stream['SlideBaseUrl'] + def __extract_slides(self, *, stream_id, snum, stream, duration, images): + slide_base_url = stream['SlideBaseUrl'] - fname_template = Stream['SlideImageFileNameTemplate'] + fname_template = stream['SlideImageFileNameTemplate'] if fname_template != 'slide_{0:D4}.jpg': self.report_warning('Unusual slide file name template; report a bug if slide downloading fails') fname_template = re.sub(r'\{0:D([0-9]+)\}', r'{0:0\1}', fname_template) fragments = [] - for i, slide in enumerate(Stream['Slides']): + for i, slide in enumerate(stream['Slides']): if i == 0: if slide['Time'] > 0: default_slide = images.get('DefaultSlide') @@ -142,18 +139,18 @@ def __extract_slides(self, *, stream_id, snum, Stream, duration, images): }) next_time = try_call( - lambda: Stream['Slides'][i + 1]['Time'], + lambda: stream['Slides'][i + 1]['Time'], lambda: duration, lambda: slide['Time'], expected_type=(int, float)) fragments.append({ 'path': fname_template.format(slide.get('Number', i + 1)), - 'duration': (next_time - slide['Time']) / 1000 + 'duration': (next_time - slide['Time']) / 1000, }) return { - 'format_id': '%s-%u.slides' % (stream_id, snum), + 'format_id': f'{stream_id}-{snum}.slides', 'ext': 'mhtml', 'url': slide_base_url, 'protocol': 'mhtml', @@ -171,15 +168,15 @@ def _real_extract(self, url): query = mobj.group('query') webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? 
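# --- Editor's annotation (not part of the patch): the __extract_slides hunk
# above turns Mediasite slide metadata into an mhtml "format" whose fragments
# each carry a duration. A minimal sketch of that timing logic under made-up
# slide data; 'Time' and the presentation duration are in milliseconds, as the
# patch's `/ 1000` arithmetic implies, and the path template mirrors the
# default 'slide_{0:D4}.jpg' after its .NET-style width is rewritten to '{0:04}'.
slides = [{'Time': 0, 'Number': 1}, {'Time': 4000, 'Number': 2}, {'Time': 9500, 'Number': 3}]
duration_ms = 15000  # stand-in for presentation['Duration']

fragments = []
for i, slide in enumerate(slides):
    # each slide runs until the next slide's start; the last one until the end
    next_time = slides[i + 1]['Time'] if i + 1 < len(slides) else duration_ms
    fragments.append({
        'path': 'slide_{0:04}.jpg'.format(slide['Number']),
        'duration': (next_time - slide['Time']) / 1000,
    })

assert fragments[0]['path'] == 'slide_0001.jpg'
assert [f['duration'] for f in fragments] == [4.0, 5.5, 5.5]
# --- end annotation ---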
- redirect_url = urlh.geturl() + redirect_url = urlh.url # XXX: might have also extracted UrlReferrer and QueryString from the html - service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( + service_path = urllib.parse.urljoin(redirect_url, self._html_search_regex( r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id, default='/Mediasite/PlayerService/PlayerService.svc/json')) player_options = self._download_json( - '%s/GetPlayerOptions' % service_path, resource_id, + f'{service_path}/GetPlayerOptions', resource_id, headers={ 'Content-type': 'application/json; charset=utf-8', 'X-Requested-With': 'XMLHttpRequest', @@ -190,25 +187,25 @@ def _real_extract(self, url): 'QueryString': query, 'UrlReferrer': data.get('UrlReferrer', ''), 'UseScreenReader': False, - } - }).encode('utf-8'))['d'] + }, + }).encode())['d'] presentation = player_options['Presentation'] title = presentation['Title'] if presentation is None: raise ExtractorError( - 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], + 'Mediasite says: {}'.format(player_options['PlayerPresentationStatusMessage']), expected=True) thumbnails = [] formats = [] - for snum, Stream in enumerate(presentation['Streams']): - stream_type = Stream.get('StreamType') + for snum, stream in enumerate(presentation['Streams']): + stream_type = stream.get('StreamType') if stream_type is None: continue - video_urls = Stream.get('VideoUrls') + video_urls = stream.get('VideoUrls') if not isinstance(video_urls, list): video_urls = [] @@ -216,36 +213,42 @@ def _real_extract(self, url): stream_type, 'type%u' % stream_type) stream_formats = [] - for unum, VideoUrl in enumerate(video_urls): - video_url = url_or_none(VideoUrl.get('Location')) + for unum, video in enumerate(video_urls): + video_url = url_or_none(video.get('Location')) if not video_url: continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS - media_type = VideoUrl.get('MediaType') + media_type = video.get('MediaType') + ext = mimetype2ext(video.get('MimeType')) if media_type == 'SS': stream_formats.extend(self._extract_ism_formats( video_url, resource_id, - ism_id='%s-%u.%u' % (stream_id, snum, unum), + ism_id=f'{stream_id}-{snum}.{unum}', fatal=False)) elif media_type == 'Dash': stream_formats.extend(self._extract_mpd_formats( video_url, resource_id, - mpd_id='%s-%u.%u' % (stream_id, snum, unum), + mpd_id=f'{stream_id}-{snum}.{unum}', + fatal=False)) + elif ext in ('m3u', 'm3u8'): + stream_formats.extend(self._extract_m3u8_formats( + video_url, resource_id, + m3u8_id=f'{stream_id}-{snum}.{unum}', fatal=False)) else: stream_formats.append({ - 'format_id': '%s-%u.%u' % (stream_id, snum, unum), + 'format_id': f'{stream_id}-{snum}.{unum}', 'url': video_url, - 'ext': mimetype2ext(VideoUrl.get('MimeType')), + 'ext': ext, }) - if Stream.get('HasSlideContent', False): - images = player_options['PlayerLayoutOptions']['Images'] + images = traverse_obj(player_options, ('PlayerLayoutOptions', 'Images', {dict})) + if stream.get('HasSlideContent') and images: stream_formats.append(self.__extract_slides( stream_id=stream_id, snum=snum, - Stream=Stream, + stream=stream, duration=presentation.get('Duration'), images=images, )) @@ -255,10 +258,10 @@ def _real_extract(self, url): for fmt in stream_formats: fmt['quality'] = -10 - thumbnail_url = Stream.get('ThumbnailUrl') + thumbnail_url = stream.get('ThumbnailUrl') if thumbnail_url: thumbnails.append({ - 'id': '%s-%u' % (stream_id, snum), + 'id': f'{stream_id}-{snum}', 'url': 
urljoin(redirect_url, thumbnail_url), 'preference': -1 if stream_type != 0 else 0, }) @@ -279,15 +282,15 @@ def _real_extract(self, url): class MediasiteCatalogIE(InfoExtractor): - _VALID_URL = r'''(?xi) + _VALID_URL = rf'''(?xi) (?P<url>https?://[^/]+/Mediasite) /Catalog/Full/ - (?P<catalog_id>{0}) + (?P<catalog_id>{_ID_RE}) (?: - /(?P<current_folder_id>{0}) - /(?P<root_dynamic_folder_id>{0}) + /(?P<current_folder_id>{_ID_RE}) + /(?P<root_dynamic_folder_id>{_ID_RE}) )? - '''.format(_ID_RE) + ''' _TESTS = [{ 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', 'info_dict': { @@ -369,7 +372,7 @@ def _real_extract(self, url): headers[anti_forgery_header] = anti_forgery_token catalog = self._download_json( - '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, + f'{mediasite_url}/Catalog/Data/GetPresentationsForFolder', catalog_id, data=json.dumps(data).encode(), headers=headers) entries = [] @@ -380,13 +383,13 @@ def _real_extract(self, url): if not video_id: continue entries.append(self.url_result( - '%s/Play/%s' % (mediasite_url, video_id), + f'{mediasite_url}/Play/{video_id}', ie=MediasiteIE.ie_key(), video_id=video_id)) title = try_get( - catalog, lambda x: x['CurrentFolder']['Name'], compat_str) + catalog, lambda x: x['CurrentFolder']['Name'], str) - return self.playlist_result(entries, catalog_id, title,) + return self.playlist_result(entries, catalog_id, title) class MediasiteNamedCatalogIE(InfoExtractor): @@ -404,8 +407,8 @@ def _real_extract(self, url): webpage = self._download_webpage(url, catalog_name) catalog_id = self._search_regex( - r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') + rf'CatalogId\s*:\s*["\']({_ID_RE})', webpage, 'catalog id') return self.url_result( - '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), + f'{mediasite_url}/Catalog/Full/{catalog_id}', ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id) diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index 4d39495276..ae0fb2aed2 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -1,11 +1,48 @@ import re from .common import InfoExtractor -from ..utils import clean_html, get_element_html_by_class +from ..utils import ( + clean_html, + filter_dict, + parse_qs, + remove_end, + traverse_obj, + update_url_query, + urljoin, +) -class MediaStreamIE(InfoExtractor): - _VALID_URL = r'https?://mdstrm.com/(?:embed|live-stream)/(?P<id>\w+)' +class MediaStreamBaseIE(InfoExtractor): + _EMBED_BASE_URL = 'https://mdstrm.com/embed' + _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)' + + def _extract_mediastream_urls(self, webpage): + yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), ( + lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'), + {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None})) + + for mobj in re.finditer(r'<script[^>]+>[^>]*playerMdStream\.mdstreamVideo\(\s*[\'"](?P<video_id>\w+)', webpage): + yield f'{self._EMBED_BASE_URL}/{mobj.group("video_id")}' + + yield from re.findall( + rf'<iframe[^>]+\bsrc="({self._BASE_URL_RE}/\w+)', webpage) + + for mobj in re.finditer( + r'''(?x) + <(?:div|ps-mediastream)[^>]+ + (class="[^"]*MediaStreamVideoPlayer)[^"]*"[^>]+ + data-video-id="(?P<video_id>\w+)" + (?:\s*data-video-type="(?P<video_type>[^"]+))? + (?:[^>]*>\s*<div[^>]+\1[^"]*"[^>]+data-mediastream=["\'][^>]+ + https://mdstrm\.com/(?P<live>live-stream))? 
+ ''', webpage): + + video_type = 'live-stream' if mobj.group('video_type') == 'live' or mobj.group('live') else 'embed' + yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}' + + +class MediaStreamIE(MediaStreamBaseIE): + _VALID_URL = MediaStreamBaseIE._BASE_URL_RE + r'/(?P<id>\w+)' _TESTS = [{ 'url': 'https://mdstrm.com/embed/6318e3f1d1d316083ae48831', @@ -17,6 +54,7 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': r're:^https?://[^?#]+6318e3f1d1d316083ae48831', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }] _WEBPAGE_TESTS = [{ @@ -29,9 +67,7 @@ class MediaStreamIE(InfoExtractor): 'ext': 'mp4', 'live_status': 'is_live', }, - 'params': { - 'skip_download': 'Livestream' - }, + 'params': {'skip_download': 'Livestream'}, }, { 'url': 'https://www.multimedios.com/television/clases-de-llaves-y-castigos-quien-sabe-mas', 'md5': 'de31f0b1ecc321fb35bf22d58734ea40', @@ -42,6 +78,7 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': 're:^https?://[^?#]+63731bab8ec9b308a2c9ed28', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.americatv.com.pe/videos/esto-es-guerra/facundo-gonzalez-sufrio-fuerte-golpe-durante-competencia-frente-hugo-garcia-eeg-noticia-139120', 'info_dict': { @@ -51,6 +88,7 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': 're:^https?://[^?#]+63756df1c638b008a5659dec', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.americatv.com.pe/videos/al-fondo-hay-sitio/nuevas-lomas-town-bernardo-mata-se-enfrento-sujeto-luchar-amor-macarena-noticia-139083', 'info_dict': { @@ -60,40 +98,41 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': 're:^https?://[^?#]+637307669609130f74cd3a6e', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }] - @classmethod - def _extract_embed_urls(cls, url, webpage): - for mobj in re.finditer(r'<script[^>]+>[^>]*playerMdStream.mdstreamVideo\(\s*[\'"](?P<video_id>\w+)', webpage): - yield f'https://mdstrm.com/embed/{mobj.group("video_id")}' - - yield from re.findall( - r'<iframe[^>]src\s*=\s*"(https://mdstrm.com/[\w-]+/\w+)', webpage) - - for mobj in re.finditer( - r'''(?x) - <(?:div|ps-mediastream)[^>]+ - class\s*=\s*"[^"]*MediaStreamVideoPlayer[^"]*"[^>]+ - data-video-id\s*=\s*"(?P<video_id>\w+)\s*" - (?:\s*data-video-type\s*=\s*"(?P<video_type>[^"]+))? 
- ''', webpage): - - video_type = 'live-stream' if mobj.group('video_type') == 'live' else 'embed' - yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}' + def _extract_from_webpage(self, url, webpage): + for embed_url in self._extract_mediastream_urls(webpage): + yield self.url_result(embed_url, MediaStreamIE, None) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - if 'Debido a tu ubicación no puedes ver el contenido' in webpage: - self.raise_geo_restricted() + for message in [ + 'Debido a tu ubicación no puedes ver el contenido', + 'You are not allowed to watch this video: Geo Fencing Restriction', + 'Este contenido no está disponible en tu zona geográfica.', + 'El contenido sólo está disponible dentro de', + ]: + if message in webpage: + self.raise_geo_restricted() - player_config = self._search_json(r'window.MDSTRM.OPTIONS\s*=', webpage, 'metadata', video_id) + player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id) formats, subtitles = [], {} for video_format in player_config['src']: if video_format == 'hls': - fmts, subs = self._extract_m3u8_formats_and_subtitles(player_config['src'][video_format], video_id) + params = { + 'at': 'web-app', + 'access_token': traverse_obj(parse_qs(url), ('access_token', 0)), + } + for name, key in (('MDSTRMUID', 'uid'), ('MDSTRMSID', 'sid'), ('MDSTRMPID', 'pid'), ('VERSION', 'av')): + params[key] = self._search_regex( + rf'window\.{name}\s*=\s*["\']([^"\']+)["\'];', webpage, key, default=None) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + update_url_query(player_config['src'][video_format], filter_dict(params)), video_id) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif video_format == 'mpd': @@ -116,40 +155,72 @@ def _real_extract(self, url): } -class WinSportsVideoIE(InfoExtractor): - _VALID_URL = r'https?://www\.winsports\.co/videos/(?P<display_id>[\w-]+)-(?P<id>\d+)' +class WinSportsVideoIE(MediaStreamBaseIE): + _VALID_URL = r'https?://www\.winsports\.co/videos/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://www.winsports.co/videos/siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco-60536', 'info_dict': { 'id': '62dc8357162c4b0821fcfb3c', - 'display_id': 'siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco', + 'display_id': 'siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco-60536', 'title': '¡Siempre Castellanos! 
Gran atajada del portero \'cardenal\' para evitar la caída de su arco', 'description': 'md5:eb811b2b2882bdc59431732c06b905f2', 'thumbnail': r're:^https?://[^?#]+62dc8357162c4b0821fcfb3c', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.winsports.co/videos/observa-aqui-los-goles-del-empate-entre-tolima-y-nacional-60548', 'info_dict': { 'id': '62dcb875ef12a5526790b552', - 'display_id': 'observa-aqui-los-goles-del-empate-entre-tolima-y-nacional', + 'display_id': 'observa-aqui-los-goles-del-empate-entre-tolima-y-nacional-60548', 'title': 'Observa aquí los goles del empate entre Tolima y Nacional', 'description': 'md5:b19402ba6e46558b93fd24b873eea9c9', 'thumbnail': r're:^https?://[^?#]+62dcb875ef12a5526790b552', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.winsports.co/videos/equidad-vuelve-defender-su-arco-de-remates-de-junior', + 'info_dict': { + 'id': '63fa7eca72f1741ad3a4d515', + 'display_id': 'equidad-vuelve-defender-su-arco-de-remates-de-junior', + 'title': '⚽ Equidad vuelve a defender su arco de remates de Junior', + 'description': 'Remate de Sierra', + 'thumbnail': r're:^https?://[^?#]+63fa7eca72f1741ad3a4d515', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.winsports.co/videos/bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta', + 'info_dict': { + 'id': '6402adb62bbf3b18d454e1b0', + 'display_id': 'bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta', + 'title': '⚽Bucaramanga se quedó con el grito de gol en la garganta', + 'description': 'Gol anulado Bucaramanga', + 'thumbnail': r're:^https?://[^?#]+6402adb62bbf3b18d454e1b0', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).group('display_id', 'id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + data = self._search_json( + r'<script\s*[^>]+data-drupal-selector="drupal-settings-json">', webpage, 'data', display_id) - media_setting_json = self._search_json( - r'<script\s*[^>]+data-drupal-selector="drupal-settings-json">', webpage, 'drupal-setting-json', display_id) + mediastream_url = urljoin(f'{self._EMBED_BASE_URL}/', ( + traverse_obj(data, ( + (('settings', 'mediastream_formatter', ..., 'mediastream_id'), 'url'), {str}), get_all=False) + or next(self._extract_mediastream_urls(webpage), None))) - mediastream_id = media_setting_json['settings']['mediastream_formatter'][video_id]['mediastream_id'] + if not mediastream_url: + self.raise_no_formats('No MediaStream embed found in webpage') + + title = clean_html(remove_end( + self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={}).get('title') + or self._og_search_title(webpage), '| Win Sports')) return self.url_result( - f'https://mdstrm.com/embed/{mediastream_id}', MediaStreamIE, video_id, url_transparent=True, - display_id=display_id, video_title=clean_html(get_element_html_by_class('title-news', webpage))) + mediastream_url, MediaStreamIE, display_id, url_transparent=True, display_id=display_id, video_title=title) diff --git a/yt_dlp/extractor/mediaworksnz.py b/yt_dlp/extractor/mediaworksnz.py index 62e37d24a9..be67b631eb 100644 --- a/yt_dlp/extractor/mediaworksnz.py +++ b/yt_dlp/extractor/mediaworksnz.py @@ -24,8 +24,8 @@ class MediaWorksNZVODIE(InfoExtractor): 'timestamp': 1604268608, 'upload_date': '20201101', 'thumbnail': r're:^https?://.*\.jpg$', - 'channel': 'George FM' - } + 'channel': 'George FM', 
+ }, }, { # has audio-only format 'url': 'https://vodupload-api.mediaworks.nz/library/asset/published/VID02627', @@ -40,7 +40,7 @@ class MediaWorksNZVODIE(InfoExtractor): 'upload_date': '20220822', 'timestamp': 1661152289, }, - 'params': {'format': 'ba[ext=mp3]'} + 'params': {'format': 'ba[ext=mp3]'}, }] _WEBPAGE_TESTS = [{ @@ -55,7 +55,7 @@ class MediaWorksNZVODIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Socrates Walks Into A Bar Podcast Episode 1', 'upload_date': '20220720', - } + }, }] @classmethod @@ -63,7 +63,7 @@ def _extract_embed_urls(cls, url, webpage): for mobj in re.finditer( rf'''(?x)<div\s+\bid=["']Player-Attributes-JWID[^>]+\b data-request-url=["']{cls._VALID_URL_BASE_RE}["'][^>]+\b - data-asset-id=["']{cls._VALID_URL_ID_RE}["']''', webpage + data-asset-id=["']{cls._VALID_URL_ID_RE}["']''', webpage, ): yield f'https://vodupload-api.mediaworks.nz/library/asset/published/{mobj.group("id")}' diff --git a/yt_dlp/extractor/medici.py b/yt_dlp/extractor/medici.py index 328ccd2c9b..b6235b64df 100644 --- a/yt_dlp/extractor/medici.py +++ b/yt_dlp/extractor/medici.py @@ -1,67 +1,153 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( - unified_strdate, - update_url_query, - urlencode_postdata, + filter_dict, + parse_iso8601, + traverse_obj, + try_call, + url_or_none, ) class MediciIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)' - _TEST = { - 'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp', - 'md5': '004c21bb0a57248085b6ff3fec72719d', + _VALID_URL = r'https?://(?:(?P<sub>www|edu)\.)?medici\.tv/[a-z]{2}/[\w.-]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.medici.tv/en/operas/thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris', + 'md5': 'd483f74e7a7a9eac0dbe152ab189050d', 'info_dict': { - 'id': '3059', - 'ext': 'flv', - 'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson', - 'description': 'md5:322a1e952bafb725174fd8c1a8212f58', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20170408', + 'id': '8032', + 'ext': 'mp4', + 'title': 'Thomas Adès\'s The Exterminating Angel', + 'description': 'md5:708ae6350dadc604225b4a6e32482bab', + 'thumbnail': r're:https://.+/.+\.jpg', + 'upload_date': '20240304', + 'timestamp': 1709561766, + 'display_id': 'thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris', }, - } + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://edu.medici.tv/en/operas/wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum', + 'md5': '4ef3f4079a6e1c617584463a9eb84f99', + 'info_dict': { + 'id': '7900', + 'ext': 'mp4', + 'title': 'Wagner\'s Lohengrin', + 'description': 'md5:a384a62937866101f86902f21752cd89', + 'thumbnail': r're:https://.+/.+\.jpg', + 'upload_date': '20231017', + 'timestamp': 1697554771, + 'display_id': 'wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://www.medici.tv/en/concerts/sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello', + 'md5': '9dd757e53b22b2511e85ea9ea60e4815', + 'info_dict': { + 'id': '5712', + 'ext': 'mp4', + 'title': 'Sergey Smbatyan conducts Tigran Mansurian — With Chouchane Siranossian and Mario Brunello', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:9411fe44c874bb10e9af288c65816e41', + 'upload_date': 
'20200323', + 'timestamp': 1584975600, + 'display_id': 'sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://www.medici.tv/en/ballets/carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma', + 'md5': '40f5e76cb701a97a6d7ba23b62c49990', + 'info_dict': { + 'id': '7857', + 'ext': 'mp4', + 'title': 'Carmen by Jiří Bubeníček after Roland Petit, music by Bizet, de Falla, Castelnuovo-Tedesco, and Bonolis', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:0f15a15611ed748020c769873e10a8bb', + 'upload_date': '20240223', + 'timestamp': 1708707600, + 'display_id': 'carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://www.medici.tv/en/documentaries/la-sonnambula-liege-2023-documentaire', + 'md5': '87ff198018ce79a34757ab0dd6f21080', + 'info_dict': { + 'id': '7513', + 'ext': 'mp4', + 'title': 'La Sonnambula', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:0caf9109a860fd50cd018df062a67f34', + 'upload_date': '20231103', + 'timestamp': 1699010830, + 'display_id': 'la-sonnambula-liege-2023-documentaire', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://edu.medici.tv/en/masterclasses/yvonne-loriod-olivier-messiaen', + 'md5': 'fb5dcec46d76ad20fbdbaabb01da191d', + 'info_dict': { + 'id': '3024', + 'ext': 'mp4', + 'title': 'Olivier Messiaen and Yvonne Loriod, pianists and teachers', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:aab948e2f7690214b5c28896c83f1fc1', + 'upload_date': '20150223', + 'timestamp': 1424706608, + 'display_id': 'yvonne-loriod-olivier-messiaen', + }, + 'skip': 'Requires authentication; preview starts in the middle', + }, { + 'url': 'https://www.medici.tv/en/jazz/makaya-mccraven-la-rochelle', + 'md5': '4cc279a8b06609782747c8f50beea2b3', + 'info_dict': { + 'id': '7922', + 'ext': 'mp4', + 'title': 'NEW: Makaya McCraven in La Rochelle', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:b5a8aaeb6993d8ccb18bde8abb8aa8d2', + 'upload_date': '20231228', + 'timestamp': 1703754863, + 'display_id': 'makaya-mccraven-la-rochelle', + }, + 'expected_warnings': [r'preview'], + }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id, subdomain = self._match_valid_url(url).group('id', 'sub') + self._request_webpage(url, display_id, 'Requesting CSRF token cookie') - # Sets csrftoken cookie - self._download_webpage(url, video_id) - - MEDICI_URL = 'http://www.medici.tv/' + subdomain = 'edu-' if subdomain == 'edu' else '' + origin = f'https://{urllib.parse.urlparse(url).hostname}' data = self._download_json( - MEDICI_URL, video_id, - data=urlencode_postdata({ - 'json': 'true', - 'page': '/%s' % video_id, - 'timezone_offset': -420, - }), headers={ - 'X-CSRFToken': self._get_cookies(url)['csrftoken'].value, - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': MEDICI_URL, - 'Content-Type': 'application/x-www-form-urlencoded', - }) + f'https://api.medici.tv/{subdomain}satie/edito/movie-file/{display_id}/', display_id, + headers=filter_dict({ + 'Authorization': try_call( + lambda: urllib.parse.unquote(self._get_cookies(url)['auth._token.mAuth'].value)), + 'Device-Type': 'web', + 'Origin': origin, + 'Referer': f'{origin}/', + 'Accept': 'application/json, text/plain, */*', + })) - video = data['video']['videos']['video1'] + if not traverse_obj(data, ('video', 'is_full_video')) and traverse_obj( + data, ('video', 
'is_limited_by_user_access')): + self.report_warning( + 'The full video is for subscribers only. Only previews will be downloaded. If you ' + 'have used the --cookies-from-browser option, try using the --cookies option instead') - title = video.get('nom') or data['title'] - - video_id = video.get('id') or video_id - formats = self._extract_f4m_formats( - update_url_query(video['url_akamai'], { - 'hdcore': '3.1.0', - 'plugin=aasp': '3.1.0.43.124', - }), video_id, f4m_id='hds') - - description = data.get('meta_description') - thumbnail = video.get('url_thumbnail') or data.get('main_image') - upload_date = unified_strdate(data['video'].get('date')) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + data['video']['video_url'], display_id, 'mp4') return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'id': str(data['id']), + 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('subtitle', {str}), + 'thumbnail': ('picture', {url_or_none}), + 'timestamp': ('date_publish', {parse_iso8601}), + }), } diff --git a/yt_dlp/extractor/megaphone.py b/yt_dlp/extractor/megaphone.py index af80523e32..d249a8492d 100644 --- a/yt_dlp/extractor/megaphone.py +++ b/yt_dlp/extractor/megaphone.py @@ -5,18 +5,18 @@ class MegaphoneIE(InfoExtractor): IE_NAME = 'megaphone.fm' IE_DESC = 'megaphone.fm embedded players' - _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' _EMBED_REGEX = [rf'<iframe[^>]*?\ssrc=["\'](?P<url>{_VALID_URL})'] _TEST = { - 'url': 'https://player.megaphone.fm/GLT9749789991?"', + 'url': 'https://player.megaphone.fm/GLT9749789991', 'md5': '4816a0de523eb3e972dc0dda2c191f96', 'info_dict': { 'id': 'GLT9749789991', 'ext': 'mp3', 'title': '#97 What Kind Of Idiot Gets Phished?', 'thumbnail': r're:^https://.*\.png.*$', - 'duration': 1776.26375, - 'author': 'Reply All', + 'duration': 1998.36, + 'creators': ['Reply All'], }, } @@ -40,7 +40,7 @@ def _real_extract(self, url): 'id': video_id, 'thumbnail': thumbnail, 'title': title, - 'author': author, + 'creators': [author] if author else None, 'duration': episode_data['duration'], 'formats': formats, } diff --git a/yt_dlp/extractor/megatvcom.py b/yt_dlp/extractor/megatvcom.py index 2f3f11f519..93c7e7dc08 100644 --- a/yt_dlp/extractor/megatvcom.py +++ b/yt_dlp/extractor/megatvcom.py @@ -1,14 +1,14 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( + ExtractorError, clean_html, determine_ext, - ExtractorError, extract_attributes, get_element_by_class, get_element_html_by_id, - HEADRequest, parse_qs, unescapeHTML, unified_timestamp, @@ -160,5 +160,5 @@ def _real_extract(self, url): canonical_url = self._request_webpage( HEADRequest(canonical_url), video_id, note='Resolve canonical URL', - errnote='Could not resolve canonical URL').geturl() + errnote='Could not resolve canonical URL').url return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id) diff --git a/yt_dlp/extractor/meipai.py b/yt_dlp/extractor/meipai.py index 1a6f3cd748..e4c145c727 100644 --- a/yt_dlp/extractor/meipai.py +++ b/yt_dlp/extractor/meipai.py @@ -25,7 +25,7 @@ class MeipaiIE(InfoExtractor): 'view_count': 35511, 'creator': '她她-TATA', 'tags': ['葉子', '阿桑', '余姿昀', '超級女聲'], - } + }, }, { # record of live streaming 'url': 'http://www.meipai.com/media/585526361', @@ -41,7 
+41,7 @@ class MeipaiIE(InfoExtractor): 'upload_date': '20160919', 'view_count': 1215, 'creator': '她她-TATA', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/melonvod.py b/yt_dlp/extractor/melonvod.py index 1d3fff8566..05d9de8dc1 100644 --- a/yt_dlp/extractor/melonvod.py +++ b/yt_dlp/extractor/melonvod.py @@ -20,7 +20,7 @@ class MelonVODIE(InfoExtractor): }, 'params': { 'skip_download': 'm3u8 download', - } + }, } def _real_extract(self, url): @@ -64,5 +64,5 @@ def _real_extract(self, url): 'thumbnail': thumbnail, 'upload_date': upload_date, 'duration': duration, - 'formats': formats + 'formats': formats, } diff --git a/yt_dlp/extractor/meta.py b/yt_dlp/extractor/meta.py deleted file mode 100644 index 7c11e6017e..0000000000 --- a/yt_dlp/extractor/meta.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from .pladform import PladformIE -from ..utils import ( - unescapeHTML, - int_or_none, - ExtractorError, -) - - -class METAIE(InfoExtractor): - _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://video.meta.ua/5502115.video', - 'md5': '71b6f3ee274bef16f1ab410f7f56b476', - 'info_dict': { - 'id': '5502115', - 'ext': 'mp4', - 'title': 'Sony Xperia Z camera test [HQ]', - 'description': 'Xperia Z shoots video in FullHD HDR.', - 'uploader_id': 'nomobile', - 'uploader': 'CHЁZA.TV', - 'upload_date': '20130211', - }, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://video.meta.ua/iframe/5502115', - 'only_matching': True, - }, { - # pladform embed - 'url': 'http://video.meta.ua/7121015.video', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - st_html5 = self._search_regex( - r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) - - if st_html5: - # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js - json_str = '' - for i in range(0, len(st_html5), 3): - json_str += '&#x0%s;' % st_html5[i:i + 3] - uppod_data = self._parse_json(unescapeHTML(json_str), video_id) - error = uppod_data.get('customnotfound') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_url = uppod_data['file'] - info = { - 'id': video_id, - 'url': video_url, - 'title': uppod_data.get('comment') or self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), - 'duration': int_or_none(self._og_search_property( - 'video:duration', webpage, default=None)), - } - if 'youtube.com/' in video_url: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - }) - return info - - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) diff --git a/yt_dlp/extractor/metacafe.py b/yt_dlp/extractor/metacafe.py deleted file mode 100644 index d7f5def0e0..0000000000 --- a/yt_dlp/extractor/metacafe.py +++ /dev/null @@ -1,281 +0,0 @@ -import json -import re -import urllib.parse - -from .common import InfoExtractor -from ..compat import compat_parse_qs, compat_urllib_parse_unquote -from ..utils import ( - ExtractorError, - determine_ext, - get_element_by_attribute, - int_or_none, - mimetype2ext, -) - - -class MetacafeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<id>[^/]+)/(?P<display_id>[^/?#]+)' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST =
'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = 'metacafe' - _TESTS = [ - # Youtube video - { - 'add_ie': ['Youtube'], - 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/', - 'info_dict': { - 'id': '_aUehQsCQtM', - 'ext': 'mp4', - 'upload_date': '20090102', - 'title': 'The Electric Company | "Short I" | PBS KIDS GO!', - 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8', - 'uploader': 'PBS', - 'uploader_id': 'PBS' - } - }, - # Normal metacafe video - { - 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', - 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad', - 'info_dict': { - 'id': '11121940', - 'ext': 'mp4', - 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4', - 'uploader': 'ign', - 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', - }, - 'skip': 'Page is temporarily unavailable.', - }, - # metacafe video with family filter - { - 'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/', - 'md5': 'b06082c5079bbdcde677a6291fbdf376', - 'info_dict': { - 'id': '2155630', - 'ext': 'mp4', - 'title': 'Adult Art By David Hart 156', - 'uploader': '63346', - 'description': 'md5:9afac8fc885252201ad14563694040fc', - }, - 'params': { - 'skip_download': True, - }, - }, - # AnyClip video - { - 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/', - 'info_dict': { - 'id': 'an-dVVXnuY7Jh77J', - 'ext': 'mp4', - 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', - 'uploader': 'AnyClip', - 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', - }, - }, - # age-restricted video - { - 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', - 'md5': '98dde7c1a35d02178e8ab7560fe8bd09', - 'info_dict': { - 'id': '5186653', - 'ext': 'mp4', - 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', - 'uploader': 'Dwayne Pipe', - 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b', - 'age_limit': 18, - }, - }, - # cbs video - { - 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/', - 'info_dict': { - 'id': '8VD4r_Zws8VP', - 'ext': 'flv', - 'title': 'Open: This is Face the Nation, February 9', - 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476', - 'duration': 96, - 'uploader': 'CBSI-NEW', - 'upload_date': '20140209', - 'timestamp': 1391959800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, - # Movieclips.com video - { - 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/', - 'info_dict': { - 'id': 'mv-Wy7ZU', - 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.', - 'uploader': 'movie_trailers', - 'duration': 176, - }, - 'params': { - 'skip_download': 'requires rtmpdump', - } - } - ] - - def report_disclaimer(self): - self.to_screen('Retrieving disclaimer') - - def _real_extract(self, url): - # Extract id and simplified title from URL - video_id, display_id = self._match_valid_url(url).groups() - - # the video may come from an external site - m_external = re.match(r'^(\w{2})-(.*)$', video_id) - if m_external is not None: - prefix, ext_id = m_external.groups() - # Check 
if video comes from YouTube - if prefix == 'yt': - return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube') - # CBS videos use theplatform.com - if prefix == 'cb': - return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - - headers = { - # Disable family filter - 'Cookie': 'user=%s; ' % urllib.parse.quote(json.dumps({'ffilter': False})) - } - - # AnyClip videos require the flashversion cookie so that we get the link - # to the mp4 file - if video_id.startswith('an-'): - headers['Cookie'] += 'flashVersion=0; ' - - # Retrieve video webpage to extract further information - webpage = self._download_webpage(url, video_id, headers=headers) - - error = get_element_by_attribute( - 'class', 'notfound-page-title', webpage) - if error: - raise ExtractorError(error, expected=True) - - video_title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - video_url = None - mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse_unquote(mobj.group(1)) - video_ext = determine_ext(mediaURL) - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - if video_url is None: - mobj = re.search(r'<video src="([^"]+)"', webpage) - if mobj: - video_url = mobj.group(1) - video_ext = 'mp4' - if video_url is None: - flashvars = self._search_regex( - r' name="flashvars" value="(.*?)"', webpage, 'flashvars', - default=None) - if flashvars: - vardict = compat_parse_qs(flashvars) - if 'mediaData' not in vardict: - raise ExtractorError('Unable to extract media URL') - mobj = re.search( - r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError('Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - video_ext = determine_ext(video_url) - if video_url is None: - player_url = self._search_regex( - r"swfobject\.embedSWF\('([^']+)'", - webpage, 'config URL', default=None) - if player_url: - config_url = self._search_regex( - r'config=(.+)$', player_url, 'config URL') - config_doc = self._download_xml( - config_url, video_id, - note='Downloading video config') - smil_url = config_doc.find('.//properties').attrib['smil_file'] - smil_doc = self._download_xml( - smil_url, video_id, - note='Downloading SMIL document') - base_url = smil_doc.find('./head/meta').attrib['base'] - video_url = [] - for vn in smil_doc.findall('.//video'): - br = int(vn.attrib['system-bitrate']) - play_path = vn.attrib['src'] - video_url.append({ - 'format_id': 'smil-%d' % br, - 'url': base_url, - 'play_path': play_path, - 'page_url': url, - 'player_url': player_url, - 'ext': play_path.partition(':')[0], - }) - if video_url is None: - flashvars = self._parse_json(self._search_regex( - r'flashvars\s*=\s*({.*});', webpage, 'flashvars', - default=None), video_id, fatal=False) - if flashvars: - video_url = [] - for source in flashvars.get('sources'): - source_url = source.get('src') - if not source_url: - continue - ext = mimetype2ext(source.get('type')) or determine_ext(source_url) - if ext == 'm3u8': - video_url.extend(self._extract_m3u8_formats( - source_url, 
video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - else: - video_url.append({ - 'url': source_url, - 'ext': ext, - }) - - if video_url is None: - raise ExtractorError('Unsupported video type') - - description = self._html_search_meta( - ['og:description', 'twitter:description', 'description'], - webpage, 'title', fatal=False) - thumbnail = self._html_search_meta( - ['og:image', 'twitter:image'], webpage, 'title', fatal=False) - video_uploader = self._html_search_regex( - r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', - webpage, 'uploader nickname', fatal=False) - duration = int_or_none( - self._html_search_meta('video:duration', webpage, default=None)) - age_limit = ( - 18 - if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) - else 0) - - if isinstance(video_url, list): - formats = video_url - else: - formats = [{ - 'url': video_url, - 'ext': video_ext, - }] - - return { - 'id': video_id, - 'display_id': display_id, - 'description': description, - 'uploader': video_uploader, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': age_limit, - 'formats': formats, - 'duration': duration, - } diff --git a/yt_dlp/extractor/metacritic.py b/yt_dlp/extractor/metacritic.py index 14410549ab..41e20a58e0 100644 --- a/yt_dlp/extractor/metacritic.py +++ b/yt_dlp/extractor/metacritic.py @@ -40,9 +40,9 @@ def _real_extract(self, url): clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] - for videoFile in clip.findall('httpURI/videoFile'): - rate_str = videoFile.find('rate').text - video_url = videoFile.find('filePath').text + for video_file in clip.findall('httpURI/videoFile'): + rate_str = video_file.find('rate').text + video_url = video_file.find('filePath').text formats.append({ 'url': video_url, 'ext': 'mp4', diff --git a/yt_dlp/extractor/mgoon.py b/yt_dlp/extractor/mgoon.py deleted file mode 100644 index 2388a71920..0000000000 --- a/yt_dlp/extractor/mgoon.py +++ /dev/null @@ -1,81 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - qualities, - unified_strdate, -) - - -class MgoonIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? 
- (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| - video\.mgoon\.com)/(?P<id>[0-9]+)''' - _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' - _TESTS = [ - { - 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148', - 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d', - 'info_dict': { - 'id': '5582148', - 'uploader_id': 'hi6618', - 'duration': 240.419, - 'upload_date': '20131220', - 'ext': 'mp4', - 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - 'url': 'http://www.mgoon.com/play/view/5582148', - 'only_matching': True, - }, - { - 'url': 'http://video.mgoon.com/5582148', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - data = self._download_json(self._API_URL.format(video_id), video_id) - - if data.get('errorInfo', {}).get('code') != 'NONE': - raise ExtractorError('%s encountered an error: %s' % ( - self.IE_NAME, data['errorInfo']['message']), expected=True) - - v_info = data['videoInfo'] - title = v_info.get('v_title') - thumbnail = v_info.get('v_thumbnail') - duration = v_info.get('v_duration') - upload_date = unified_strdate(v_info.get('v_reg_date')) - uploader_id = data.get('userInfo', {}).get('u_alias') - if duration: - duration /= 1000.0 - - age_limit = None - if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT': - age_limit = 18 - - formats = [] - get_quality = qualities(['360p', '480p', '720p', '1080p']) - for fmt in data['videoFiles']: - formats.append({ - 'format_id': fmt['label'], - 'quality': get_quality(fmt['label']), - 'url': fmt['url'], - 'ext': fmt['format'], - - }) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'age_limit': age_limit, - } diff --git a/yt_dlp/extractor/mgtv.py b/yt_dlp/extractor/mgtv.py index edc92b371f..d5dda06f99 100644 --- a/yt_dlp/extractor/mgtv.py +++ b/yt_dlp/extractor/mgtv.py @@ -3,15 +3,15 @@ import uuid from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + parse_resolution, + traverse_obj, try_get, url_or_none, + urljoin, ) @@ -30,16 +30,18 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15588271.html', 'info_dict': { 'id': '15588271', 'ext': 'mp4', - 'title': '春日迟迟再出发 沉浸版', + 'title': '春日迟迟再出发 沉浸版第1期:陆莹结婚半年查出肾炎被离婚 吴雅婷把一半票根退给前夫', 'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6', 'thumbnail': r're:^https?://.+\.jpg', 'duration': 4026, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/333652/7329822.html', 'info_dict': { @@ -50,6 +52,7 @@ class MGTVIE(InfoExtractor): 'thumbnail': r're:^https?://.+\.jpg', 'duration': 2656, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://w.mgtv.com/b/427837/15591647.html', 'only_matching': True, @@ -64,67 +67,79 @@ class MGTVIE(InfoExtractor): 'only_matching': True, }] + _RESOLUTIONS = { + '标清': ('480p', '854x480'), + '高清': ('540p', '960x540'), + '超清': ('720p', '1280x720'), + '蓝光': ('1080p', '1920x1080'), + } + def _real_extract(self, url): video_id = self._match_id(url) tk2 = base64.urlsafe_b64encode( - f'did={str(uuid.uuid4())}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1] + 
f'did={uuid.uuid4()}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ 'tk2': tk2, 'video_id': video_id, - 'type': 'pch5' + 'type': 'pch5', }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), None) if error.get('code') == 40005: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(error['msg'], expected=True) raise - info = api_data['info'] - title = info['title'].strip() + stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ - 'pm2': api_data['atc']['pm2'], 'tk2': tk2, + 'pm2': api_data['atc']['pm2'], 'video_id': video_id, + 'type': 'pch5', 'src': 'intelmgtv', }, headers=self.geo_verification_headers())['data'] - stream_domain = stream_data['stream_domain'][0] + stream_domain = traverse_obj(stream_data, ('stream_domain', ..., {url_or_none}), get_all=False) formats = [] - for idx, stream in enumerate(stream_data['stream']): - stream_path = stream.get('url') - if not stream_path: - continue - format_data = self._download_json( - stream_domain + stream_path, video_id, - note=f'Download video info for format #{idx}') - format_url = format_data.get('info') + for idx, stream in enumerate(traverse_obj(stream_data, ('stream', lambda _, v: v['url']))): + stream_name = traverse_obj(stream, 'name', 'standardName', 'barName', expected_type=str) + resolution = traverse_obj( + self._RESOLUTIONS, (stream_name, 1 if stream.get('scale') == '16:9' else 0)) + format_url = traverse_obj(self._download_json( + urljoin(stream_domain, stream['url']), video_id, fatal=False, + note=f'Downloading video info for format {resolution or stream_name}'), + ('info', {url_or_none})) if not format_url: continue tbr = int_or_none(stream.get('filebitrate') or self._search_regex( r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ - 'format_id': compat_str(tbr or idx), - 'url': url_or_none(format_url), + 'format_id': str(tbr or idx), + 'url': format_url, 'ext': 'mp4', 'tbr': tbr, + 'vcodec': stream.get('videoFormat'), + 'acodec': stream.get('audioFormat'), + **parse_resolution(resolution), 'protocol': 'm3u8_native', 'http_headers': { 'Referer': url, }, - 'format_note': stream.get('name'), + 'format_note': stream_name, }) return { 'id': video_id, - 'title': title, 'formats': formats, - 'description': info.get('desc'), - 'duration': int_or_none(info.get('duration')), - 'thumbnail': info.get('thumb'), + **traverse_obj(api_data, ('info', { + 'title': ('title', {str.strip}), + 'description': ('desc', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('thumb', {url_or_none}), + })), 'subtitles': self.extract_subtitles(video_id, stream_domain), } @@ -145,6 +160,6 @@ def _get_subtitles(self, video_id, domain): subtitles.setdefault(locale.lower(), []).append({ 'url': sub_url, 'name': sub.get('name'), - 'ext': 'srt' + 'ext': 'srt', }) return subtitles diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py deleted file mode 100644 index 329ce36582..0000000000 --- a/yt_dlp/extractor/miaopai.py +++ /dev/null @@ -1,36 +0,0 @@ -from .common import InfoExtractor - - -class MiaoPaiIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?miaopai\.com/show/(?P<id>[-A-Za-z0-9~_]+)' - _TEST = { - 'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm', - 'md5': '095ed3f1cd96b821add957bdc29f845b', - 'info_dict': { - 'id': 'n~0hO7sfV1nBEw4Y29-Hqg__', - 'ext': 'mp4', - 'title': '西游记音乐会的秒拍视频', - 'thumbnail': 're:^https?://.*/n~0hO7sfV1nBEw4Y29-Hqg___m.jpg', - } - } - - _USER_AGENT_IPAD = 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1' - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - - title = self._html_extract_title(webpage) - thumbnail = self._html_search_regex( - r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)', - webpage, 'thumbnail', fatal=False, group='url') - videos = self._parse_html5_media_entries(url, webpage, video_id) - info = videos[0] - - info.update({ - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - }) - return info diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index f71ab3e921..d0135f5a9c 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -1,5 +1,14 @@ +import re + from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj, unified_timestamp +from ..utils import ( + int_or_none, + parse_iso8601, + traverse_obj, + unified_timestamp, + url_basename, + url_or_none, +) class MicrosoftEmbedIE(InfoExtractor): @@ -15,8 +24,8 @@ class MicrosoftEmbedIE(InfoExtractor): 'thumbnail': 'http://img-prod-cms-rt-microsoft-com.akamaized.net/cms/api/am/imageFileData/RWL7Ju?ver=cae5', 'age_limit': 0, 'timestamp': 1631658316, - 'upload_date': '20210914' - } + 'upload_date': '20210914', + }, }] _API_URL = 'https://prod-video-cms-rt-microsoft-com.akamaized.net/vhs/api/videos/' @@ -63,3 +72,250 @@ def _real_extract(self, url): 'subtitles': subtitles, 'thumbnails': thumbnails, } + + +class MicrosoftMediusBaseIE(InfoExtractor): + @staticmethod + def _sub_to_dict(subtitle_list): + subtitles = {} + for sub in subtitle_list: + subtitles.setdefault(sub.pop('tag', 'und'), []).append(sub) + return subtitles + + def _extract_ism(self, ism_url, video_id): + formats = self._extract_ism_formats(ism_url, video_id) + for fmt in formats: + if fmt['language'] != 'eng' and 'English' not in fmt['format_id']: + fmt['language_preference'] = -10 + return formats + + +class MicrosoftMediusIE(MicrosoftMediusBaseIE): + _VALID_URL = r'https?://medius\.microsoft\.com/Embed/(?:Video\?id=|video-nc/|VideoDetails/)(?P<id>[\da-f-]+)' + + _TESTS = [{ + 'url': 'https://medius.microsoft.com/Embed/video-nc/9640d86c-f513-4889-959e-5dace86e7d2b', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments', + 'description': 'md5:33c8e4facadc438613476eea24165f71', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:30', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'info_dict': { + 'id': '81215af5-c813-4dcd-aede-94f4e1a7daa3', + 'ext': 'ismv', + 'title': 'Microsoft Build opening', + 'description': 'md5:43455096141077a1f23144cab8cec1cb', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + 'subtitles': 'count:31', + }, + 
}, { + 'url': 'https://medius.microsoft.com/Embed/VideoDetails/78493569-9b3b-4a85-a409-ee76e789e25c', + 'info_dict': { + 'id': '78493569-9b3b-4a85-a409-ee76e789e25c', + 'ext': 'ismv', + 'title': ' Anomaly Detection & Root cause at Edge', + 'description': 'md5:f8f1ad93d7918649bfb97fa081b03b83', + 'thumbnail': r're:https://mediusdownload.event.microsoft.com/asset.*\.jpg.*', + 'subtitles': 'count:17', + }, + }, { + 'url': 'https://medius.microsoft.com/Embed/Video?id=0dc69bda-079b-4070-a7db-a8da1a06a9c7', + 'only_matching': True, + }, { + 'url': 'https://medius.microsoft.com/Embed/video-nc/fe823a91-959c-465b-96d4-8f4db624f72c', + 'only_matching': True, + }] + + def _extract_subtitle(self, webpage, video_id): + captions = traverse_obj( + self._search_json(r'const\s+captionsConfiguration\s*=', webpage, 'captions', video_id, default=None), + ('languageList', lambda _, v: url_or_none(v['src']), { + 'url': 'src', + 'tag': ('srclang', {str}), + 'name': ('kind', {str}), + })) or [{'url': url, 'tag': url_basename(url).split('.vtt')[0].split('_')[-1]} + for url in re.findall(r'var\s+file\s+=\s+\{[^}]+\'(https://[^\']+\.vtt\?[^\']+)', webpage)] + + return self._sub_to_dict(captions) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://medius.microsoft.com/Embed/video-nc/{video_id}', video_id) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': self._extract_ism( + self._search_regex(r'StreamUrl\s*=\s*"([^"]+manifest)"', webpage, 'ism url'), video_id), + 'thumbnail': self._og_search_thumbnail(webpage), + 'subtitles': self._extract_subtitle(webpage, video_id), + } + + +class MicrosoftLearnPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?(?P<type>shows|events)/(?P<id>[\w-]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners', + 'info_dict': { + 'id': 'bash-for-beginners', + 'title': 'Bash for Beginners', + 'description': 'md5:16a91c07222117d1e00912f0dbc02c2c', + }, + 'playlist_count': 20, + }, { + 'url': 'https://learn.microsoft.com/en-us/events/build-2022', + 'info_dict': { + 'id': 'build-2022', + 'title': 'Microsoft Build 2022 - Events', + 'description': 'md5:c16b43848027df837b22c6fbac7648d3', + }, + 'playlist_count': 201, + }] + + def _entries(self, url_base, video_id): + skip = 0 + while True: + playlist_info = self._download_json(url_base, video_id, f'Downloading entries {skip}', query={ + 'locale': 'en-us', + '$skip': skip, + }) + url_paths = traverse_obj(playlist_info, ('results', ..., 'url', {str})) + for url_path in url_paths: + yield self.url_result(f'https://learn.microsoft.com/en-us{url_path}') + skip += len(url_paths) + if skip >= playlist_info.get('count', 0) or not url_paths: + break + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + webpage = self._download_webpage(url, playlist_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } + sub_type = 'episodes' if playlist_type == 'shows' else 'sessions' + + url_base = f'https://learn.microsoft.com/api/contentbrowser/search/{playlist_type}/{playlist_id}/{sub_type}' + return self.playlist_result(self._entries(url_base, playlist_id), playlist_id, **metainfo) + + +class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): + _VALID_URL = 
r'https?://learn\.microsoft\.com/(?:[\w-]+/)?shows/[\w-]+/(?P<id>[^?#/]+)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/shows/bash-for-beginners/what-is-the-difference-between-a-terminal-and-a-shell-2-of-20-bash-for-beginners/', + 'info_dict': { + 'id': 'd44e1a03-a0e5-45c2-9496-5c9fa08dc94c', + 'ext': 'ismv', + 'title': 'What is the Difference Between a Terminal and a Shell? (Part 2 of 20)', + 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', + 'timestamp': 1676339547, + 'upload_date': '20230214', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', + 'subtitles': 'count:14', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + entry_id = self._html_search_meta('entryId', webpage, 'entryId', fatal=True) + video_info = self._download_json( + f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + return { + 'id': entry_id, + 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'subtitles': self._sub_to_dict(traverse_obj(video_info, ( + 'publicVideo', 'captions', lambda _, v: url_or_none(v['url']), { + 'tag': ('language', {str}), + 'url': 'url', + }))), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + **traverse_obj(video_info, { + 'timestamp': ('createTime', {parse_iso8601}), + 'thumbnails': ('publicVideo', 'thumbnailOtherSizes', ..., {'url': {url_or_none}}), + }), + } + + +class MicrosoftLearnSessionIE(InfoExtractor): + _VALID_URL = r'https?://learn\.microsoft\.com/(?:[\w-]+/)?events/[\w-]+/(?P<id>[^?#/]+)' + _TESTS = [{ + 'url': 'https://learn.microsoft.com/en-us/events/build-2022/ts01-rapidly-code-test-ship-from-secure-cloud-developer-environments', + 'info_dict': { + 'id': '9640d86c-f513-4889-959e-5dace86e7d2b', + 'ext': 'ismv', + 'title': 'Rapidly code, test and ship from secure cloud developer environments - Events', + 'description': 'md5:f26c1a85d41c1cffd27a0279254a25c3', + 'timestamp': 1653408600, + 'upload_date': '20220524', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + metainfo = { + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'timestamp': parse_iso8601(self._html_search_meta('startDate', webpage, 'startDate')), + } + + return self.url_result( + self._html_search_meta('externalVideoUrl', webpage, 'videoUrl', fatal=True), + url_transparent=True, ie=MicrosoftMediusIE, **metainfo) + + +class MicrosoftBuildIE(InfoExtractor): + _VALID_URL = [ + r'https?://build\.microsoft\.com/[\w-]+/sessions/(?P<id>[\da-f-]+)', + r'https?://build\.microsoft\.com/[\w-]+/(?P<id>sessions)/?(?:[?#]|$)', + ] + + _TESTS = [{ + 'url': 'https://build.microsoft.com/en-US/sessions/b49feb31-afcd-4217-a538-d3ca1d171198?source=sessions', + 'info_dict': { + 'id': 'aee55fb5-fcf9-4b38-b764-a3527cb57554', + 'ext': 'ismv', + 'title': 'Microsoft Build opening keynote', + 'description': 'md5:d38338f336ef4b6ef9ad2a7466a76655', + 'timestamp': 1716307200, + 'upload_date': '20240521', + 'thumbnail': r're:https://mediusimg\.event\.microsoft\.com/video-\d+/thumbnail\.jpg.*', + }, + }, { + 'url': 'https://build.microsoft.com/en-US/sessions', + 'info_dict': { + 'id': 'sessions', + }, + 'playlist_mincount': 418, + }] + + def _real_extract(self, url): + video_id = 
self._match_id(url) + + entries = [ + self.url_result( + video_info['onDemand'], ie=MicrosoftMediusIE, url_transparent=True, **traverse_obj(video_info, { + 'id': ('sessionId', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('startDateTime', {parse_iso8601}), + })) + for video_info in self._download_json( + 'https://api-v2.build.microsoft.com/api/session/all/en-US', video_id, 'Downloading video info') + ] + if video_id == 'sessions': + return self.playlist_result(entries, video_id) + else: + return traverse_obj(entries, (lambda _, v: v['id'] == video_id), get_all=False) diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py index 9b50996b70..b138810838 100644 --- a/yt_dlp/extractor/microsoftstream.py +++ b/yt_dlp/extractor/microsoftstream.py @@ -1,10 +1,10 @@ -from base64 import b64decode +import base64 from .common import InfoExtractor from ..utils import ( merge_dicts, - parse_iso8601, parse_duration, + parse_iso8601, parse_resolution, try_get, url_basename, @@ -37,11 +37,11 @@ def _get_all_subtitles(self, api_url, video_id, headers): sub_dict = automatic_captions if track.get('autoGenerated') else subtitles sub_dict.setdefault(track['language'], []).append({ 'ext': 'vtt', - 'url': track.get('url') + 'url': track.get('url'), }) return { 'subtitles': subtitles, - 'automatic_captions': automatic_captions + 'automatic_captions': automatic_captions, } def extract_all_subtitles(self, *args, **kwargs): @@ -66,7 +66,7 @@ def _real_extract(self, url): f'{api_url}/videos/{video_id}', video_id, headers=headers, query={ '$expand': 'creator,tokens,status,liveEvent,extensions', - 'api-version': '1.4-private' + 'api-version': '1.4-private', }) video_id = video_data.get('id') or video_id language = video_data.get('language') @@ -81,7 +81,7 @@ def _real_extract(self, url): 'url': thumbnail_url, } thumb_name = url_basename(thumbnail_url) - thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) + thumb_name = str(base64.b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) thumb.update(parse_resolution(thumb_name)) thumbnails.append(thumb) diff --git a/yt_dlp/extractor/microsoftvirtualacademy.py b/yt_dlp/extractor/microsoftvirtualacademy.py deleted file mode 100644 index b759b1860b..0000000000 --- a/yt_dlp/extractor/microsoftvirtualacademy.py +++ /dev/null @@ -1,189 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - smuggle_url, - unsmuggle_url, - xpath_text, -) - - -class MicrosoftVirtualAcademyBaseIE(InfoExtractor): - def _extract_base_url(self, course_id, display_id): - return self._download_json( - 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, - display_id, 'Downloading course base URL') - - def _extract_chapter_and_title(self, title): - if not title: - return None, None - m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title) - return (int(m.group('chapter')), m.group('title')) if m else (None, title) - - -class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva' - IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', - 'md5': 
'7826c44fc31678b12ad8db11f6b5abb9', - 'info_dict': { - 'id': 'gfVXISmEB_6804984382', - 'ext': 'mp4', - 'title': 'Course Introduction', - 'formats': 'mincount:3', - 'subtitles': { - 'en': [{ - 'ext': 'ttml', - }], - }, - } - }, { - 'url': 'mva:11788:gfVXISmEB_6804984382', - 'only_matching': True, - }] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - mobj = self._match_valid_url(url) - course_id = mobj.group('course_id') - video_id = mobj.group('id') - - base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) - - settings = self._download_xml( - '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), - video_id, 'Downloading video settings XML') - - _, title = self._extract_chapter_and_title(xpath_text( - settings, './/Title', 'title', fatal=True)) - - formats = [] - - for sources in settings.findall('.//MediaSources'): - sources_type = sources.get('videoType') - for source in sources.findall('./MediaSource'): - video_url = source.text - if not video_url or not video_url.startswith('http'): - continue - if sources_type == 'smoothstreaming': - formats.extend(self._extract_ism_formats( - video_url, video_id, 'mss', fatal=False)) - continue - video_mode = source.get('videoMode') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) - codec = source.get('codec') - acodec, vcodec = [None] * 2 - if codec: - codecs = codec.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs - elif len(codecs) == 1: - vcodec = codecs[0] - formats.append({ - 'url': video_url, - 'format_id': video_mode, - 'height': height, - 'acodec': acodec, - 'vcodec': vcodec, - }) - - subtitles = {} - for source in settings.findall('.//MarkerResourceSource'): - subtitle_url = source.text - if not subtitle_url: - continue - subtitles.setdefault('en', []).append({ - 'url': '%s/%s' % (base_url, subtitle_url), - 'ext': source.get('type'), - }) - - return { - 'id': video_id, - 'title': title, - 'subtitles': subtitles, - 'formats': formats - } - - -class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): - IE_NAME = 'mva:course' - IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME - - _TESTS = [{ - 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'info_dict': { - 'id': '11788', - 'title': 'Microsoft Azure Fundamentals: Virtual Machines', - }, - 'playlist_count': 36, - }, { - # with emphasized chapters - 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', - 'info_dict': { - 'id': '16335', - 'title': 'Developing Windows 10 Games with Construct 2', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', - 'only_matching': True, - }, { - 'url': 'mva:course:11788', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MicrosoftVirtualAcademyIE.suitable(url) else super( - MicrosoftVirtualAcademyCourseIE, cls).suitable(url) - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - course_id = mobj.group('id') - display_id = mobj.group('display_id') - - base_url = self._extract_base_url(course_id, display_id) - - manifest = self._download_json( - 
'%s/imsmanifestlite.json' % base_url, - display_id, 'Downloading course manifest JSON')['manifest'] - - organization = manifest['organizations']['organization'][0] - - entries = [] - for chapter in organization['item']: - chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) - chapter_id = chapter.get('@identifier') - for item in chapter.get('item', []): - item_id = item.get('@identifier') - if not item_id: - continue - metadata = item.get('resource', {}).get('metadata') or {} - if metadata.get('learningresourcetype') != 'Video': - continue - _, title = self._extract_chapter_and_title(item.get('title')) - duration = parse_duration(metadata.get('duration')) - description = metadata.get('description') - entries.append({ - '_type': 'url_transparent', - 'url': smuggle_url( - 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), - 'title': title, - 'description': description, - 'duration': duration, - 'chapter': chapter_title, - 'chapter_number': chapter_number, - 'chapter_id': chapter_id, - }) - - title = organization.get('title') or manifest.get('metadata', {}).get('title') - - return self.playlist_result(entries, course_id, title) diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py index f64d575dcc..88a2b9e891 100644 --- a/yt_dlp/extractor/mildom.py +++ b/yt_dlp/extractor/mildom.py @@ -4,11 +4,11 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, + OnDemandPagedList, determine_ext, dict_get, - ExtractorError, float_or_none, - OnDemandPagedList, traverse_obj, ) @@ -18,7 +18,7 @@ class MildomBaseIE(InfoExtractor): def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): if not self._GUEST_ID: - self._GUEST_ID = f'pc-gp-{str(uuid.uuid4())}' + self._GUEST_ID = f'pc-gp-{uuid.uuid4()}' content = self._download_json( url, video_id, note=note, data=json.dumps(body).encode() if body else None, @@ -150,18 +150,18 @@ def _real_extract(self, url): 'protocol': 'm3u8_native', 'vcodec': 'none', 'acodec': 'aac', - 'ext': 'm4a' + 'ext': 'm4a', }] for fmt in autoplay['video_link']: formats.append({ - 'format_id': 'video-%s' % fmt['name'], + 'format_id': 'video-{}'.format(fmt['name']), 'url': fmt['url'], 'protocol': 'm3u8_native', 'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'], 'height': fmt['level'], 'vcodec': 'h264', 'acodec': 'aac', - 'ext': 'mp4' + 'ext': 'mp4', }) return { @@ -280,7 +280,7 @@ def _fetch_page(self, user_id, page): def _real_extract(self, url): user_id = self._match_id(url) - self.to_screen('This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/%s" instead' % user_id) + self.to_screen(f'This will download all VODs belonging to user. 
To download ongoing live video, use "https://www.mildom.com/{user_id}" instead') profile = self._call_api( 'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id, diff --git a/yt_dlp/extractor/minds.py b/yt_dlp/extractor/minds.py index 2fb17920cc..71c82f208e 100644 --- a/yt_dlp/extractor/minds.py +++ b/yt_dlp/extractor/minds.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, format_field, @@ -16,7 +15,7 @@ def _call_api(self, path, video_id, resource, query=None): api_url = 'https://www.minds.com/api/' + path token = self._get_cookies(api_url).get('XSRF-TOKEN') return self._download_json( - api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + api_url, video_id, f'Downloading {resource} JSON metadata', headers={ 'Referer': 'https://www.minds.com/', 'X-XSRF-TOKEN': token.value if token else '', }, query=query) @@ -98,7 +97,7 @@ def _real_extract(self, url): uploader_id = owner.get('username') tags = entity.get('tags') - if tags and isinstance(tags, compat_str): + if tags and isinstance(tags, str): tags = [tags] thumbnail = None @@ -106,7 +105,7 @@ def _real_extract(self, url): if poster: urlh = self._request_webpage(poster, video_id, fatal=False) if urlh: - thumbnail = urlh.geturl() + thumbnail = urlh.url return { 'id': video_id, @@ -135,8 +134,8 @@ def _entries(self, feed_id): i = 1 while True: data = self._call_api( - 'v2/feeds/container/%s/videos' % feed_id, - feed_id, 'page %s' % i, query) + f'v2/feeds/container/{feed_id}/videos', + feed_id, f'page {i}', query) entities = data.get('entities') or [] for entity in entities: guid = entity.get('guid') @@ -153,7 +152,7 @@ def _entries(self, feed_id): def _real_extract(self, url): feed_id = self._match_id(url) feed = self._call_api( - 'v1/%s/%s' % (self._FEED_PATH, feed_id), + f'v1/{self._FEED_PATH}/{feed_id}', feed_id, self._FEED_TYPE)[self._FEED_TYPE] return self.playlist_result( diff --git a/yt_dlp/extractor/ministrygrid.py b/yt_dlp/extractor/ministrygrid.py deleted file mode 100644 index 053c6726c3..0000000000 --- a/yt_dlp/extractor/ministrygrid.py +++ /dev/null @@ -1,55 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - smuggle_url, -) - - -class MinistryGridIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ministrygrid\.com/([^/?#]*/)*(?P<id>[^/#?]+)/?(?:$|[?#])' - - _TEST = { - 'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', - 'md5': '844be0d2a1340422759c2a9101bab017', - 'info_dict': { - 'id': '3453494717001', - 'ext': 'mp4', - 'title': 'The Gospel by Numbers', - 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20140410', - 'description': 'Coming soon from T4G 2014!', - 'uploader_id': '2034960640001', - 'timestamp': 1397145591, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['TDSLifeway'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - portlets = self._parse_json(self._search_regex( - r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list'), - video_id) - pl_id = self._search_regex( - r'getPlid:function\(\){return"(\d+)"}', webpage, 'p_l_id') - - for i, portlet in enumerate(portlets): - portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet) - portlet_code = self._download_webpage( - portlet_url, video_id, - note='Looking in portlet %s 
(%d/%d)' % (portlet, i + 1, len(portlets)), - fatal=False) - video_iframe_url = self._search_regex( - r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe', - default=None) - if video_iframe_url: - return self.url_result( - smuggle_url(video_iframe_url, {'force_videoid': video_id}), - video_id=video_id) - - raise ExtractorError('Could not find video iframe in any portlets') diff --git a/yt_dlp/extractor/minoto.py b/yt_dlp/extractor/minoto.py index 8d18179c78..69832560d0 100644 --- a/yt_dlp/extractor/minoto.py +++ b/yt_dlp/extractor/minoto.py @@ -12,7 +12,7 @@ def _real_extract(self, url): mobj = self._match_valid_url(url) player_id = mobj.group('player_id') or '1' video_id = mobj.group('id') - video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id) + video_data = self._download_json(f'http://play.minoto-video.com/{player_id}/{video_id}.js', video_id) video_metadata = video_data['video-metadata'] formats = [] for fmt in video_data['video-files']: @@ -21,7 +21,7 @@ def _real_extract(self, url): continue container = fmt.get('container') if container == 'hls': - formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(self._extract_m3u8_formats(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: fmt_profile = fmt.get('profile') or {} formats.append({ diff --git a/yt_dlp/extractor/miomio.py b/yt_dlp/extractor/miomio.py deleted file mode 100644 index a0a041ea54..0000000000 --- a/yt_dlp/extractor/miomio.py +++ /dev/null @@ -1,138 +0,0 @@ -import random - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - xpath_text, - int_or_none, - ExtractorError, - sanitized_Request, -) - - -class MioMioIE(InfoExtractor): - IE_NAME = 'miomio.tv' - _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)' - _TESTS = [{ - # "type=video" in flashvars - 'url': 'http://www.miomio.tv/watch/cc88912/', - 'info_dict': { - 'id': '88912', - 'ext': 'flv', - 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', - 'duration': 5923, - }, - 'skip': 'Unable to load videos', - }, { - 'url': 'http://www.miomio.tv/watch/cc184024/', - 'info_dict': { - 'id': '43729', - 'title': '《动漫同人插画绘制》', - }, - 'playlist_mincount': 86, - 'skip': 'Unable to load videos', - }, { - 'url': 'http://www.miomio.tv/watch/cc173113/', - 'info_dict': { - 'id': '173113', - 'title': 'The New Macbook 2015 上手试玩与简评' - }, - 'playlist_mincount': 2, - 'skip': 'Unable to load videos', - }, { - # new 'h5' player - 'url': 'http://www.miomio.tv/watch/cc273997/', - 'md5': '0b27a4b4495055d826813f8c3a6b2070', - 'info_dict': { - 'id': '273997', - 'ext': 'mp4', - 'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31', - }, - 'skip': 'Unable to load videos', - }] - - def _extract_mioplayer(self, webpage, video_id, title, http_headers): - xml_config = self._search_regex( - r'flashvars="type=(?:sina|video)&(.+?)&', - webpage, 'xml config') - - # skipping the following page causes lags and eventually connection drop-outs - self._request_webpage( - 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)), - video_id) - - vid_config_request = sanitized_Request( - 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - headers=http_headers) - - # the following xml contains the actual configuration information on the video file(s) - vid_config = self._download_xml(vid_config_request, video_id) - - if not int_or_none(xpath_text(vid_config, 
'timelength')): - raise ExtractorError('Unable to load videos!', expected=True) - - entries = [] - for f in vid_config.findall('./durl'): - segment_url = xpath_text(f, 'url', 'video url') - if not segment_url: - continue - order = xpath_text(f, 'order', 'order') - segment_id = video_id - segment_title = title - if order: - segment_id += '-%s' % order - segment_title += ' part %s' % order - entries.append({ - 'id': segment_id, - 'url': segment_url, - 'title': segment_title, - 'duration': int_or_none(xpath_text(f, 'length', 'duration'), 1000), - 'http_headers': http_headers, - }) - - return entries - - def _download_chinese_webpage(self, *args, **kwargs): - # Requests with English locales return garbage - headers = { - 'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3', - } - kwargs.setdefault('headers', {}).update(headers) - return self._download_webpage(*args, **kwargs) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_chinese_webpage( - url, video_id) - - title = self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - mioplayer_path = self._search_regex( - r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path') - - if '_h5' in mioplayer_path: - player_url = compat_urlparse.urljoin(url, mioplayer_path) - player_webpage = self._download_chinese_webpage( - player_url, video_id, - note='Downloading player webpage', headers={'Referer': url}) - entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) - http_headers = {'Referer': player_url} - else: - http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} - entries = self._extract_mioplayer(webpage, video_id, title, http_headers) - - if len(entries) == 1: - segment = entries[0] - segment['id'] = video_id - segment['title'] = title - segment['http_headers'] = http_headers - return segment - - return { - '_type': 'multi_video', - 'id': video_id, - 'entries': entries, - 'title': title, - 'http_headers': http_headers, - } diff --git a/yt_dlp/extractor/mirrativ.py b/yt_dlp/extractor/mirrativ.py index 0a8ee0c3a5..4e24371a22 100644 --- a/yt_dlp/extractor/mirrativ.py +++ b/yt_dlp/extractor/mirrativ.py @@ -11,7 +11,7 @@ class MirrativBaseIE(InfoExtractor): def assert_error(self, response): error_message = traverse_obj(response, ('status', 'error')) if error_message: - raise ExtractorError('Mirrativ says: %s' % error_message, expected=True) + raise ExtractorError(f'Mirrativ says: {error_message}', expected=True) class MirrativIE(MirrativBaseIE): @@ -42,7 +42,7 @@ class MirrativIE(MirrativBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://www.mirrativ.com/live/%s' % video_id, video_id) + webpage = self._download_webpage(f'https://www.mirrativ.com/live/{video_id}', video_id) live_response = self._download_json(f'https://www.mirrativ.com/api/live/live?live_id={video_id}', video_id) self.assert_error(live_response) @@ -102,7 +102,7 @@ def _entries(self, user_id): # or the service will ban your IP address for a while continue live_id = live.get('live_id') - url = 'https://www.mirrativ.com/live/%s' % live_id + url = f'https://www.mirrativ.com/live/{live_id}' yield self.url_result(url, video_id=live_id, video_title=live.get('title')) page = api_response.get('next_page') diff --git a/yt_dlp/extractor/mit.py b/yt_dlp/extractor/mit.py index 38cc0c2741..e75c540a23 100644 --- a/yt_dlp/extractor/mit.py +++ b/yt_dlp/extractor/mit.py @@ -1,11 +1,11 @@ -import re import json +import re from .common import 
InfoExtractor from .youtube import YoutubeIE from ..utils import ( - clean_html, ExtractorError, + clean_html, get_element_by_id, ) @@ -28,7 +28,7 @@ class TechTVMITIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) raw_page = self._download_webpage( - 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + f'http://techtv.mit.edu/videos/{video_id}', video_id) clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page) base_url = self._proto_relative_url(self._search_regex( @@ -79,7 +79,7 @@ class OCWMITIE(InfoExtractor): 'upload_date': '20121109', 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', - } + }, }, { 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/', @@ -91,8 +91,8 @@ class OCWMITIE(InfoExtractor): 'uploader_id': 'MIT', 'uploader': 'MIT OpenCourseWare', 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', - } - } + }, + }, ] def _real_extract(self, url): diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index 4be6947289..9b7c7b89b9 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -1,5 +1,13 @@ from .common import InfoExtractor -from ..utils import UserNotLive, traverse_obj +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + UserNotLive, + int_or_none, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj class MixchIE(InfoExtractor): @@ -7,17 +15,20 @@ class MixchIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://mixch.tv/u/16236849/live', + 'url': 'https://mixch.tv/u/16943797/live', 'skip': 'don\'t know if this live persists', 'info_dict': { - 'id': '16236849', - 'title': '24配信シェア⭕️投票🙏💦', - 'comment_count': 13145, - 'view_count': 28348, - 'timestamp': 1636189377, - 'uploader': '🦥伊咲👶🏻#フレアワ', - 'uploader_id': '16236849', - } + 'id': '16943797', + 'ext': 'mp4', + 'title': '#EntView #カリナ #セブチ 2024-05-05 06:58', + 'comment_count': int, + 'view_count': int, + 'timestamp': 1714726805, + 'uploader': 'Ent.View K-news🎶💕', + 'uploader_id': '16943797', + 'live_status': 'is_live', + 'upload_date': '20240503', + }, }, { 'url': 'https://mixch.tv/u/16137876/live', 'only_matching': True, @@ -25,31 +36,41 @@ class MixchIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id) - - initial_js_state = self._parse_json(self._search_regex( - r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) - if not initial_js_state.get('liveInfo'): + data = self._download_json(f'https://mixch.tv/api-web/users/{video_id}/live', video_id) + if not traverse_obj(data, ('liveInfo', {dict})): raise UserNotLive(video_id=video_id) return { 'id': video_id, - 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')), - 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')), - 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')), - 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')), - 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')), 'uploader_id': video_id, + **traverse_obj(data, { + 'title': ('liveInfo', 'title', {str}), + 'comment_count': ('liveInfo', 
'comments', {int_or_none}), + 'view_count': ('liveInfo', 'visitor', {int_or_none}), + 'timestamp': ('liveInfo', 'created', {int_or_none}), + 'uploader': ('broadcasterInfo', 'name', {str}), + }), 'formats': [{ 'format_id': 'hls', - 'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls')) - or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'), + 'url': data['liveInfo']['hls'], 'ext': 'mp4', 'protocol': 'm3u8', }], 'is_live': True, + '__post_extractor': self.extract_comments(video_id), } + def _get_comments(self, video_id): + yield from traverse_obj(self._download_json( + f'https://mixch.tv/api-web/lives/{video_id}/messages', video_id, + note='Downloading comments', errnote='Failed to download comments'), (..., { + 'author': ('name', {str}), + 'author_id': ('user_id', {str_or_none}), + 'id': ('message_id', {str}, {lambda x: x or None}), + 'text': ('body', {str}), + 'timestamp': ('created', {int}), + })) + class MixchArchiveIE(InfoExtractor): IE_NAME = 'mixch:archive' @@ -60,22 +81,38 @@ class MixchArchiveIE(InfoExtractor): 'skip': 'paid video, no DRM. expires at Jan 23', 'info_dict': { 'id': '421', + 'ext': 'mp4', 'title': '96NEKO SHOW TIME', - } + }, + }, { + 'url': 'https://mixch.tv/archive/1213', + 'skip': 'paid video, no DRM. expires at Dec 31, 2023', + 'info_dict': { + 'id': '1213', + 'ext': 'mp4', + 'title': '【特別トーク番組アーカイブス】Merm4id×燐舞曲 2nd LIVE「VERSUS」', + 'release_date': '20231201', + 'thumbnail': str, + }, + }, { + 'url': 'https://mixch.tv/archive/1214', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - html5_videos = self._parse_html5_media_entries( - url, webpage.replace('video-js', 'video'), video_id, 'hls') - if not html5_videos: - self.raise_login_required(method='cookies') - infodict = html5_videos[0] - infodict.update({ + try: + info_json = self._download_json( + f'https://mixch.tv/api-web/archive/{video_id}', video_id)['archive'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.raise_login_required() + raise + + return { 'id': video_id, - 'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title') - }) - - return infodict + 'title': traverse_obj(info_json, ('title', {str})), + 'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id), + 'thumbnail': traverse_obj(info_json, ('thumbnailURL', {url_or_none})), + } diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index fb5a08ca28..19b7fd4e70 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -1,12 +1,9 @@ +import base64 import itertools +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_ord, - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_ord from ..utils import ( ExtractorError, int_or_none, @@ -20,12 +17,12 @@ class MixcloudBaseIE(InfoExtractor): def _call_api(self, object_type, object_fields, display_id, username, slug=None): lookup_key = object_type + 'Lookup' return self._download_json( - 'https://www.mixcloud.com/graphql', display_id, query={ + 'https://app.mixcloud.com/graphql', display_id, query={ 'query': '''{ %s(lookup: {username: "%s"%s}) { %s } -}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields) +}''' % (lookup_key, username, f', slug: "{slug}"' if slug else '', object_fields), # noqa: UP031 })['data'][lookup_key] @@ -46,7 +43,15 @@ class 
MixcloudIE(MixcloudBaseIE):
             'view_count': int,
             'timestamp': 1321359578,
             'upload_date': '20111115',
+            'uploader_url': 'https://www.mixcloud.com/dholbach/',
+            'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills',
+            'duration': 3723,
+            'tags': [],
+            'comment_count': int,
+            'repost_count': int,
+            'like_count': int,
         },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
         'info_dict': {
@@ -60,7 +65,14 @@ class MixcloudIE(MixcloudBaseIE):
             'view_count': int,
             'timestamp': 1422987057,
             'upload_date': '20150203',
+            'uploader_url': 'https://www.mixcloud.com/gillespeterson/',
+            'duration': 2992,
+            'tags': [],
+            'comment_count': int,
+            'repost_count': int,
+            'like_count': int,
         },
+        'params': {'skip_download': '404 playback error on site'},
     }, {
         'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
         'only_matching': True,
@@ -76,8 +88,8 @@ def _decrypt_xor_cipher(key, ciphertext):
 
     def _real_extract(self, url):
         username, slug = self._match_valid_url(url).groups()
-        username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
-        track_id = '%s_%s' % (username, slug)
+        username, slug = urllib.parse.unquote(username), urllib.parse.unquote(slug)
+        track_id = f'{username}_{slug}'
 
         cloudcast = self._call_api('cloudcast', '''audioLength
     comments(first: 100) {
@@ -147,7 +159,7 @@ def _real_extract(self, url):
             if not format_url:
                 continue
             decrypted = self._decrypt_xor_cipher(
-                self._DECRYPTION_KEY, compat_b64decode(format_url))
+                self._DECRYPTION_KEY, base64.b64decode(format_url))
             if url_key == 'hlsUrl':
                 formats.extend(self._extract_m3u8_formats(
                     decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
@@ -185,7 +197,7 @@ def _real_extract(self, url):
 
         tags = []
         for t in cloudcast.get('tags'):
-            tag = try_get(t, lambda x: x['tag']['name'], compat_str)
+            tag = try_get(t, lambda x: x['tag']['name'], str)
             if not tag:
                 continue
             tags.append(tag)
@@ -198,7 +210,7 @@ def _real_extract(self, url):
             'title': title,
             'formats': formats,
             'description': cloudcast.get('description'),
-            'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str),
+            'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], str),
             'uploader': owner.get('displayName'),
             'timestamp': parse_iso8601(cloudcast.get('publishDate')),
             'uploader_id': owner.get('username'),
@@ -223,12 +235,12 @@ def _get_playlist_title(self, title, slug):
 
     def _real_extract(self, url):
         username, slug = self._match_valid_url(url).groups()
-        username = compat_urllib_parse_unquote(username)
+        username = urllib.parse.unquote(username)
         if not slug:
             slug = 'uploads'
         else:
-            slug = compat_urllib_parse_unquote(slug)
-        playlist_id = '%s_%s' % (username, slug)
+            slug = urllib.parse.unquote(slug)
+        playlist_id = f'{username}_{slug}'
 
         is_playlist_type = self._ROOT_TYPE == 'playlist'
         playlist_type = 'items' if is_playlist_type else slug
@@ -250,7 +262,7 @@ def _real_extract(self, url):
           endCursor
           hasNextPage
         }
-      }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE),
+      }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE),  # noqa: UP031
                 playlist_id, username, slug if is_playlist_type else None)
 
             items = playlist.get(playlist_type) or {}
@@ -259,15 +271,15 @@ def _real_extract(self, url):
             cloudcast_url = cloudcast.get('url')
             if not cloudcast_url:
                 continue
-            slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
-            owner_username 
= try_get(cloudcast, lambda x: x['owner']['username'], compat_str) - video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None + item_slug = try_get(cloudcast, lambda x: x['slug'], str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], str) + video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None entries.append(self.url_result( cloudcast_url, MixcloudIE.ie_key(), video_id)) page_info = items['pageInfo'] has_next_page = page_info['hasNextPage'] - list_filter = ', after: "%s"' % page_info['endCursor'] + list_filter = ', after: "{}"'.format(page_info['endCursor']) return self.playlist_result( entries, playlist_id, @@ -284,7 +296,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, }, { @@ -292,7 +304,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, }, { @@ -300,7 +312,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_favorites', 'title': 'Daniel Holbach (favorites)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, # 'params': { # 'playlist_items': '1-100', @@ -323,9 +335,9 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'FirstEar_stream', 'title': 'First Ear (stream)', - 'description': 'Curators of good music\r\n\r\nfirstearmusic.com', + 'description': 'we maraud for ears', }, - 'playlist_mincount': 271, + 'playlist_mincount': 269, }] _TITLE_KEY = 'displayName' @@ -336,7 +348,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): owner { username }''' def _get_playlist_title(self, title, slug): - return '%s (%s)' % (title, slug) + return f'{title} ({slug})' class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 72057dc97a..935bf85615 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -1,17 +1,23 @@ +import json import re -import urllib.parse +import time import uuid from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, determine_ext, int_or_none, join_nonempty, + jwt_decode_hs256, parse_duration, parse_iso8601, - traverse_obj, try_get, + url_or_none, + urlencode_postdata, ) +from ..utils.traversal import traverse_obj class MLBBaseIE(InfoExtractor): @@ -203,7 +209,7 @@ def _extract_mlb_subtitles(feed, language): def _download_video_data(self, display_id): return self._download_json( - 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id, + f'http://content.mlb.com/mlb/item/id/v1/{display_id}/details/web-v1.json', display_id) @@ -227,7 +233,7 @@ class MLBVideoIE(MLBBaseIE): @classmethod def suitable(cls, url): - return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url) + return False if MLBIE.suitable(url) else super().suitable(url) @staticmethod def _get_feed(video): @@ -268,83 +274,232 @@ def _download_video_data(self, display_id): timestamp title } -}''' % display_id, +}''' % display_id, # noqa: UP031 })['data']['mediaPlayback'][0] class MLBTVIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})' _NETRC_MACHINE = 'mlb' - _TESTS = [{ 'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638', 'info_dict': { 'id': '661581', 'ext': 'mp4', 'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies', + 'release_date': '20220702', + 'release_timestamp': 1656792300, }, - 'params': { - 'skip_download': True, + 'params': {'skip_download': 'm3u8'}, + }, { + # makeup game: has multiple dates, need to avoid games with 'rescheduleDate' + 'url': 'https://www.mlb.com/tv/g747039/vd22541c4-5a29-45f7-822b-635ec041cf5e', + 'info_dict': { + 'id': '747039', + 'ext': 'mp4', + 'title': '2024-07-29 - Toronto Blue Jays @ Baltimore Orioles', + 'release_date': '20240729', + 'release_timestamp': 1722280200, }, + 'params': {'skip_download': 'm3u8'}, }] + _GRAPHQL_INIT_QUERY = '''\ +mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) { + initSession(device: $device, clientType: $clientType, experience: $experience) { + deviceId + sessionId + entitlements { + code + } + location { + countryCode + regionName + zipCode + latitude + longitude + } + clientExperience + features + } + }''' + _GRAPHQL_PLAYBACK_QUERY = '''\ +mutation initPlaybackSession( + $adCapabilities: [AdExperienceType] + $mediaId: String! + $deviceId: String! + $sessionId: String! + $quality: PlaybackQuality + ) { + initPlaybackSession( + adCapabilities: $adCapabilities + mediaId: $mediaId + deviceId: $deviceId + sessionId: $sessionId + quality: $quality + ) { + playbackSessionId + playback { + url + token + expiration + cdn + } + } + }''' + _APP_VERSION = '7.8.2' + _device_id = None + _session_id = None _access_token = None + _token_expiry = 0 + + @property + def _api_headers(self): + if (self._token_expiry - 120) <= time.time(): + self.write_debug('Access token has expired; re-logging in') + self._perform_login(*self._get_login_info()) + return {'Authorization': f'Bearer {self._access_token}'} def _real_initialize(self): if not self._access_token: self.raise_login_required( 'All videos are only available to registered users', method='password') + def _set_device_id(self, username): + if not self._device_id: + self._device_id = self.cache.load( + self._NETRC_MACHINE, 'device_ids', default={}).get(username) + if self._device_id: + return + self._device_id = str(uuid.uuid4()) + self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id}) + def _perform_login(self, username, password): - data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356' - access_token = self._download_json( - 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, - headers={ - 'User-Agent': 'okhttp/3.12.1', - 'Content-Type': 'application/x-www-form-urlencoded' - }, data=data.encode())['access_token'] + try: + self._access_token = self._download_json( + 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, + 'Logging in', 'Unable to log in', headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Content-Type': 'application/x-www-form-urlencoded', + }, data=urlencode_postdata({ + 'grant_type': 'password', + 'username': username, + 'password': password, + 'scope': 'openid offline_access', + 'client_id': '0oa3e1nutA1HLzAKG356', + }))['access_token'] + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 400: + raise ExtractorError('Invalid username or password', 
+                raise ExtractorError('Invalid username or password', expected=True)
+            raise
 
-        entitlement = self._download_webpage(
-            f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={str(uuid.uuid4())}', None,
-            headers={
-                'User-Agent': 'okhttp/3.12.1',
-                'Authorization': f'Bearer {access_token}'
-            })
+        self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0
+        self._set_device_id(username)
 
-        data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv'
-        self._access_token = self._download_json(
-            'https://us.edge.bamgrid.com/token', None,
+        self._session_id = self._call_api({
+            'operationName': 'initSession',
+            'query': self._GRAPHQL_INIT_QUERY,
+            'variables': {
+                'device': {
+                    'appVersion': self._APP_VERSION,
+                    'deviceFamily': 'desktop',
+                    'knownDeviceId': self._device_id,
+                    'languagePreference': 'ENGLISH',
+                    'manufacturer': '',
+                    'model': '',
+                    'os': '',
+                    'osVersion': '',
+                },
+                'clientType': 'WEB',
+            },
+        }, None, 'session ID')['data']['initSession']['sessionId']
+
+    def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True):
+        return self._download_json(
+            'https://media-gateway.mlb.com/graphql', video_id,
+            f'Downloading {description}', f'Unable to download {description}', fatal=fatal, headers={
+                **self._api_headers,
                 'Accept': 'application/json',
-                'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk',
-                'Content-Type': 'application/x-www-form-urlencoded'
-            }, data=data.encode())['access_token']
+                'Content-Type': 'application/json',
+                'x-client-name': 'WEB',
+                'x-client-version': self._APP_VERSION,
+            }, data=json.dumps(data, separators=(',', ':')).encode())
+
+    def _extract_formats_and_subtitles(self, broadcast, video_id):
+        feed = traverse_obj(broadcast, ('homeAway', {str.title}))
+        medium = traverse_obj(broadcast, ('type', {str}))
+        language = traverse_obj(broadcast, ('language', {str.lower}))
+        format_id = join_nonempty(feed, medium, language)
+
+        response = self._call_api({
+            'operationName': 'initPlaybackSession',
+            'query': self._GRAPHQL_PLAYBACK_QUERY,
+            'variables': {
+                'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'],
+                'deviceId': self._device_id,
+                'mediaId': broadcast['mediaId'],
+                'quality': 'PLACEHOLDER',
+                'sessionId': self._session_id,
+            },
+        }, video_id, f'{format_id} broadcast JSON', fatal=False)
+
+        playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict}))
+        m3u8_url = traverse_obj(playback, ('url', {url_or_none}))
+        token = traverse_obj(playback, ('token', {str}))
+
+        if not (m3u8_url and token):
+            errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str})))
+            if 'not entitled' in errors:
+                raise ExtractorError(errors, expected=True)
+            elif errors:  # Only warn when 'blacked out' since radio formats are available
+                self.report_warning(f'API returned errors for {format_id}: {errors}')
+            else:
+                self.report_warning(f'No formats available for {format_id} broadcast; skipping')
+            return [], {}
+
+        cdn_headers = {'x-cdn-token': token}
+        fmts, subs = self._extract_m3u8_formats_and_subtitles(
+            m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4',
+            m3u8_id=format_id, fatal=False, headers=cdn_headers)
+        for fmt in fmts:
+            fmt['http_headers'] = cdn_headers
+            fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' '))
+            fmt.setdefault('language', language)
+            if fmt.get('vcodec') == 'none' and fmt['language'] == 'en':
+                fmt['source_preference'] = 10
+
+
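+        # NOTE: the CDN token is stripped from the m3u8 URL's path above and sent as the
+        # 'x-cdn-token' header instead; copying it into each format's 'http_headers' keeps
+        # segment downloads authorized. The 'source_preference' bump ranks the English
+        # audio-only (radio) feed above other languages when the video feed is blacked out.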
return fmts, subs def _real_extract(self, url): video_id = self._match_id(url) - airings = self._download_json( - f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D', - video_id)['data']['Airings'] + data = self._download_json( + 'https://statsapi.mlb.com/api/v1/schedule', video_id, query={ + 'gamePk': video_id, + 'hydrate': 'broadcasts(all),statusFlags', + }) + metadata = traverse_obj(data, ( + 'dates', ..., 'games', + lambda _, v: str(v['gamePk']) == video_id and not v.get('rescheduleDate'), any)) + + broadcasts = traverse_obj(metadata, ( + 'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF')) formats, subtitles = [], {} - for airing in airings: - m3u8_url = self._download_json( - airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id, - headers={ - 'Authorization': self._access_token, - 'Accept': 'application/vnd.media-service+json; version=2' - })['stream']['complete'] - f, s = self._extract_m3u8_formats_and_subtitles( - m3u8_url, video_id, 'mp4', m3u8_id=join_nonempty(airing.get('feedType'), airing.get('feedLanguage'))) - formats.extend(f) - self._merge_subtitles(s, target=subtitles) + for broadcast in broadcasts: + fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'id': video_id, - 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False), - 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE', + 'title': join_nonempty( + traverse_obj(metadata, ('officialDate', {str})), + traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})), + delim=' - '), + 'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON', + 'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})), 'formats': formats, 'subtitles': subtitles, - 'http_headers': {'Authorization': f'Bearer {self._access_token}'}, } @@ -355,11 +510,11 @@ class MLBArticleIE(InfoExtractor): 'info_dict': { 'id': '36db7394-343c-4ea3-b8ca-ead2e61bca9a', 'title': 'Machado\'s grab draws hilarious irate reaction', - 'modified_timestamp': 1650130737, + 'modified_timestamp': 1675888370, 'description': 'md5:a19d4eb0487b2cb304e9a176f6b67676', - 'modified_date': '20220416', + 'modified_date': '20230208', }, - 'playlist_count': 2, + 'playlist_mincount': 2, }] def _real_extract(self, url): @@ -367,15 +522,13 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) apollo_cache_json = self._search_json(r'window\.initState\s*=', webpage, 'window.initState', display_id)['apolloCache'] - content_data_id = traverse_obj( - apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getForgeContent'), 'id'), get_all=False) - - content_real_info = apollo_cache_json[content_data_id] + content_real_info = traverse_obj( + apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getArticle')), get_all=False) return self.playlist_from_matches( - traverse_obj(content_real_info, ('parts', lambda _, v: v['typename'] == 'Video', 'id')), - getter=lambda x: f'https://www.mlb.com/video/{apollo_cache_json[x]["slug"]}', - ie=MLBVideoIE, playlist_id=content_real_info.get('_translationId'), + traverse_obj(content_real_info, ('parts', lambda _, v: v['__typename'] == 'Video' or v['type'] == 
'video')), + getter=lambda x: f'https://www.mlb.com/video/{x["slug"]}', + ie=MLBVideoIE, playlist_id=content_real_info.get('translationId'), title=self._html_search_meta('og:title', webpage), description=content_real_info.get('summary'), modified_timestamp=parse_iso8601(content_real_info.get('lastUpdatedDate'))) diff --git a/yt_dlp/extractor/mlssoccer.py b/yt_dlp/extractor/mlssoccer.py index 9383f13581..a0b141ae82 100644 --- a/yt_dlp/extractor/mlssoccer.py +++ b/yt_dlp/extractor/mlssoccer.py @@ -3,7 +3,7 @@ class MLSSoccerIE(InfoExtractor): _VALID_DOMAINS = r'(?:(?:cfmontreal|intermiamicf|lagalaxy|lafc|houstondynamofc|dcunited|atlutd|mlssoccer|fcdallas|columbuscrew|coloradorapids|fccincinnati|chicagofirefc|austinfc|nashvillesc|whitecapsfc|sportingkc|soundersfc|sjearthquakes|rsl|timbers|philadelphiaunion|orlandocitysc|newyorkredbulls|nycfc)\.com|(?:torontofc)\.ca|(?:revolutionsoccer)\.net)' - _VALID_URL = r'https?://(?:www\.)?%s/video/#?(?P<id>[^/&$#?]+)' % _VALID_DOMAINS + _VALID_URL = rf'https?://(?:www\.)?{_VALID_DOMAINS}/video/#?(?P<id>[^/&$#?]+)' _TESTS = [{ 'url': 'https://www.mlssoccer.com/video/the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986#the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986', @@ -19,96 +19,97 @@ class MLSSoccerIE(InfoExtractor): 'tags': ['club/canada'], 'is_live': False, 'upload_date': '20211007', - 'filesize_approx': 255193528.83200002 + 'filesize_approx': 255193528.83200002, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://www.whitecapsfc.com/video/highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021#highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.torontofc.ca/video/highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733#highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.sportingkc.com/video/post-match-press-conference-john-pulskamp-oct-27-2021#post-match-press-conference-john-pulskamp-oct-27-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.soundersfc.com/video/highlights-seattle-sounders-fc-vs-sporting-kansas-city-october-23-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.sjearthquakes.com/video/#highlights-austin-fc-vs-san-jose-earthquakes-june-19-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.rsl.com/video/2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21#2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.timbers.com/video/highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose#highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.philadelphiaunion.com/video/highlights-torvphi', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.orlandocitysc.com/video/highlight-columbus-crew-vs-orlando-city-sc', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.newyorkredbulls.com/video/all-access-matchday-double-derby-week#all-access-matchday-double-derby-week', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.nycfc.com/video/highlights-nycfc-1-0-chicago-fire-fc#highlights-nycfc-1-0-chicago-fire-fc', - 
'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.revolutionsoccer.net/video/two-minute-highlights-revs-1-rapids-0-october-27-2021#two-minute-highlights-revs-1-rapids-0-october-27-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.nashvillesc.com/video/goal-c-j-sapong-nashville-sc-92nd-minute', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.cfmontreal.com/video/faits-saillants-tor-v-mtl#faits-saillants-orl-v-mtl-x5645', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.intermiamicf.com/video/all-access-victory-vs-nashville-sc-by-ukg#all-access-victory-vs-nashville-sc-by-ukg', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.lagalaxy.com/video/#moment-of-the-month-presented-by-san-manuel-casino-rayan-raveloson-scores-his-se', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.lafc.com/video/breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season#breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.houstondynamofc.com/video/postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660#postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.dcunited.com/video/tony-alfaro-my-family-pushed-me-to-believe-everything-was-possible', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.fcdallas.com/video/highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021#highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.columbuscrew.com/video/match-rewind-columbus-crew-vs-new-york-red-bulls-october-23-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.coloradorapids.com/video/postgame-reaction-robin-fraser-october-27#postgame-reaction-robin-fraser-october-27', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.fccincinnati.com/video/#keeping-cincy-chill-presented-by-coors-lite', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.chicagofirefc.com/video/all-access-fire-score-dramatic-road-win-in-cincy#all-access-fire-score-dramatic-road-win-in-cincy', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.austinfc.com/video/highlights-colorado-rapids-vs-austin-fc-september-29-2021#highlights-colorado-rapids-vs-austin-fc-september-29-2021', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.atlutd.com/video/goal-josef-martinez-scores-in-the-73rd-minute#goal-josef-martinez-scores-in-the-73rd-minute', - 'only_matching': True + 'only_matching': True, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), id)['videoList'][0] + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data_json = self._parse_json( + self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), video_id)['videoList'][0] return { - 'id': id, + 'id': video_id, '_type': 'url', - 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (data_json['accountId'], data_json['videoId']), + 'url': 
'https://players.brightcove.net/{}/default_default/index.html?videoId={}'.format(data_json['accountId'], data_json['videoId']), 'ie_key': 'BrightcoveNew', } diff --git a/yt_dlp/extractor/mnet.py b/yt_dlp/extractor/mnet.py deleted file mode 100644 index 98bab2e100..0000000000 --- a/yt_dlp/extractor/mnet.py +++ /dev/null @@ -1,85 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - parse_iso8601, -) - - -class MnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.mnet.com/tv/vod/171008', - 'info_dict': { - 'id': '171008', - 'title': 'SS_이해인@히든박스', - 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', - 'duration': 88, - 'upload_date': '20151231', - 'timestamp': 1451564040, - 'age_limit': 0, - 'thumbnails': 'mincount:5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'ext': 'flv', - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://mnet.interest.me/tv/vod/172790', - 'only_matching': True, - }, { - 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - # TODO: extract rtmp formats - # no stype -> rtmp url - # stype=H -> m3u8 url - # stype=M -> mpd url - info = self._download_json( - 'http://content.api.mnet.com/player/vodConfig', - video_id, 'Downloading vod config JSON', query={ - 'id': video_id, - 'ctype': 'CLIP', - 'stype': 'H', - })['data']['info'] - - title = info['title'] - - cdn_data = self._download_json( - info['cdn'], video_id, 'Downloading vod cdn JSON')['data'][0] - m3u8_url = cdn_data['url'] - token = cdn_data.get('token') - if token and token != '-': - m3u8_url += '?' 
+ token - formats = self._extract_wowza_formats( - m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp', 'f4m']) - - description = info.get('ment') - duration = parse_duration(info.get('time')) - timestamp = parse_iso8601(info.get('date'), delimiter=' ') - age_limit = info.get('adult') - if age_limit is not None: - age_limit = 0 if age_limit == 'N' else 18 - thumbnails = [{ - 'id': thumb_format, - 'url': thumb['url'], - 'width': int_or_none(thumb.get('width')), - 'height': int_or_none(thumb.get('height')), - } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'age_limit': age_limit, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/yt_dlp/extractor/mocha.py b/yt_dlp/extractor/mocha.py index 5f72b810bb..c3f4055bad 100644 --- a/yt_dlp/extractor/mocha.py +++ b/yt_dlp/extractor/mocha.py @@ -3,7 +3,7 @@ class MochaVideoIE(InfoExtractor): - _VALID_URL = r'https?://video.mocha.com.vn/(?P<video_slug>[\w-]+)' + _VALID_URL = r'https?://video\.mocha\.com\.vn/(?P<video_slug>[\w-]+)' _TESTS = [{ 'url': 'http://video.mocha.com.vn/chuyen-meo-gia-su-tu-thong-diep-cuoc-song-v18694039', 'info_dict': { @@ -20,8 +20,8 @@ class MochaVideoIE(InfoExtractor): 'timestamp': 1652254203, 'upload_date': '20220511', 'comment_count': int, - 'categories': ['Kids'] - } + 'categories': ['Kids'], + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/moevideo.py b/yt_dlp/extractor/moevideo.py deleted file mode 100644 index fda08cae91..0000000000 --- a/yt_dlp/extractor/moevideo.py +++ /dev/null @@ -1,74 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - int_or_none, -) - - -class MoeVideoIE(InfoExtractor): - IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net' - _VALID_URL = r'''(?x) - https?://(?P<host>(?:www\.)? 
- (?:(?:moevideo|playreplay|videochart)\.net|thesame\.tv))/ - (?:video|framevideo|embed)/(?P<id>[0-9a-z]+\.[0-9A-Za-z]+)''' - _API_URL = 'http://api.letitbit.net/' - _API_KEY = 'tVL0gjqo5' - _TESTS = [ - { - 'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29', - 'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a', - 'info_dict': { - 'id': '00297.0036103fe3d513ef27915216fd29', - 'ext': 'flv', - 'title': 'Sink cut out machine', - 'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8', - 'thumbnail': r're:^https?://.*\.jpg$', - 'width': 540, - 'height': 360, - 'duration': 179, - 'filesize': 17822500, - }, - 'skip': 'Video has been removed', - }, - { - 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a', - 'md5': '74f0a014d5b661f0f0e2361300d1620e', - 'info_dict': { - 'id': '77107.7f325710a627383d40540d8e991a', - 'ext': 'flv', - 'title': 'Operacion Condor.', - 'description': 'md5:7e68cb2fcda66833d5081c542491a9a3', - 'thumbnail': r're:^https?://.*\.jpg$', - 'width': 480, - 'height': 296, - 'duration': 6027, - 'filesize': 588257923, - }, - 'skip': 'Video has been removed', - }, - ] - - def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() - - webpage = self._download_webpage( - 'http://%s/video/%s' % (host, video_id), - video_id, 'Downloading webpage') - - title = self._og_search_title(webpage) - - embed_webpage = self._download_webpage( - 'http://%s/embed/%s' % (host, video_id), - video_id, 'Downloading embed webpage') - video = self._parse_json(self._search_regex( - r'mvplayer\("#player"\s*,\s*({.+})', - embed_webpage, 'mvplayer'), video_id)['video'] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': video.get('poster') or self._og_search_thumbnail(webpage), - 'description': clean_html(self._og_search_description(webpage)), - 'duration': int_or_none(self._og_search_property('video:duration', webpage)), - 'url': video['ourUrl'], - } diff --git a/yt_dlp/extractor/mofosex.py b/yt_dlp/extractor/mofosex.py deleted file mode 100644 index 9cb6980c1c..0000000000 --- a/yt_dlp/extractor/mofosex.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - int_or_none, - str_to_int, - unified_strdate, -) -from .keezmovies import KeezMoviesIE - - -class MofosexIE(KeezMoviesIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' - _TESTS = [{ - 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', - 'md5': '558fcdafbb63a87c019218d6e49daf8a', - 'info_dict': { - 'id': '318131', - 'display_id': 'amateur-teen-playing-and-masturbating-318131', - 'ext': 'mp4', - 'title': 'amateur teen playing and masturbating', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20121114', - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'age_limit': 18, - } - }, { - # This video is no longer available - 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - webpage, info = self._extract_info(url) - - view_count = str_to_int(self._search_regex( - r'VIEWS:</span>\s*([\d,.]+)', webpage, 'view count', fatal=False)) - like_count = int_or_none(self._search_regex( - r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage, - 'like count', fatal=False)) - dislike_count = int_or_none(self._search_regex( - r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage, - 'like count', fatal=False)) - upload_date = 
unified_strdate(self._html_search_regex( - r'Added:</span>([^<]+)', webpage, 'upload date', fatal=False)) - - info.update({ - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'upload_date': upload_date, - 'thumbnail': self._og_search_thumbnail(webpage), - }) - - return info - - -class MofosexEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)' - _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)'] - _TESTS = [{ - 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), - ie=MofosexIE.ie_key(), video_id=video_id) diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py index d47ad07421..6bc362a53c 100644 --- a/yt_dlp/extractor/mojvideo.py +++ b/yt_dlp/extractor/mojvideo.py @@ -17,7 +17,7 @@ class MojvideoIE(InfoExtractor): 'title': 'V avtu pred mano rdečelaska - Alfi Nipič', 'thumbnail': r're:^http://.*\.jpg$', 'duration': 242, - } + }, } def _real_extract(self, url): @@ -27,12 +27,12 @@ def _real_extract(self, url): # XML is malformed playerapi = self._download_webpage( - 'http://www.mojvideo.com/playerapi.php?v=%s&t=1' % video_id, display_id) + f'http://www.mojvideo.com/playerapi.php?v={video_id}&t=1', display_id) if '<error>true</error>' in playerapi: error_desc = self._html_search_regex( r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) + raise ExtractorError(f'{self.IE_NAME} said: {error_desc}', expected=True) title = self._html_extract_title(playerapi) video_url = self._html_search_regex( diff --git a/yt_dlp/extractor/monstercat.py b/yt_dlp/extractor/monstercat.py new file mode 100644 index 0000000000..930c13e278 --- /dev/null +++ b/yt_dlp/extractor/monstercat.py @@ -0,0 +1,77 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_class, + get_element_text_and_html_by_tag, + int_or_none, + strip_or_none, + traverse_obj, + try_call, + unified_strdate, +) + + +class MonstercatIE(InfoExtractor): + _VALID_URL = r'https?://www\.monstercat\.com/release/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.monstercat.com/release/742779548009', + 'playlist_count': 20, + 'info_dict': { + 'title': 'The Secret Language of Trees', + 'id': '742779548009', + 'thumbnail': 'https://www.monstercat.com/release/742779548009/cover', + 'release_date': '20230711', + 'album': 'The Secret Language of Trees', + 'album_artist': 'BT', + }, + }] + + def _extract_tracks(self, table, album_meta): + for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag + title = clean_html(try_call( + lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0])) + ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '') + track_id = ids.get('data-track-id') + release_id = ids.get('data-release-id') + + track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td))) + if not track_id or not release_id: + self.report_warning(f'Skipping track {track_number}, ID(s) not found') + 
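+                # Log the raw attribute values so the failure is easy to diagnose
+                # when the release page's markup changes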
self.write_debug(f'release_id={release_id!r} track_id={track_id!r}') + continue + yield { + **album_meta, + 'title': title, + 'track': title, + 'track_number': track_number, + 'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))), + 'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}', + 'id': track_id, + 'ext': 'mp3', + } + + def _real_extract(self, url): + url_id = self._match_id(url) + html = self._download_webpage(url, url_id) + # wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html + tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or '' + + title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0]) + date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block', + html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate})) + + album_meta = { + 'title': title, + 'album': title, + 'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover', + 'album_artist': try_call( + lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)), + 'release_date': date, + } + + return self.playlist_result( + self._extract_tracks(tracklist_table, album_meta), playlist_id=url_id, **album_meta) diff --git a/yt_dlp/extractor/morningstar.py b/yt_dlp/extractor/morningstar.py deleted file mode 100644 index e9fcfe3e20..0000000000 --- a/yt_dlp/extractor/morningstar.py +++ /dev/null @@ -1,45 +0,0 @@ -from .common import InfoExtractor - - -class MorningstarIE(InfoExtractor): - IE_DESC = 'morningstar.com' - _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' - _TESTS = [{ - 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', - 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', - 'info_dict': { - 'id': '615869', - 'ext': 'mp4', - 'title': 'Get Ahead of the Curve on 2013 Taxes', - 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", - 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' - } - }, { - 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title') - video_url = self._html_search_regex( - r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'<input type="hidden" id="hidSnapshot" value="([^"]+)"', - webpage, 'thumbnail', fatal=False) - description = self._html_search_regex( - r'<div id="mstarDeck".*?>(.*?)</div>', - webpage, 'description', fatal=False) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - 'thumbnail': thumbnail, - 'description': description, - } diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index c24ef9b0d1..86551950b7 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -1,32 +1,39 @@ -import datetime +import datetime as dt import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( ExtractorError, - InAdvancePagedList, - orderedSet, + OnDemandPagedList, + remove_end, str_to_int, unified_strdate, 
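+    # OnDemandPagedList replaces InAdvancePagedList below: the reworked extractors
+    # page through results lazily instead of scraping a total page count up front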
) class MotherlessIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/|G[VIG]?[A-F0-9]+/)?(?P<id>[A-F0-9]+)' _TESTS = [{ - 'url': 'http://motherless.com/AC3FFE1', - 'md5': '310f62e325a9fafe64f68c0bccb6e75f', + 'url': 'http://motherless.com/EE97006', + 'md5': 'cb5e7438f7a3c4e886b7bccc1292a3bc', 'info_dict': { - 'id': 'AC3FFE1', + 'id': 'EE97006', 'ext': 'mp4', - 'title': 'Fucked in the ass while playing PS3', - 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], - 'upload_date': '20100913', - 'uploader_id': 'famouslyfuckedup', + 'title': 'Dogging blond Brit getting glazed (comp)', + 'categories': ['UK', 'slag', 'whore', 'dogging', 'cunt', 'cumhound', 'big tits', 'Pearl Necklace'], + 'upload_date': '20230519', + 'uploader_id': 'deathbird', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, - } + 'comment_count': int, + 'view_count': int, + 'like_count': int, + }, + 'params': { + # Incomplete cert chains + 'nocheckcertificate': True, + }, }, { 'url': 'http://motherless.com/532291B', 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', @@ -49,16 +56,36 @@ class MotherlessIE(InfoExtractor): 'id': '633979F', 'ext': 'mp4', 'title': 'Turtlette', - 'categories': ['superheroine heroine superher'], + 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, - } + 'like_count': int, + 'comment_count': int, + 'view_count': int, + }, + 'params': { + 'nocheckcertificate': True, + }, }, { - # no keywords 'url': 'http://motherless.com/8B4BBC1', - 'only_matching': True, + 'info_dict': { + 'id': '8B4BBC1', + 'ext': 'mp4', + 'title': 'VIDEO00441.mp4', + 'categories': [], + 'upload_date': '20160214', + 'uploader_id': 'NMWildGirl', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + 'like_count': int, + 'comment_count': int, + 'view_count': int, + }, + 'params': { + 'nocheckcertificate': True, + }, }, { # see https://motherless.com/videos/recent for recent videos with # uploaded date in "ago" format @@ -72,9 +99,12 @@ class MotherlessIE(InfoExtractor): 'uploader_id': 'anonymous', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, + 'like_count': int, + 'comment_count': int, + 'view_count': int, }, 'params': { - 'skip_download': True, + 'nocheckcertificate': True, }, }] @@ -85,10 +115,10 @@ def _real_extract(self, url): if any(p in webpage for p in ( '<title>404 - MOTHERLESS.COM<', ">The page you're looking for cannot be found.<")): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) + raise ExtractorError(f'Video {video_id} does not exist', expected=True) if '>The content you are trying to view is for friends only.' 
in webpage: - raise ExtractorError('Video %s is for friends only' % video_id, expected=True) + raise ExtractorError(f'Video {video_id} is for friends only', expected=True) title = self._html_search_regex( (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>', @@ -97,7 +127,7 @@ def _real_extract(self, url): (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'), webpage, 'video URL', default=None, group='url') - or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) + or f'http://cdn4.videos.motherlessmedia.com/videos/{video_id}.mp4?fs=opencloud') age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'), @@ -121,17 +151,15 @@ def _real_extract(self, url): 'd': 'days', } kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} - upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') + upload_date = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''', r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id', fatal=False) - - categories = self._html_search_meta('keywords', webpage, default=None) - if categories: - categories = [cat.strip() for cat in categories.split(',')] + categories = self._html_search_meta('keywords', webpage, default='') + categories = [cat.strip() for cat in categories.split(',') if cat.strip()] return { 'id': video_id, @@ -148,102 +176,122 @@ def _real_extract(self, url): } -class MotherlessGroupIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' +class MotherlessPaginatedIE(InfoExtractor): + _EXTRA_QUERY = {} + _PAGE_SIZE = 60 + + def _correct_path(self, url, item_id): + raise NotImplementedError('This method must be implemented by subclasses') + + def _extract_entries(self, webpage, base): + for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)', + webpage): + video_url = urllib.parse.urljoin(base, mobj.group('href')) + video_id = MotherlessIE.get_temp_id(video_url) + + if video_id: + yield self.url_result(video_url, MotherlessIE, video_id, mobj.group('title')) + + def _real_extract(self, url): + item_id = self._match_id(url) + real_url = self._correct_path(url, item_id) + webpage = self._download_webpage(real_url, item_id, 'Downloading page 1') + + def get_page(idx): + page = idx + 1 + current_page = webpage if not idx else self._download_webpage( + real_url, item_id, note=f'Downloading page {page}', query={'page': page, **self._EXTRA_QUERY}) + yield from self._extract_entries(current_page, real_url) + + return self.playlist_result( + OnDemandPagedList(get_page, self._PAGE_SIZE), item_id, + remove_end(self._html_extract_title(webpage), ' | MOTHERLESS.COM ™')) + + +class MotherlessGroupIE(MotherlessPaginatedIE): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/g[vifm]?/(?P<id>[a-z0-9_]+)/?(?:$|[#?])' _TESTS = [{ - 'url': 'http://motherless.com/g/movie_scenes', + 'url': 'http://motherless.com/gv/movie_scenes', 'info_dict': { 'id': 'movie_scenes', - 'title': 'Movie Scenes', - 'description': 'Hot and sexy scenes from "regular" movies... ' - 'Beautiful actresses fully nude... 
A looot of ' - 'skin! :)Enjoy!', + 'title': 'Movie Scenes - Videos - Hot and sexy scenes from "regular" movies... Beautiful actresses fully', }, - 'playlist_mincount': 662, + 'playlist_mincount': 540, }, { - 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'url': 'http://motherless.com/g/sex_must_be_funny', 'info_dict': { 'id': 'sex_must_be_funny', 'title': 'Sex must be funny', - 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' - 'any kind!' }, - 'playlist_mincount': 0, - 'expected_warnings': [ - 'This group has no videos.', - ] + 'playlist_count': 0, }, { - 'url': 'https://motherless.com/g/beautiful_cock', + 'url': 'https://motherless.com/gv/beautiful_cock', 'info_dict': { 'id': 'beautiful_cock', 'title': 'Beautiful Cock', - 'description': 'Group for lovely cocks yours, mine, a friends anything human', }, - 'playlist_mincount': 2500, + 'playlist_mincount': 2040, }] - @classmethod - def suitable(cls, url): - return (False if MotherlessIE.suitable(url) - else super(MotherlessGroupIE, cls).suitable(url)) + def _correct_path(self, url, item_id): + return urllib.parse.urljoin(url, f'/gv/{item_id}') - def _extract_entries(self, webpage, base): - entries = [] - for mobj in re.finditer( - r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', - webpage): - video_url = compat_urlparse.urljoin(base, mobj.group('href')) - if not MotherlessIE.suitable(video_url): - continue - video_id = MotherlessIE._match_id(video_url) - title = mobj.group('title') - entries.append(self.url_result( - video_url, ie=MotherlessIE.ie_key(), video_id=video_id, - video_title=title)) - # Alternative fallback - if not entries: - entries = [ - self.url_result( - compat_urlparse.urljoin(base, '/' + entry_id), - ie=MotherlessIE.ie_key(), video_id=entry_id) - for entry_id in orderedSet(re.findall( - r'data-codename=["\']([A-Z0-9]+)', webpage))] - return entries - def _real_extract(self, url): - group_id = self._match_id(url) - page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) - webpage = self._download_webpage(page_url, group_id) - title = self._search_regex( - r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) - description = self._html_search_meta( - 'description', webpage, fatal=False) - page_count = str_to_int(self._search_regex( - r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', - webpage, 'page_count', default=0)) - if not page_count: - message = self._search_regex( - r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', - webpage, 'error_msg', default=None) or 'This group has no videos.' 
- self.report_warning(message, group_id) - page_count = 1 - PAGE_SIZE = 80 +class MotherlessGalleryIE(MotherlessPaginatedIE): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/G[VIG]?(?P<id>[A-F0-9]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://motherless.com/GV338999F', + 'info_dict': { + 'id': '338999F', + 'title': 'Random', + }, + 'playlist_mincount': 171, + }, { + 'url': 'https://motherless.com/GVABD6213', + 'info_dict': { + 'id': 'ABD6213', + 'title': 'Cuties', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://motherless.com/GVBCF7622', + 'info_dict': { + 'id': 'BCF7622', + 'title': 'Vintage', + }, + 'playlist_count': 0, + }, { + 'url': 'https://motherless.com/G035DE2F', + 'info_dict': { + 'id': '035DE2F', + 'title': 'General', + }, + 'playlist_mincount': 420, + }] - def _get_page(idx): - if idx > 0: - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) - for entry in self._extract_entries(webpage, url): - yield entry + def _correct_path(self, url, item_id): + return urllib.parse.urljoin(url, f'/GV{item_id}') - playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) - return { - '_type': 'playlist', - 'id': group_id, - 'title': title, - 'description': description, - 'entries': playlist - } +class MotherlessUploaderIE(MotherlessPaginatedIE): + _VALID_URL = r'https?://(?:www\.)?motherless\.com/u/(?P<id>\w+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://motherless.com/u/Mrgo4hrs2023', + 'info_dict': { + 'id': 'Mrgo4hrs2023', + 'title': "Mrgo4hrs2023's Uploads - Videos", + }, + 'playlist_mincount': 32, + }, { + 'url': 'https://motherless.com/u/Happy_couple?t=v', + 'info_dict': { + 'id': 'Happy_couple', + 'title': "Happy_couple's Uploads - Videos", + }, + 'playlist_mincount': 8, + }] + + _EXTRA_QUERY = {'t': 'v'} + + def _correct_path(self, url, item_id): + return urllib.parse.urljoin(url, f'/u/{item_id}?t=v') diff --git a/yt_dlp/extractor/motorsport.py b/yt_dlp/extractor/motorsport.py index efb087d035..0178367105 100644 --- a/yt_dlp/extractor/motorsport.py +++ b/yt_dlp/extractor/motorsport.py @@ -1,10 +1,10 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) class MotorsportIE(InfoExtractor): + _WORKING = False IE_DESC = 'motorsport.com' _VALID_URL = r'https?://(?:www\.)?motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { @@ -18,7 +18,7 @@ class MotorsportIE(InfoExtractor): 'uploader': 'mcomstaff', 'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ', 'upload_date': '20140903', - 'thumbnail': r're:^https?://.+\.jpg$' + 'thumbnail': r're:^https?://.+\.jpg$', }, 'add_ie': ['Youtube'], 'params': { @@ -39,7 +39,7 @@ def _real_extract(self, url): return self.url_result(iframe_path) iframe = self._download_webpage( - compat_urlparse.urljoin(url, iframe_path), display_id, + urllib.parse.urljoin(url, iframe_path), display_id, 'Downloading iframe') youtube_id = self._search_regex( r'www.youtube.com/embed/(.{11})', iframe, 'youtube id') @@ -47,5 +47,5 @@ def _real_extract(self, url): return { '_type': 'url_transparent', 'display_id': display_id, - 'url': 'https://youtube.com/watch?v=%s' % youtube_id, + 'url': f'https://youtube.com/watch?v={youtube_id}', } diff --git a/yt_dlp/extractor/movieclips.py b/yt_dlp/extractor/movieclips.py deleted file mode 100644 index 4777f440e0..0000000000 --- a/yt_dlp/extractor/movieclips.py +++ /dev/null @@ -1,46 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - smuggle_url, - 
float_or_none, - parse_iso8601, - update_url_query, -) - - -class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?movieclips\.com/videos/.+-(?P<id>\d+)(?:\?|$)' - _TEST = { - 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597', - 'md5': '42b5a0352d4933a7bd54f2104f481244', - 'info_dict': { - 'id': 'pKIGmG83AqD9', - 'ext': 'mp4', - 'title': 'Warcraft Trailer 1', - 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1446843055, - 'upload_date': '20151106', - 'uploader': 'Movieclips', - }, - 'add_ie': ['ThePlatform'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video = next(v for v in self._parse_json(self._search_regex( - r'var\s+__REACT_ENGINE__\s*=\s*({.+});', - webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id) - - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query( - video['contentUrl'], {'mbr': 'true'}), {'force_smil_url': True}), - 'title': self._og_search_title(webpage), - 'description': self._html_search_meta('description', webpage), - 'duration': float_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('dateCreated')), - 'thumbnail': video.get('defaultImage'), - 'uploader': video.get('provider'), - } diff --git a/yt_dlp/extractor/moviepilot.py b/yt_dlp/extractor/moviepilot.py index 668c0984eb..ed5be4fa65 100644 --- a/yt_dlp/extractor/moviepilot.py +++ b/yt_dlp/extractor/moviepilot.py @@ -1,5 +1,5 @@ -from .dailymotion import DailymotionIE from .common import InfoExtractor +from .dailymotion import DailymotionIE class MoviepilotIE(InfoExtractor): @@ -14,7 +14,7 @@ class MoviepilotIE(InfoExtractor): 'display_id': 'interstellar-2', 'ext': 'mp4', 'title': 'Interstellar', - 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaV-q1ZganMw4HVXg/x1080', + 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaV-q1.*/x1080', 'timestamp': 1605010596, 'description': 'md5:0ae9cb452af52610c9ffc60f2fd0474c', 'uploader': 'Moviepilot', @@ -71,7 +71,7 @@ class MoviepilotIE(InfoExtractor): 'age_limit': 0, 'duration': 82, 'upload_date': '20201109', - 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1Zg3lxLv9j5u/x1080', + 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1Z.*/x1080', 'uploader': 'Moviepilot', 'like_count': int, 'view_count': int, @@ -92,6 +92,6 @@ def _real_extract(self, url): 'ie_key': DailymotionIE.ie_key(), 'display_id': video_id, 'title': clip.get('title'), - 'url': f'https://www.dailymotion.com/video/{clip["videoRemoteId"]}', + 'url': f'https://www.dailymotion.com/video/{clip["video"]["remoteId"]}', 'description': clip.get('summary'), } diff --git a/yt_dlp/extractor/moview.py b/yt_dlp/extractor/moview.py index 678b2eb06e..560154e1a1 100644 --- a/yt_dlp/extractor/moview.py +++ b/yt_dlp/extractor/moview.py @@ -16,7 +16,7 @@ class MoviewPlayIE(JixieBaseIE): 'title': 'Candy Monster', 'description': 'Mengapa Candy Monster ingin mengambil permen Chloe?', 'thumbnail': 'https://video.jixie.media/1034/146182/146182_1280x720.jpg', - } + }, }, { # non-drm hls 'url': 'https://www.moview.id/play/75/Paris-Van-Java-Episode-16', @@ -29,8 +29,8 @@ class MoviewPlayIE(JixieBaseIE): 'thumbnail': 'https://video.jixie.media/1003/28210/28210_1280x720.jpg', 'description': 'md5:2a5e18d98eef9b39d7895029cac96c63', 'title': 'Paris Van Java Episode 16', - } - } + }, + }, ] 
def _real_extract(self, url): diff --git a/yt_dlp/extractor/moviezine.py b/yt_dlp/extractor/moviezine.py index cffcdcfb56..331a56234b 100644 --- a/yt_dlp/extractor/moviezine.py +++ b/yt_dlp/extractor/moviezine.py @@ -20,7 +20,7 @@ def _real_extract(self, url): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player') + jsplayer = self._download_webpage(f'http://www.moviezine.se/api/player.js?video={video_id}', video_id, 'Downloading js api player') formats = [{ 'format_id': 'sd', diff --git a/yt_dlp/extractor/movingimage.py b/yt_dlp/extractor/movingimage.py index cdd8ba4dce..7b1c7979ce 100644 --- a/yt_dlp/extractor/movingimage.py +++ b/yt_dlp/extractor/movingimage.py @@ -1,7 +1,7 @@ from .common import InfoExtractor from ..utils import ( - unescapeHTML, parse_duration, + unescapeHTML, ) @@ -31,7 +31,7 @@ def _real_extract(self, url): def search_field(field_name, fatal=False): return self._search_regex( - r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name, + rf'<span\s+class="field_title">{field_name}:</span>\s*<span\s+class="field_content">([^<]+)</span>', webpage, 'title', fatal=fatal) title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]') diff --git a/yt_dlp/extractor/msn.py b/yt_dlp/extractor/msn.py index f91c53eba1..dd864952c4 100644 --- a/yt_dlp/extractor/msn.py +++ b/yt_dlp/extractor/msn.py @@ -1,16 +1,16 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, unescapeHTML, ) class MSNIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d', @@ -138,7 +138,7 @@ def _real_extract(self, url): format_code = file_.get('formatCode') if not format_url or not format_code: continue - if compat_str(format_code) == '3100': + if str(format_code) == '3100': subtitles.setdefault(file_.get('culture', 'en'), []).append({ 'ext': determine_ext(format_url, 'ttml'), 'url': format_url, @@ -162,6 +162,6 @@ def _real_extract(self, url): error = unescapeHTML(self._search_regex( r'data-error=(["\'])(?P<error>.+?)\1', webpage, 'error', group='error')) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + raise ExtractorError(f'{self.IE_NAME} said: {error}', expected=True) return self.playlist_result(entries, page_id) diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index d91be62700..34e015dfcd 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -1,17 +1,16 @@ import re +import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import compat_str +from ..networking import HEADRequest, Request from ..utils import ( ExtractorError, + RegexNotFoundError, find_xpath_attr, fix_xml_ampersands, float_or_none, - HEADRequest, int_or_none, join_nonempty, - RegexNotFoundError, - sanitized_Request, strip_or_none, timeconvert, try_get, @@ -23,7 +22,7 @@ def _media_xml_tag(tag): - return '{http://search.yahoo.com/mrss/}%s' % tag + return f'{{http://search.yahoo.com/mrss/}}{tag}' class MTVServicesInfoExtractor(InfoExtractor): @@ -43,7 +42,7 @@ def _get_feed_url(self, uri, url=None): return self._FEED_URL def 
_get_thumbnail_url(self, uri, itemdoc): - search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + search_path = '{}/{}'.format(_media_xml_tag('group'), _media_xml_tag('thumbnail')) thumb_node = itemdoc.find(search_path) if thumb_node is None: return None @@ -51,17 +50,17 @@ def _get_thumbnail_url(self, uri, itemdoc): def _extract_mobile_video_formats(self, mtvn_id): webpage_url = self._MOBILE_TEMPLATE % mtvn_id - req = sanitized_Request(webpage_url) + req = Request(webpage_url) # Otherwise we get a webpage that would execute some javascript - req.add_header('User-Agent', 'curl/7') + req.headers['User-Agent'] = 'curl/7' webpage = self._download_webpage(req, mtvn_id, 'Downloading mobile page') metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url')) req = HEADRequest(metrics_url) response = self._request_webpage(req, mtvn_id, 'Resolving url') - url = response.geturl() + url = response.url # Transform the url to get the best quality: - url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1) + url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, count=1) return [{'url': url, 'ext': 'mp4'}] def _extract_video_formats(self, mdoc, mtvn_id, video_id): @@ -87,7 +86,7 @@ def _extract_video_formats(self, mdoc, mtvn_id, video_id): rtmp_video_url = rendition.find('./src').text if 'error_not_available.swf' in rtmp_video_url: raise ExtractorError( - '%s said: video is not available' % self.IE_NAME, + f'{self.IE_NAME} said: video is not available', expected=True) if rtmp_video_url.endswith('siteunavail.png'): continue @@ -118,8 +117,8 @@ def _extract_subtitles(self, mdoc, mtvn_id): if ext == 'cea-608': ext = 'scc' subtitles.setdefault(lang, []).append({ - 'url': compat_str(sub_src), - 'ext': ext + 'url': str(sub_src), + 'ext': ext, }) return subtitles @@ -127,7 +126,7 @@ def _get_video_info(self, itemdoc, use_hls=True): uri = itemdoc.find('guid').text video_id = self._id_from_uri(uri) self.report_extraction(video_id) - content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) + content_el = itemdoc.find('{}/{}'.format(_media_xml_tag('group'), _media_xml_tag('content'))) mediagen_url = self._remove_template_parameter(content_el.attrib['url']) mediagen_url = mediagen_url.replace('device={device}', '') if 'acceptMethods' not in mediagen_url: @@ -138,14 +137,14 @@ def _get_video_info(self, itemdoc, use_hls=True): mediagen_doc = self._download_xml( mediagen_url, video_id, 'Downloading video urls', fatal=False) - if mediagen_doc is False: + if not isinstance(mediagen_doc, xml.etree.ElementTree.Element): return None item = mediagen_doc.find('./video/item') if item is not None and item.get('type') == 'text': - message = '%s returned error: ' % self.IE_NAME + message = f'{self.IE_NAME} returned error: ' if item.get('code') is not None: - message += '%s - ' % item.get('code') + message += '{} - '.format(item.get('code')) message += item.text raise ExtractorError(message, expected=True) @@ -184,7 +183,7 @@ def _get_video_info(self, itemdoc, use_hls=True): episode = episode.text if episode is not None else None if season and episode: # episode number includes season, so remove it - episode = re.sub(r'^%s' % season, '', episode) + episode = re.sub(rf'^{season}', '', episode) # This a short id that's used in the webpage urls mtvn_id = None @@ -255,7 +254,7 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): 
feed_url = try_get( triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'], - compat_str) + str) if not feed_url: return @@ -263,7 +262,7 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): if not feed: return - return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + return try_get(feed, lambda x: x['result']['data']['id'], str) @staticmethod def _extract_child_with_type(parent, t): @@ -320,8 +319,7 @@ def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) mgid = self._extract_mgid(webpage) - videos_info = self._get_videos_info(mgid, url=url) - return videos_info + return self._get_videos_info(mgid, url=url) class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): @@ -346,7 +344,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri, url=None): video_id = self._id_from_uri(uri) config = self._download_json( - 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) + f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge', video_id) return self._remove_template_parameter(config['feedWithQueryParams']) def _real_extract(self, url): @@ -443,14 +441,15 @@ def _real_extract(self, url): r'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";', webpage) if m_vevo: vevo_id = m_vevo.group(1) - self.to_screen('Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + self.to_screen(f'Vevo video detected: {vevo_id}') + return self.url_result(f'vevo:{vevo_id}', ie='Vevo') uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) class MTVDEIE(MTVServicesInfoExtractor): + _WORKING = False IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)' _TESTS = [{ @@ -576,9 +575,9 @@ class MTVItaliaProgrammaIE(MTVItaliaIE): # XXX: Do not subclass from concrete I def _get_entries(self, title, url): while True: pg = self._search_regex(r'/(\d+)$', url, 'entries', '1') - entries = self._download_json(url, title, 'page %s' % pg) + entries = self._download_json(url, title, f'page {pg}') url = try_get( - entries, lambda x: x['result']['nextPageURL'], compat_str) + entries, lambda x: x['result']['nextPageURL'], str) entries = try_get( entries, ( lambda x: x['result']['data']['items'], @@ -597,15 +596,15 @@ def _real_extract(self, url): info = self._download_json(info_url, video_id).get('manifest') redirect = try_get( - info, lambda x: x['newLocation']['url'], compat_str) + info, lambda x: x['newLocation']['url'], str) if redirect: return self.url_result(redirect) title = info.get('title') video_id = try_get( - info, lambda x: x['reporting']['itemId'], compat_str) + info, lambda x: x['reporting']['itemId'], str) parent_id = try_get( - info, lambda x: x['reporting']['parentId'], compat_str) + info, lambda x: x['reporting']['parentId'], str) playlist_url = current_url = None for z in (info.get('zones') or {}).values(): @@ -629,15 +628,15 @@ def _real_extract(self, url): info, ( lambda x: x['title'], lambda x: x['headline']), - compat_str) - description = try_get(info, lambda x: x['content'], compat_str) + str) + description = try_get(info, lambda x: x['content'], str) if current_url: season = try_get( self._download_json(playlist_url, video_id, 'Seasons info'), lambda x: x['result']['data'], dict) current = try_get( - season, lambda x: x['currentSeason'], compat_str) + season, lambda x: 
x['currentSeason'], str) seasons = try_get( season, lambda x: x['seasons'], list) or [] diff --git a/yt_dlp/extractor/muenchentv.py b/yt_dlp/extractor/muenchentv.py index 36a2d4688e..5d2dd392b1 100644 --- a/yt_dlp/extractor/muenchentv.py +++ b/yt_dlp/extractor/muenchentv.py @@ -9,6 +9,7 @@ class MuenchenTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?muenchen\.tv/livestream' IE_DESC = 'münchen.tv' _TEST = { @@ -19,11 +20,11 @@ class MuenchenTVIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, - 'thumbnail': r're:^https?://.*\.jpg$' + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, - } + }, } def _real_extract(self, url): @@ -46,12 +47,12 @@ def _real_extract(self, url): ext = determine_ext(s['file'], None) label_str = s.get('label') if label_str is None: - label_str = '_%d' % format_num + label_str = f'_{format_num}' if ext is None: format_id = label_str else: - format_id = '%s-%s' % (ext, label_str) + format_id = f'{ext}-{label_str}' formats.append({ 'url': s['file'], diff --git a/yt_dlp/extractor/murrtube.py b/yt_dlp/extractor/murrtube.py index 6cdbbda168..9067b8781e 100644 --- a/yt_dlp/extractor/murrtube.py +++ b/yt_dlp/extractor/murrtube.py @@ -5,9 +5,14 @@ from ..utils import ( ExtractorError, OnDemandPagedList, - determine_ext, - int_or_none, - try_get, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + parse_count, + remove_end, + update_url, + urlencode_postdata, ) @@ -15,28 +20,88 @@ class MurrtubeIE(InfoExtractor): _VALID_URL = r'''(?x) (?: murrtube:| - https?://murrtube\.net/videos/(?P<slug>[a-z0-9\-]+)\- + https?://murrtube\.net/(?:v/|videos/(?P<slug>[a-z0-9-]+?)-) ) - (?P<id>[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}) + (?P<id>[A-Z0-9]{4}|[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) ''' - _TEST = { + _TESTS = [{ 'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0', - 'md5': '169f494812d9a90914b42978e73aa690', + 'md5': '70380878a77e8565d4aea7f68b8bbb35', 'info_dict': { - 'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0', + 'id': 'ca885d8456b95de529b6723b158032e11115d', 'ext': 'mp4', 'title': 'Inferno X Skyler', 'description': 'Humping a very good slutty sheppy (roomate)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 284, 'uploader': 'Inferno Wolf', 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/ekbs3zcfvuynnqfx72nn2tkokvsd', 'comment_count': int, 'view_count': int, 'like_count': int, - 'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'], + }, + }, { + 'url': 'https://murrtube.net/v/0J2Q', + 'md5': '31262f6ac56f0ca75e5a54a0f3fefcb6', + 'info_dict': { + 'id': '8442998c52134968d9caa36e473e1a6bac6ca', + 'ext': 'mp4', + 'uploader': 'Hayel', + 'title': 'Who\'s in charge now?', + 'description': 'md5:795791e97e5b0f1805ea84573f02a997', + 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/fb1ojjwiucufp34ya6hxu5vfqi5s', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + }, + }] + + def _extract_count(self, name, html): + return parse_count(self._search_regex( + rf'([\d,]+)\s+<span[^>]*>{name}</span>', html, name, default=None)) + + def _real_initialize(self): + homepage = self._download_webpage( + 'https://murrtube.net', None, note='Getting session token') + self._request_webpage( + 'https://murrtube.net/accept_age_check', None, 'Setting age 
cookie', + data=urlencode_postdata(self._hidden_inputs(homepage))) + + def _real_extract(self, url): + video_id = self._match_id(url) + if video_id.startswith('murrtube:'): + raise ExtractorError('Support for murrtube: prefix URLs is broken') + video_page = self._download_webpage(url, video_id) + video_attrs = extract_attributes(get_element_html_by_id('video', video_page)) + playlist = update_url(video_attrs['data-url'], query=None) + video_id = self._search_regex(r'/([\da-f]+)/index.m3u8', playlist, 'video id') + + return { + 'id': video_id, + 'title': remove_end(self._og_search_title(video_page), ' - Murrtube'), + 'age_limit': 18, + 'formats': self._extract_m3u8_formats(playlist, video_id, 'mp4'), + 'description': self._og_search_description(video_page), + 'thumbnail': update_url(self._og_search_thumbnail(video_page, default=''), query=None) or None, + 'uploader': clean_html(get_element_by_class('pl-1 is-size-6 has-text-lighter', video_page)), + 'view_count': self._extract_count('Views', video_page), + 'like_count': self._extract_count('Likes', video_page), + 'comment_count': self._extract_count('Comments', video_page), } - } + + +class MurrtubeUserIE(InfoExtractor): + _WORKING = False + IE_DESC = 'Murrtube user profile' + _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$' + _TESTS = [{ + 'url': 'https://murrtube.net/stormy', + 'info_dict': { + 'id': 'stormy', + }, + 'playlist_mincount': 27, + }] + _PAGE_SIZE = 10 def _download_gql(self, video_id, op, note=None, fatal=True): result = self._download_json( @@ -45,72 +110,6 @@ def _download_gql(self, video_id, op, note=None, fatal=True): headers={'Content-Type': 'application/json'}) return result['data'] - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_gql(video_id, { - 'operationName': 'Medium', - 'variables': { - 'id': video_id, - }, - 'query': '''\ -query Medium($id: ID!) 
{ - medium(id: $id) { - title - description - key - duration - commentsCount - likesCount - viewsCount - thumbnailKey - tagList - user { - name - __typename - } - __typename - } -}'''}) - meta = data['medium'] - - storage_url = 'https://storage.murrtube.net/murrtube/' - format_url = storage_url + meta.get('key', '') - thumbnail = storage_url + meta.get('thumbnailKey', '') - - if determine_ext(format_url) == 'm3u8': - formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False) - else: - formats = [{'url': format_url}] - - return { - 'id': video_id, - 'title': meta.get('title'), - 'description': meta.get('description'), - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': int_or_none(meta.get('duration')), - 'uploader': try_get(meta, lambda x: x['user']['name']), - 'view_count': meta.get('viewsCount'), - 'like_count': meta.get('likesCount'), - 'comment_count': meta.get('commentsCount'), - 'tags': meta.get('tagList'), - 'age_limit': 18, - } - - -class MurrtubeUserIE(MurrtubeIE): # XXX: Do not subclass from concrete IE - IE_DESC = 'Murrtube user profile' - _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$' - _TEST = { - 'url': 'https://murrtube.net/stormy', - 'info_dict': { - 'id': 'stormy', - }, - 'playlist_mincount': 27, - } - _PAGE_SIZE = 10 - def _fetch_page(self, username, user_id, page): data = self._download_gql(username, { 'operationName': 'Media', @@ -127,14 +126,14 @@ def _fetch_page(self, username, user_id, page): __typename } }'''}, - 'Downloading page {0}'.format(page + 1)) + f'Downloading page {page + 1}') if data is None: raise ExtractorError(f'Failed to retrieve video list for page {page + 1}') media = data['media'] for entry in media: - yield self.url_result('murrtube:{0}'.format(entry['id']), MurrtubeIE.ie_key()) + yield self.url_result('murrtube:{}'.format(entry['id']), MurrtubeIE.ie_key()) def _real_extract(self, url): username = self._match_id(url) diff --git a/yt_dlp/extractor/museai.py b/yt_dlp/extractor/museai.py new file mode 100644 index 0000000000..7f66928c72 --- /dev/null +++ b/yt_dlp/extractor/museai.py @@ -0,0 +1,112 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + js_to_json, + traverse_obj, + url_or_none, +) + + +class MuseAIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?muse\.ai/(?:v|embed)/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://muse.ai/embed/YdTWvUW', + 'md5': 'f994f9a38be1c3aaf9e37cbd7d76fe7c', + 'info_dict': { + 'id': 'YdTWvUW', + 'ext': 'mp4', + 'title': '2023-05-28-Grabien-1941111 (1)', + 'description': '', + 'uploader': 'Today News Africa', + 'uploader_id': 'TodayNewsAfrica', + 'upload_date': '20230528', + 'timestamp': 1685285044, + 'duration': 1291.3, + 'view_count': int, + 'availability': 'public', + }, + }, { + 'url': 'https://muse.ai/v/gQ4gGAA-0756', + 'md5': '52dbfc78e865e56dc19a1715badc35e8', + 'info_dict': { + 'id': 'gQ4gGAA', + 'ext': 'mp4', + 'title': '0756', + 'description': 'md5:0ca1483f9aac423e9a96ad00bb3a0785', + 'uploader': 'Aerial.ie', + 'uploader_id': 'aerial', + 'upload_date': '20210306', + 'timestamp': 1615072842, + 'duration': 21.4, + 'view_count': int, + 'availability': 'public', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://muse.ai/docs', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'docs', + 'title': 'muse.ai | docs', + 'description': 'md5:6c0293431481582739c82ee8902687fa', + 'age_limit': 0, + 'thumbnail': 
'https://muse.ai/static/imgs/poster-img-docs.png', + }, + 'params': {'allowed_extractors': ['all', '-html5']}, + }] + _EMBED_REGEX = [r'<iframe[^>]*\bsrc=["\'](?P<url>https://muse\.ai/embed/\w+)'] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for embed_id in re.findall(r'<script>[^<]*\bMusePlayer\(\{[^}<]*\bvideo:\s*["\'](\w+)["\']', webpage): + yield f'https://muse.ai/embed/{embed_id}' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://muse.ai/embed/{video_id}', video_id) + data = self._search_json( + r'player\.setData\(', webpage, 'player data', video_id, transform_source=js_to_json) + + source_url = data['url'] + if not url_or_none(source_url): + raise ExtractorError('Unable to extract video URL') + + formats = [{ + 'url': source_url, + 'format_id': 'source', + 'quality': 1, + **traverse_obj(data, { + 'ext': ('filename', {determine_ext}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }] + if source_url.endswith('/data'): + base_url = f'{source_url[:-5]}/videos' + formats.extend(self._extract_m3u8_formats( + f'{base_url}/hls.m3u8', video_id, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + f'{base_url}/dash.mpd', video_id, mpd_id='dash', fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('tcreated', {int_or_none}), + 'uploader': ('owner_name', {str}), + 'uploader_id': ('owner_username', {str}), + 'view_count': ('views', {int_or_none}), + 'age_limit': ('mature', {lambda x: 18 if x else None}), + 'availability': ('visibility', {lambda x: x if x in ('private', 'unlisted') else 'public'}), + }), + } diff --git a/yt_dlp/extractor/musescore.py b/yt_dlp/extractor/musescore.py index 289ae57335..0ef2fa0c88 100644 --- a/yt_dlp/extractor/musescore.py +++ b/yt_dlp/extractor/musescore.py @@ -13,7 +13,7 @@ class MuseScoreIE(InfoExtractor): 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', 'uploader': 'PapyPiano', 'creator': 'Wolfgang Amadeus Mozart', - } + }, }, { 'url': 'https://musescore.com/user/36164500/scores/6837638', 'info_dict': { @@ -24,7 +24,7 @@ class MuseScoreIE(InfoExtractor): 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', 'uploader': 'roxbelviolin', 'creator': 'Guns N´Roses Arr. 
Roxbel Violin', - } + }, }, { 'url': 'https://musescore.com/classicman/fur-elise', 'info_dict': { @@ -35,7 +35,7 @@ class MuseScoreIE(InfoExtractor): 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', 'uploader': 'ClassicMan', 'creator': 'Ludwig van Beethoven (1770–1827)', - } + }, }, { 'url': 'https://musescore.com/minh_cuteee/scores/6555384', 'only_matching': True, @@ -44,8 +44,8 @@ class MuseScoreIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, None) url = self._og_search_url(webpage) or url - id = self._match_id(url) - mp3_url = self._download_json(f'https://musescore.com/api/jmuse?id={id}&index=0&type=mp3&v2=1', id, + video_id = self._match_id(url) + mp3_url = self._download_json(f'https://musescore.com/api/jmuse?id={video_id}&index=0&type=mp3&v2=1', video_id, headers={'authorization': '63794e5461e4cfa046edfbdddfccc1ac16daffd2'})['info']['url'] formats = [{ 'url': mp3_url, @@ -54,7 +54,7 @@ def _real_extract(self, url): }] return { - 'id': id, + 'id': video_id, 'formats': formats, 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), diff --git a/yt_dlp/extractor/musicdex.py b/yt_dlp/extractor/musicdex.py index 48f29702cf..5ca390ef9a 100644 --- a/yt_dlp/extractor/musicdex.py +++ b/yt_dlp/extractor/musicdex.py @@ -8,20 +8,20 @@ class MusicdexBaseIE(InfoExtractor): - def _return_info(self, track_json, album_json, id): + def _return_info(self, track_json, album_json, video_id): return { - 'id': str(id), + 'id': str(video_id), 'title': track_json.get('name'), 'track': track_json.get('name'), 'description': track_json.get('description'), 'track_number': track_json.get('number'), 'url': format_field(track_json, 'url', 'https://www.musicdex.org/%s'), 'duration': track_json.get('duration'), - 'genre': [genre.get('name') for genre in track_json.get('genres') or []], + 'genres': [genre.get('name') for genre in track_json.get('genres') or []], 'like_count': track_json.get('likes_count'), 'view_count': track_json.get('plays'), - 'artist': [artist.get('name') for artist in track_json.get('artists') or []], - 'album_artist': [artist.get('name') for artist in album_json.get('artists') or []], + 'artists': [artist.get('name') for artist in track_json.get('artists') or []], + 'album_artists': [artist.get('name') for artist in album_json.get('artists') or []], 'thumbnail': format_field(album_json, 'image', 'https://www.musicdex.org/%s'), 'album': album_json.get('name'), 'release_year': try_get(album_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year), @@ -43,22 +43,23 @@ class MusicdexSongIE(MusicdexBaseIE): 'track': 'dual existence', 'track_number': 1, 'duration': 266000, - 'genre': ['Anime'], + 'genres': ['Anime'], 'like_count': int, 'view_count': int, - 'artist': ['fripSide'], - 'album_artist': ['fripSide'], + 'artists': ['fripSide'], + 'album_artists': ['fripSide'], 'thumbnail': 'https://www.musicdex.org/storage/album/9iDIam1DHTVqUG4UclFIEq1WAFGXfPW4y0TtZa91.png', 'album': 'To Aru Kagaku no Railgun T OP2 Single - dual existence', - 'release_year': 2020 + 'release_year': 2020, }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://www.musicdex.org/secure/tracks/{id}?defaultRelations=true', id)['track'] - return self._return_info(data_json, data_json.get('album') or {}, id) + video_id = self._match_id(url) + data_json = self._download_json( + 
f'https://www.musicdex.org/secure/tracks/{video_id}?defaultRelations=true', video_id)['track'] + return self._return_info(data_json, data_json.get('album') or {}, video_id) class MusicdexAlbumIE(MusicdexBaseIE): @@ -69,9 +70,9 @@ class MusicdexAlbumIE(MusicdexBaseIE): 'playlist_mincount': 28, 'info_dict': { 'id': '56', - 'genre': ['OST'], + 'genres': ['OST'], 'view_count': int, - 'artist': ['TENMON & Eiichiro Yanagi / minori'], + 'artists': ['TENMON & Eiichiro Yanagi / minori'], 'title': 'ef - a tale of memories Original Soundtrack 2 ~fortissimo~', 'release_year': 2008, 'thumbnail': 'https://www.musicdex.org/storage/album/2rSHkyYBYfB7sbvElpEyTMcUn6toY7AohOgJuDlE.jpg', @@ -79,18 +80,20 @@ class MusicdexAlbumIE(MusicdexBaseIE): }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://www.musicdex.org/secure/albums/{id}?defaultRelations=true', id)['album'] - entries = [self._return_info(track, data_json, track['id']) for track in data_json.get('tracks') or [] if track.get('id')] + playlist_id = self._match_id(url) + data_json = self._download_json( + f'https://www.musicdex.org/secure/albums/{playlist_id}?defaultRelations=true', playlist_id)['album'] + entries = [self._return_info(track, data_json, track['id']) + for track in data_json.get('tracks') or [] if track.get('id')] return { '_type': 'playlist', - 'id': id, + 'id': playlist_id, 'title': data_json.get('name'), 'description': data_json.get('description'), - 'genre': [genre.get('name') for genre in data_json.get('genres') or []], + 'genres': [genre.get('name') for genre in data_json.get('genres') or []], 'view_count': data_json.get('plays'), - 'artist': [artist.get('name') for artist in data_json.get('artists') or []], + 'artists': [artist.get('name') for artist in data_json.get('artists') or []], 'thumbnail': format_field(data_json, 'image', 'https://www.musicdex.org/%s'), 'release_year': try_get(data_json, lambda x: date_from_str(unified_strdate(x['release_date'])).year), 'entries': entries, @@ -98,12 +101,11 @@ def _real_extract(self, url): class MusicdexPageIE(MusicdexBaseIE): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor - def _entries(self, id): - next_page_url = self._API_URL % id + def _entries(self, playlist_id): + next_page_url = self._API_URL % playlist_id while next_page_url: - data_json = self._download_json(next_page_url, id)['pagination'] - for data in data_json.get('data') or []: - yield data + data_json = self._download_json(next_page_url, playlist_id)['pagination'] + yield from data_json.get('data') or [] next_page_url = data_json.get('next_page_url') @@ -123,15 +125,15 @@ class MusicdexArtistIE(MusicdexPageIE): }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://www.musicdex.org/secure/artists/{id}', id)['artist'] + playlist_id = self._match_id(url) + data_json = self._download_json(f'https://www.musicdex.org/secure/artists/{playlist_id}', playlist_id)['artist'] entries = [] - for album in self._entries(id): + for album in self._entries(playlist_id): entries.extend(self._return_info(track, album, track['id']) for track in album.get('tracks') or [] if track.get('id')) return { '_type': 'playlist', - 'id': id, + 'id': playlist_id, 'title': data_json.get('name'), 'view_count': data_json.get('plays'), 'thumbnail': format_field(data_json, 'image_small', 'https://www.musicdex.org/%s'), @@ -156,14 +158,14 @@ class MusicdexPlaylistIE(MusicdexPageIE): }] def _real_extract(self, url): - id = 
self._match_id(url) - data_json = self._download_json(f'https://www.musicdex.org/secure/playlists/{id}', id)['playlist'] + playlist_id = self._match_id(url) + data_json = self._download_json(f'https://www.musicdex.org/secure/playlists/{playlist_id}', playlist_id)['playlist'] entries = [self._return_info(track, track.get('album') or {}, track['id']) - for track in self._entries(id) or [] if track.get('id')] + for track in self._entries(playlist_id) or [] if track.get('id')] return { '_type': 'playlist', - 'id': id, + 'id': playlist_id, 'title': data_json.get('name'), 'description': data_json.get('description'), 'view_count': data_json.get('plays'), diff --git a/yt_dlp/extractor/mwave.py b/yt_dlp/extractor/mwave.py deleted file mode 100644 index efbfd9d430..0000000000 --- a/yt_dlp/extractor/mwave.py +++ /dev/null @@ -1,87 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - parse_duration, -) - - -class MwaveIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' - _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s' - _TESTS = [{ - 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', - # md5 is unstable - 'info_dict': { - 'id': '168859', - 'ext': 'flv', - 'title': '[M COUNTDOWN] SISTAR - SHAKE IT', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'M COUNTDOWN', - 'duration': 206, - 'view_count': int, - } - }, { - 'url': 'http://mwave.interest.me/en/mnettv/videodetail.m?searchVideoDetailVO.clip_id=176199', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - vod_info = self._download_json( - 'http://mwave.interest.me/onair/vod_info.m?vodtype=CL§orid=&endinfo=Y&id=%s' % video_id, - video_id, 'Download vod JSON') - - formats = [] - for num, cdn_info in enumerate(vod_info['cdn']): - stream_url = cdn_info.get('url') - if not stream_url: - continue - stream_name = cdn_info.get('name') or compat_str(num) - f4m_stream = self._download_json( - stream_url, video_id, - 'Download %s stream JSON' % stream_name) - f4m_url = f4m_stream.get('fileurl') - if not f4m_url: - continue - formats.extend( - self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name)) - - return { - 'id': video_id, - 'title': vod_info['title'], - 'thumbnail': vod_info.get('cover'), - 'uploader': vod_info.get('program_title'), - 'duration': parse_duration(vod_info.get('time')), - 'view_count': int_or_none(vod_info.get('hit')), - 'formats': formats, - } - - -class MwaveMeetGreetIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?meetgreet/view/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://mwave.interest.me/meetgreet/view/256', - 'info_dict': { - 'id': '173294', - 'ext': 'flv', - 'title': '[MEET&GREET] Park BoRam', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Mwave', - 'duration': 3634, - 'view_count': int, - } - }, { - 'url': 'http://mwave.interest.me/en/meetgreet/view/256', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - clip_id = self._html_search_regex( - r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)', - webpage, 'clip ID') - clip_url = MwaveIE._URL_TEMPLATE % clip_id - return self.url_result(clip_url, 'Mwave', clip_id) diff --git a/yt_dlp/extractor/mx3.py 
b/yt_dlp/extractor/mx3.py new file mode 100644 index 0000000000..5c42f4d156 --- /dev/null +++ b/yt_dlp/extractor/mx3.py @@ -0,0 +1,171 @@ +import re + +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + get_element_by_class, + int_or_none, + try_call, + url_or_none, + urlhandle_detect_ext, +) +from ..utils.traversal import traverse_obj + + +class Mx3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?%s/t/(?P<id>\w+)' + _FORMATS = [{ + 'url': 'player_asset', + 'format_id': 'default', + 'quality': 0, + }, { + 'url': 'player_asset?quality=hd', + 'format_id': 'hd', + 'quality': 1, + }, { + 'url': 'download', + 'format_id': 'download', + 'quality': 2, + }, { + 'url': 'player_asset?quality=source', + 'format_id': 'source', + 'quality': 2, + }] + + def _extract_formats(self, track_id): + formats = [] + for fmt in self._FORMATS: + format_url = f'https://{self._DOMAIN}/tracks/{track_id}/{fmt["url"]}' + urlh = self._request_webpage( + HEADRequest(format_url), track_id, fatal=False, expected_status=404, + note=f'Checking for format {fmt["format_id"]}') + if urlh and urlh.status == 200: + formats.append({ + **fmt, + 'url': format_url, + 'ext': urlhandle_detect_ext(urlh), + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + }) + return formats + + def _real_extract(self, url): + track_id = self._match_id(url) + webpage = self._download_webpage(url, track_id) + more_info = get_element_by_class('single-more-info', webpage) + data = self._download_json(f'https://{self._DOMAIN}/t/{track_id}.json', track_id, fatal=False) + + def get_info_field(name): + return self._html_search_regex( + rf'<dt[^>]*>\s*{name}\s*</dt>\s*<dd[^>]*>(.*?)</dd>', + more_info, name, default=None, flags=re.DOTALL) + + return { + 'id': track_id, + 'formats': self._extract_formats(track_id), + 'genre': self._html_search_regex( + r'<div\b[^>]+class="single-band-genre"[^>]*>([^<]+)</div>', webpage, 'genre', default=None), + 'release_year': int_or_none(get_info_field('Year of creation')), + 'description': get_info_field('Description'), + 'tags': try_call(lambda: get_info_field('Tag').split(', '), list), + **traverse_obj(data, { + 'title': ('title', {str}), + 'artist': (('performer_name', 'artist'), {str}), + 'album_artist': ('artist', {str}), + 'composer': ('composer_name', {str}), + 'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}), + }, get_all=False), + } + + +class Mx3IE(Mx3BaseIE): + _DOMAIN = 'mx3.ch' + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _TESTS = [{ + 'url': 'https://mx3.ch/t/1Cru', + 'md5': '7ba09e9826b4447d4e1ce9d69e0e295f', + 'info_dict': { + 'id': '1Cru', + 'ext': 'wav', + 'artist': 'Godina', + 'album_artist': 'Tortue Tortue', + 'composer': 'Olivier Godinat', + 'genre': 'Rock', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813', + 'title': "S'envoler", + 'release_year': 2021, + 'tags': [], + }, + }, { + 'url': 'https://mx3.ch/t/1LIY', + 'md5': '48293cb908342547827f963a5a2e9118', + 'info_dict': { + 'id': '1LIY', + 'ext': 'mov', + 'artist': 'Tania Kimfumu', + 'album_artist': 'The Broots', + 'composer': 'Emmanuel Diserens', + 'genre': 'Electro', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670', + 'title': 'The Broots-Larytta remix "Begging For Help"', + 'release_year': 2023, + 'tags': ['the broots', 'cassata records', 'larytta'], + 'description': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali 
Donkilie in 2023', + }, + }, { + 'url': 'https://mx3.ch/t/1C6E', + 'md5': '1afcd578493ddb8e5008e94bb6d97e25', + 'info_dict': { + 'id': '1C6E', + 'ext': 'wav', + 'artist': 'Alien Bubblegum', + 'album_artist': 'Alien Bubblegum', + 'composer': 'Alien Bubblegum', + 'genre': 'Punk', + 'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733', + 'title': 'Wide Awake', + 'release_year': 2021, + 'tags': ['alien bubblegum', 'bubblegum', 'alien', 'pop punk', 'poppunk'], + }, + }] + + +class Mx3NeoIE(Mx3BaseIE): + _DOMAIN = 'neo.mx3.ch' + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _TESTS = [{ + 'url': 'https://neo.mx3.ch/t/1hpd', + 'md5': '6d9986bbae5cac3296ec8813bf965eb2', + 'info_dict': { + 'id': '1hpd', + 'ext': 'wav', + 'artist': 'Baptiste Lopez', + 'album_artist': 'Kammerorchester Basel', + 'composer': 'Jannik Giger', + 'genre': 'Composition, Orchestra', + 'title': 'Troisième œil. Für Kammerorchester (2023)', + 'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252', + 'release_year': 2023, + 'tags': [], + }, + }] + + +class Mx3VolksmusikIE(Mx3BaseIE): + _DOMAIN = 'volksmusik.mx3.ch' + _VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN) + _TESTS = [{ + 'url': 'https://volksmusik.mx3.ch/t/Zx', + 'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c', + 'info_dict': { + 'id': 'Zx', + 'ext': 'mp3', + 'artist': 'Ländlerkapelle GrischArt', + 'album_artist': 'Ländlerkapelle GrischArt', + 'composer': 'Urs Glauser', + 'genre': 'Instrumental, Graubünden', + 'title': 'Chämilouf', + 'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120', + 'release_year': 2012, + 'tags': [], + }, + }] diff --git a/yt_dlp/extractor/mxplayer.py b/yt_dlp/extractor/mxplayer.py index 1fdb08edfe..8d3e35a7c7 100644 --- a/yt_dlp/extractor/mxplayer.py +++ b/yt_dlp/extractor/mxplayer.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, traverse_obj, @@ -23,7 +22,7 @@ class MxplayerIE(InfoExtractor): 'duration': 2451, 'season': 'Season 1', 'series': 'My Girlfriend Is An Alien (Hindi Dubbed)', - 'episode': 'Episode 1' + 'episode': 'Episode 1', }, 'params': { 'format': 'bv', @@ -56,13 +55,13 @@ class MxplayerIE(InfoExtractor): 'duration': 2332, 'season': 'Season 1', 'series': 'Shaitaan', - 'episode': 'Episode 1' + 'episode': 'Episode 1', }, 'params': { 'format': 'best', 'skip_download': True, }, - 'skip': 'No longer available.' + 'skip': 'No longer available.', }, { 'url': 'https://www.mxplayer.in/show/watch-aashram/chapter-1/duh-swapna-online-d445579792b0135598ba1bc9088a84cb', 'info_dict': { @@ -76,7 +75,7 @@ class MxplayerIE(InfoExtractor): 'duration': 2568, 'season': 'Season 1', 'series': 'Aashram', - 'episode': 'Episode 3' + 'episode': 'Episode 3', }, 'params': { 'format': 'bv', @@ -95,7 +94,7 @@ class MxplayerIE(InfoExtractor): 'duration': 1305, 'season': 'Season 1', 'series': 'Dangerous', - 'episode': 'Episode 1' + 'episode': 'Episode 1', }, 'params': { 'format': 'bv', @@ -114,7 +113,7 @@ class MxplayerIE(InfoExtractor): 'format': 'best', 'skip_download': True, }, - 'skip': 'No longer available. Cannot be played on browser' + 'skip': 'No longer available. 
Cannot be played on browser' + 'skip': 'No longer available. Cannot be played on browser', }, { 'url': 'https://www.mxplayer.in/movie/watch-kitne-door-kitne-paas-movie-online-a9e9c76c566205955f70d8b2cb88a6a2', 'info_dict': { @@ -206,11 +205,11 @@ class MxplayerShowIE(InfoExtractor): 'info_dict': { 'id': 'a8f44e3cc0814b5601d17772cedf5417', 'title': 'Watch Chakravartin Ashoka Samrat Series Online', - } + }, }] - _API_SHOW_URL = "https://api.mxplay.com/v1/web/detail/tab/tvshowseasons?type=tv_show&id={}&device-density=2&platform=com.mxplay.desktop&content-languages=hi,en" - _API_EPISODES_URL = "https://api.mxplay.com/v1/web/detail/tab/tvshowepisodes?type=season&id={}&device-density=1&platform=com.mxplay.desktop&content-languages=hi,en&{}" + _API_SHOW_URL = 'https://api.mxplay.com/v1/web/detail/tab/tvshowseasons?type=tv_show&id={}&device-density=2&platform=com.mxplay.desktop&content-languages=hi,en' + _API_EPISODES_URL = 'https://api.mxplay.com/v1/web/detail/tab/tvshowepisodes?type=season&id={}&device-density=1&platform=com.mxplay.desktop&content-languages=hi,en&{}' def _entries(self, show_id): show_json = self._download_json( @@ -218,7 +217,7 @@ def _entries(self, show_id): video_id=show_id, headers={'Referer': 'https://mxplayer.in'}) page_num = 0 for season in show_json.get('items') or []: - season_id = try_get(season, lambda x: x['id'], compat_str) + season_id = try_get(season, lambda x: x['id'], str) next_url = '' while next_url is not None: page_num += 1 @@ -226,11 +225,11 @@ def _entries(self, show_id): self._API_EPISODES_URL.format(season_id, next_url), video_id=season_id, headers={'Referer': 'https://mxplayer.in'}, - note='Downloading JSON metadata page %d' % page_num) + note=f'Downloading JSON metadata page {page_num}') for episode in season_json.get('items') or []: video_url = episode['webUrl'] yield self.url_result( - 'https://mxplayer.in%s' % video_url, + f'https://mxplayer.in{video_url}', ie=MxplayerIE.ie_key(), video_id=video_url.split('-')[-1]) next_url = season_json.get('next') diff --git a/yt_dlp/extractor/mychannels.py b/yt_dlp/extractor/mychannels.py deleted file mode 100644 index 8a70c1f7b4..0000000000 --- a/yt_dlp/extractor/mychannels.py +++ /dev/null @@ -1,35 +0,0 @@ -from .common import InfoExtractor - - -class MyChannelsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416', - 'md5': 'b8993daad4262dd68d89d651c0c52c45', - 'info_dict': { - 'id': 'wUUDZZep6vQD', - 'ext': 'mp4', - 'title': 'Miss Holland joins VOTE LEAVE', - 'description': 'Miss Holland | #13 Not a potato', - 'uploader': 'Miss Holland', - } - } - - def _real_extract(self, url): - id_type, url_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, url_id) - video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') - - def extract_data_val(attr, fatal=False): - return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) - minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') - - return { - '_type': 'url_transparent', - 'url': 'minoto:%s' % minoto_id, - 'id': url_id, - 'title': extract_data_val('title', True), - 'description': extract_data_val('description'), - 'thumbnail': extract_data_val('image'), - 'uploader': extract_data_val('channel'), - } diff --git a/yt_dlp/extractor/myspace.py b/yt_dlp/extractor/myspace.py
index 3451098379..fa2ef14e13 100644 --- a/yt_dlp/extractor/myspace.py +++ b/yt_dlp/extractor/myspace.py @@ -95,17 +95,17 @@ def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width= if is_song: # songs don't store any useful info in the 'context' variable song_data = self._search_regex( - r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id, + rf'''<button.*data-song-id=(["\']){video_id}\1.*''', webpage, 'song_data', default=None, group=0) if song_data is None: # some songs in an album are not playable self.report_warning( - '%s: No downloadable song on this page' % video_id) + f'{video_id}: No downloadable song on this page') return def search_data(name): return self._search_regex( - r'''data-%s=([\'"])(?P<data>.*?)\1''' % name, + rf'''data-{name}=([\'"])(?P<data>.*?)\1''', song_data, name, default='', group='data') formats = formats_from_stream_urls( search_data('stream-url'), search_data('hls-stream-url'), @@ -114,10 +114,10 @@ def search_data(name): vevo_id = search_data('vevo-id') youtube_id = search_data('youtube-id') if vevo_id: - self.to_screen('Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + self.to_screen(f'Vevo video detected: {vevo_id}') + return self.url_result(f'vevo:{vevo_id}', ie='Vevo') elif youtube_id: - self.to_screen('Youtube video detected: %s' % youtube_id) + self.to_screen(f'Youtube video detected: {youtube_id}') return self.url_result(youtube_id, ie='Youtube') else: raise ExtractorError( @@ -181,7 +181,7 @@ def _real_extract(self, url): tracks_paths = re.findall(r'"music:song" content="(.*?)"', webpage) if not tracks_paths: raise ExtractorError( - '%s: No songs found, try using proxy' % display_id, + f'{display_id}: No songs found, try using proxy', expected=True) entries = [ self.url_result(t_path, ie=MySpaceIE.ie_key()) diff --git a/yt_dlp/extractor/myspass.py b/yt_dlp/extractor/myspass.py index 28ac982d66..3e8d506c44 100644 --- a/yt_dlp/extractor/myspass.py +++ b/yt_dlp/extractor/myspass.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_duration, @@ -75,7 +74,7 @@ def _real_extract(self, url): for group in self._search_regex(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url, 'myspass', group=(1, 2, 3), default=[]): group_int = int(group) if group_int > video_id_int: - video_url = video_url.replace(group, compat_str(group_int // video_id_int)) + video_url = video_url.replace(group, str(group_int // video_id_int)) return { 'id': video_id, diff --git a/yt_dlp/extractor/myvi.py b/yt_dlp/extractor/myvi.py deleted file mode 100644 index df7200be20..0000000000 --- a/yt_dlp/extractor/myvi.py +++ /dev/null @@ -1,100 +0,0 @@ -from .common import InfoExtractor -from .vimple import SprutoBaseIE - - -class MyviIE(SprutoBaseIE): - _VALID_URL = r'''(?x) - (?: - https?:// - (?:www\.)? - myvi\. 
- (?: - (?:ru/player|tv)/ - (?: - (?: - embed/html| - flash| - api/Video/Get - )/| - content/preloader\.swf\?.*\bid= - )| - ru/watch/ - )| - myvi: - ) - (?P<id>[\da-zA-Z_-]+) - ''' - _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1'] - _TESTS = [{ - 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', - 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf', - 'info_dict': { - 'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43', - 'ext': 'mp4', - 'title': 'хозяин жизни', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 25, - }, - }, { - 'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0', - 'only_matching': True, - }, { - 'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', - 'only_matching': True, - }, { - 'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0', - 'only_matching': True, - }, { - 'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30', - 'only_matching': True, - }, { - 'url': 'https://www.myvi.ru/watch/YwbqszQynUaHPn_s82sx0Q2', - 'only_matching': True, - }, { - 'url': 'myvi:YwbqszQynUaHPn_s82sx0Q2', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - spruto = self._download_json( - 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData'] - - return self._extract_spruto(spruto, video_id) - - -class MyviEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?myvi\.tv/(?:[^?]+\?.*?\bv=|embed/)(?P<id>[\da-z]+)' - _TESTS = [{ - 'url': 'https://www.myvi.tv/embed/ccdqic3wgkqwpb36x9sxg43t4r', - 'info_dict': { - 'id': 'b3ea0663-3234-469d-873e-7fecf36b31d1', - 'ext': 'mp4', - 'title': 'Твоя (original song).mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 277, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.myvi.tv/idmi6o?v=ccdqic3wgkqwpb36x9sxg43t4r#watch', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if MyviIE.suitable(url) else super(MyviEmbedIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.myvi.tv/embed/%s' % video_id, video_id) - - myvi_id = self._search_regex( - r'CreatePlayer\s*\(\s*["\'].*?\bv=([\da-zA-Z_]+)', - webpage, 'video id') - - return self.url_result('myvi:%s' % myvi_id, ie=MyviIE.ie_key()) diff --git a/yt_dlp/extractor/myvideoge.py b/yt_dlp/extractor/myvideoge.py index 64cee48e7f..3e0bb24995 100644 --- a/yt_dlp/extractor/myvideoge.py +++ b/yt_dlp/extractor/myvideoge.py @@ -64,7 +64,7 @@ def _real_extract(self, url): # translate any ka month to an en one re.sub('|'.join(self._MONTH_NAMES_KA), lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))], - upload_date, re.I)) + upload_date, flags=re.I)) if upload_date else None) return { diff --git a/yt_dlp/extractor/myvidster.py b/yt_dlp/extractor/myvidster.py index c91f294bf0..e3b700dbbb 100644 --- a/yt_dlp/extractor/myvidster.py +++ b/yt_dlp/extractor/myvidster.py @@ -2,7 +2,7 @@ class MyVidsterIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' + _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)' _TEST = { 'url': 
'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making', diff --git a/yt_dlp/extractor/mzaalo.py b/yt_dlp/extractor/mzaalo.py new file mode 100644 index 0000000000..52e5ea89e7 --- /dev/null +++ b/yt_dlp/extractor/mzaalo.py @@ -0,0 +1,95 @@ +from .common import InfoExtractor +from ..utils import ( + parse_age_limit, + parse_duration, + traverse_obj, + url_or_none, +) + + +class MzaaloIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?mzaalo\.com/(?:play|watch)/(?P<type>movie|original|clip)/(?P<id>[a-f0-9-]+)/[\w-]+' + _TESTS = [{ + # Movies + 'url': 'https://www.mzaalo.com/play/movie/c0958d9f-f90e-4503-a755-44358758921d/Jamun', + 'info_dict': { + 'id': 'c0958d9f-f90e-4503-a755-44358758921d', + 'title': 'Jamun', + 'ext': 'mp4', + 'description': 'md5:24fe9ebb9bbe5b36f7b54b90ab1e2f31', + 'thumbnails': 'count:15', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5527.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Shows + 'url': 'https://www.mzaalo.com/play/original/93d42b2b-f373-4c2d-bca4-997412cb069d/Modi-Season-2-CM-TO-PM/Episode-1:Decision,-Not-Promises', + 'info_dict': { + 'id': '93d42b2b-f373-4c2d-bca4-997412cb069d', + 'title': 'Episode 1:Decision, Not Promises', + 'ext': 'mp4', + 'description': 'md5:16f76058432a54774fbb2561a1955652', + 'thumbnails': 'count:22', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2040.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Streams/Clips + 'url': 'https://www.mzaalo.com/play/clip/83cdbcb5-400a-42f1-a1d2-459053cfbda5/Manto-Ki-Kahaaniya', + 'info_dict': { + 'id': '83cdbcb5-400a-42f1-a1d2-459053cfbda5', + 'title': 'Manto Ki Kahaaniya', + 'ext': 'mp4', + 'description': 'md5:c3c5f1d05f0fd1bfcb05b673d1cc9f2f', + 'thumbnails': 'count:3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1937.0, + 'language': 'hin', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://mzaalo.com/watch/MOVIE/389c892d-0b65-4019-bf73-d4edcb1c014f/Chalo-Dilli', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + path = (f'partner/streamurl?&assetId={video_id}&getClipDetails=YES' if type_ == 'clip' + else f'api/v2/player/details?assetType={type_.upper()}&assetId={video_id}') + data = self._download_json( + f'https://production.mzaalo.com/platform/{path}', video_id, headers={ + 'Ocp-Apim-Subscription-Key': '1d0caac2702049b89a305929fdf4cbae', + })['data'] + + formats = self._extract_m3u8_formats(data['streamURL'], video_id) + + subtitles = {} + for subs_lang, subs_url in traverse_obj(data, ('subtitles', {dict.items}, ...)): + if url_or_none(subs_url): + subtitles[subs_lang] = [{'url': subs_url, 'ext': 'vtt'}] + + lang = traverse_obj(data, ('language', {str.lower})) + for f in formats: + f['language'] = lang + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {parse_duration}), + 'age_limit': ('maturity_rating', {parse_age_limit}), + 'thumbnails': ('images', ..., {'url': {url_or_none}}), + 'categories': ('genre', ..., {str}), + }), + } diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py index 55345f3983..bbb327e750 100644 --- a/yt_dlp/extractor/n1.py +++ b/yt_dlp/extractor/n1.py @@ -2,8 +2,8 @@ from .common import InfoExtractor from 
..utils import ( - unified_timestamp, extract_attributes, + unified_timestamp, ) @@ -16,7 +16,7 @@ class N1InfoAssetIE(InfoExtractor): 'id': 'ljsottomazilirija3060921-n1info-si-worldwide', 'ext': 'mp4', 'title': 'ljsottomazilirija3060921-n1info-si-worldwide', - } + }, }] def _real_extract(self, url): @@ -33,7 +33,7 @@ def _real_extract(self, url): class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' - _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)' _TESTS = [{ # Youtube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', @@ -46,7 +46,7 @@ class N1InfoIIE(InfoExtractor): 'description': 'md5:467f330af1effedd2e290f10dc31bb8e', 'uploader': 'Sport Klub', 'uploader_id': 'sportklub', - } + }, }, { 'url': 'https://rs.n1info.com/vesti/djilas-los-plan-za-metro-nece-resiti-nijedan-saobracajni-problem/', 'info_dict': { @@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor): 'upload_date': '20211102', 'timestamp': 1635861677, }, + }, { + 'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/', + 'info_dict': { + 'id': '1332368', + 'ext': 'mp4', + 'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama', + 'upload_date': '20230620', + 'timestamp': 1687290536, + 'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg', + }, }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, @@ -105,19 +115,35 @@ def _real_extract(self, url): title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) - - videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + plugin_data = self._html_search_meta('BridPlugin', webpage) entries = [] - for video in videos: - video_data = extract_attributes(video) - entries.append({ - '_type': 'url_transparent', - 'url': video_data.get('data-url'), - 'id': video_data.get('id'), - 'title': title, - 'thumbnail': video_data.get('data-thumbnail'), - 'timestamp': timestamp, - 'ie_key': 'N1InfoAsset'}) + if plugin_data: + site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id') + for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage): + video_id = self._parse_json(video_data, title)['video'] + entries.append({ + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'thumbnail': self._html_search_meta('thumbnailURL', webpage), + 'formats': self._extract_m3u8_formats( + f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8', + video_id, fatal=False), + }) + else: + # Old player still present in older articles + videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + for video in videos: + video_data = extract_attributes(video) + entries.append({ + '_type': 'url_transparent', + 'url': video_data.get('data-url'), + 'id': video_data.get('id'), + 'title': title, + 'thumbnail': video_data.get('data-thumbnail'), + 'timestamp': timestamp, + 'ie_key': 'N1InfoAsset', + }) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) for embedded_video in embedded_videos: diff --git a/yt_dlp/extractor/nate.py b/yt_dlp/extractor/nate.py index 5e74caa7f6..bbc641f0e2 100644 --- a/yt_dlp/extractor/nate.py +++ 
b/yt_dlp/extractor/nate.py @@ -29,7 +29,7 @@ class NateIE(InfoExtractor): 'uploader_id': '3606', 'tags': 'count:59', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }, { 'url': 'https://tv.nate.com/clip/4300566', 'info_dict': { @@ -47,7 +47,7 @@ class NateIE(InfoExtractor): 'uploader_id': '27987', 'tags': 'count:20', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] _QUALITY = { @@ -60,8 +60,8 @@ class NateIE(InfoExtractor): } def _real_extract(self, url): - id = self._match_id(url) - video_data = self._download_json(f'https://tv.nate.com/api/v1/clip/{id}', id) + video_id = self._match_id(url) + video_data = self._download_json(f'https://tv.nate.com/api/v1/clip/{video_id}', video_id) formats = [{ 'format_id': f_url[-2:], 'url': f_url, @@ -69,7 +69,7 @@ def _real_extract(self, url): 'quality': int_or_none(f_url[-2:]), } for f_url in video_data.get('smcUriList') or []] return { - 'id': id, + 'id': video_id, 'title': video_data.get('clipTitle'), 'description': video_data.get('synopsis'), 'thumbnail': video_data.get('contentImg'), @@ -102,19 +102,19 @@ class NateProgramIE(InfoExtractor): }, }] - def _entries(self, id): + def _entries(self, playlist_id): for page_num in itertools.count(1): - program_data = self._download_json(f'https://tv.nate.com/api/v1/program/{id}/clip/ranking?size=20&page={page_num}', - id, note=f'Downloading page {page_num}') + program_data = self._download_json( + f'https://tv.nate.com/api/v1/program/{playlist_id}/clip/ranking?size=20&page={page_num}', + playlist_id, note=f'Downloading page {page_num}') for clip in program_data.get('content') or []: clip_id = clip.get('clipSeq') if clip_id: yield self.url_result( - 'https://tv.nate.com/clip/%s' % clip_id, - ie=NateIE.ie_key(), video_id=clip_id) + f'https://tv.nate.com/clip/{clip_id}', NateIE, playlist_id) if program_data.get('last'): break def _real_extract(self, url): - id = self._match_id(url) - return self.playlist_result(self._entries(id), playlist_id=id) + playlist_id = self._match_id(url) + return self.playlist_result(self._entries(playlist_id), playlist_id=playlist_id) diff --git a/yt_dlp/extractor/nationalgeographic.py b/yt_dlp/extractor/nationalgeographic.py index ad525c2589..43f84a9527 100644 --- a/yt_dlp/extractor/nationalgeographic.py +++ b/yt_dlp/extractor/nationalgeographic.py @@ -24,6 +24,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, { 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', @@ -38,6 +39,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, ] @@ -53,7 +55,7 @@ def _real_extract(self, url): '_type': 'url_transparent', 'ie_key': 'ThePlatform', 'url': smuggle_url( - 'http://link.theplatform.com/s/ngs/media/guid/2423130747/%s?mbr=true' % guid, + f'http://link.theplatform.com/s/ngs/media/guid/2423130747/{guid}?mbr=true', {'force_smil_url': True}), 'id': guid, } @@ -75,6 +77,7 @@ class NationalGeographicTVIE(FOXIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': 'Content not available', }] _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 7a1890a618..a9f7f46078 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -1,27 +1,32 @@ +import base64 
+import hashlib +import hmac import itertools +import json import re -from urllib.parse import urlparse, parse_qs +import time +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, - clean_html, dict_get, int_or_none, join_nonempty, merge_dicts, - parse_duration, + parse_iso8601, traverse_obj, try_get, unified_timestamp, update_url_query, + url_or_none, ) class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE + @staticmethod # NB: Used in WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)): @@ -31,7 +36,7 @@ def process_subtitles(vod_data, process_url): type_ = 'automatic_captions' if caption.get('type') == 'auto' else 'subtitles' lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und' if caption.get('type') == 'fan': - lang += '_fan%d' % next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in ret[type_]) + lang += '_fan{}'.format(next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in ret[type_])) ret[type_].setdefault(lang, []).extend({ 'url': sub_url, 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '), @@ -58,7 +63,7 @@ def extract_formats(streams, stream_type, query={}): encoding_option = stream.get('encodingOption', {}) bitrate = stream.get('bitrate', {}) formats.append({ - 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), + 'format_id': '{}_{}'.format(stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), 'url': stream_url, 'ext': 'mp4', 'width': int_or_none(encoding_option.get('width')), @@ -110,6 +115,18 @@ def get_subs(caption_url): **self.process_subtitles(video_data, get_subs), } + def _call_api(self, path, video_id): + api_endpoint = f'https://apis.naver.com/now_web2/now_web_api/v1{path}' + key = b'nbxvs5nwNG9QKEWK0ADjYA4JZoujF4gHcIwvoCxFTPAeamq5eemvt5IWAYXxrbYM' + msgpad = int(time.time() * 1000) + md = base64.b64encode(hmac.HMAC( + key, f'{api_endpoint[:255]}{msgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + + return self._download_json(api_endpoint, video_id=video_id, headers=self.geo_verification_headers(), query={ + 'msgpad': msgpad, + 'md': md, + })['result'] + class NaverIE(NaverBaseIE): _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P<id>\d+)' @@ -125,21 +142,32 @@ class NaverIE(NaverBaseIE): 'upload_date': '20130903', 'uploader': '메가스터디, 합격불변의 법칙', 'uploader_id': 'megastudy', + 'uploader_url': 'https://tv.naver.com/megastudy', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'duration': 2118, + 'thumbnail': r're:^https?://.*\.jpg', }, }, { 'url': 'http://tv.naver.com/v/395837', - 'md5': '8a38e35354d26a17f73f4e90094febd3', + 'md5': '7791205fa89dbed2f5e3eb16d287ff05', 'info_dict': { 'id': '395837', 'ext': 'mp4', 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', - 'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3', + 'description': 'md5:c76be23e21403a6473d8119678cdb5cb', 'timestamp': 1432030253, 'upload_date': '20150519', - 'uploader': '4가지쇼 시즌2', - 'uploader_id': 'wrappinguser29', + 'uploader': '4가지쇼', + 'uploader_id': '4show', + 'uploader_url': 'https://tv.naver.com/4show', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'duration': 277, + 'thumbnail': r're:^https?://.*\.jpg', }, - 'skip': 'Georestricted', }, { 'url': 
'http://tvcast.naver.com/v/81652', 'only_matching': True, @@ -147,56 +175,63 @@ class NaverIE(NaverBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - 'https://tv.naver.com/api/json/v/' + video_id, - video_id, headers=self.geo_verification_headers()) - player_info_json = content.get('playerInfoJson') or {} - current_clip = player_info_json.get('currentClip') or {} + data = self._call_api(f'/clips/{video_id}/play-info', video_id) - vid = current_clip.get('videoId') - in_key = current_clip.get('inKey') + vid = traverse_obj(data, ('clip', 'videoId', {str})) + in_key = traverse_obj(data, ('play', 'inKey', {str})) if not vid or not in_key: - player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth']) - if player_auth == 'notCountry': - self.raise_geo_restricted(countries=['KR']) - elif player_auth == 'notLogin': - self.raise_login_required() - raise ExtractorError('couldn\'t extract vid and key') + raise ExtractorError('Unable to extract video info') + info = self._extract_video_info(video_id, vid, in_key) - info.update({ - 'description': clean_html(current_clip.get('description')), - 'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000), - 'duration': parse_duration(current_clip.get('displayPlayTime')), - 'like_count': int_or_none(current_clip.get('recommendPoint')), - 'age_limit': 19 if current_clip.get('adult') else None, - }) + info.update(traverse_obj(data, ('clip', { + 'title': 'title', + 'description': 'description', + 'timestamp': ('firstExposureDatetime', {parse_iso8601}), + 'duration': ('playTime', {int_or_none}), + 'like_count': ('likeItCount', {int_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'uploader': 'channelName', + 'uploader_id': 'channelId', + 'uploader_url': ('channelUrl', {url_or_none}), + 'age_limit': ('adultVideo', {lambda x: 19 if x else None}), + }))) return info -class NaverLiveIE(InfoExtractor): +class NaverLiveIE(NaverBaseIE): IE_NAME = 'Naver:live' _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/l/(?P<id>\d+)' _GEO_BYPASS = False _TESTS = [{ - 'url': 'https://tv.naver.com/l/52010', + 'url': 'https://tv.naver.com/l/127062', 'info_dict': { - 'id': '52010', + 'id': '127062', 'ext': 'mp4', - 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"', - 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3', - 'channel_id': 'NTV-ytnnews24-0', - 'start_time': 1597026780000, + 'live_status': 'is_live', + 'channel': '뉴스는 YTN', + 'channel_id': 'ytnnews24', + 'title': 're:^대한민국 24시간 뉴스 채널 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:f938b5956711beab6f882314ffadf4d5', + 'start_time': 1677752280, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'like_count': int, }, }, { - 'url': 'https://tv.naver.com/l/51549', + 'url': 'https://tv.naver.com/l/140535', 'info_dict': { - 'id': '51549', + 'id': '140535', 'ext': 'mp4', - 'title': '연합뉴스TV - 코로나19 뉴스특보', - 'description': 'md5:c655e82091bc21e413f549c0eaccc481', - 'channel_id': 'NTV-yonhapnewstv-0', - 'start_time': 1596406380000, + 'live_status': 'is_live', + 'channel': 'KBS뉴스', + 'channel_id': 'kbsnews', + 'start_time': 1696867320, + 'title': 're:^언제 어디서나! 
KBS 뉴스 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6ad419c0bf2f332829bda3f79c295284', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'like_count': int, }, }, { 'url': 'https://tv.naver.com/l/54887', @@ -205,56 +240,28 @@ class NaverLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id, 'Downloading Page', 'Unable to download Page') - secure_url = self._search_regex(r'sApiF:\s+(?:"|\')([^"\']+)', page, 'secureurl') - - info = self._extract_video_info(video_id, secure_url) - info.update({ - 'description': self._og_search_description(page) - }) - - return info - - def _extract_video_info(self, video_id, url): - video_data = self._download_json(url, video_id, headers=self.geo_verification_headers()) - meta = video_data.get('meta') - status = meta.get('status') + data = self._call_api(f'/live-end/normal/{video_id}/play-info?renewLastPlayDate=true', video_id) + status = traverse_obj(data, ('live', 'liveStatus')) if status == 'CLOSED': raise ExtractorError('Stream is offline.', expected=True) elif status != 'OPENED': - raise ExtractorError('Unknown status %s' % status) - - title = meta.get('title') - stream_list = video_data.get('streams') - - if stream_list is None: - raise ExtractorError('Could not get stream data.', expected=True) - - formats = [] - for quality in stream_list: - if not quality.get('url'): - continue - - prop = quality.get('property') - if prop.get('abr'): # This abr doesn't mean Average audio bitrate. - continue - - formats.extend(self._extract_m3u8_formats( - quality.get('url'), video_id, 'mp4', - m3u8_id=quality.get('qualityId'), live=True - )) + raise ExtractorError(f'Unknown status {status!r}') return { 'id': video_id, - 'title': title, - 'formats': formats, - 'channel_id': meta.get('channelId'), - 'channel_url': meta.get('channelUrl'), - 'thumbnail': meta.get('imgUrl'), - 'start_time': meta.get('startTime'), - 'categories': [meta.get('categoryId')], - 'is_live': True + 'formats': self._extract_m3u8_formats( + traverse_obj(data, ('playbackBody', {json.loads}, 'media', 0, 'path')), video_id, live=True), + **traverse_obj(data, ('live', { + 'title': 'title', + 'channel': 'channelName', + 'channel_id': 'channelId', + 'description': 'description', + 'like_count': (('likeCount', 'likeItCount'), {int_or_none}), + 'thumbnail': ('thumbnailImageUrl', {url_or_none}), + 'start_time': (('startTime', 'startDateTime', 'startYmdt'), {parse_iso8601}), + }), get_all=False), + 'is_live': True, } @@ -279,7 +286,7 @@ class NaverNowIE(NaverBaseIE): }, 'params': { 'noplaylist': True, - } + }, }, { 'url': 'https://now.naver.com/s/now.4759?shareHightlight=26601461#highlight=', 'md5': '9f6118e398aa0f22b2152f554ea7851b', @@ -304,7 +311,7 @@ class NaverNowIE(NaverBaseIE): 'id': '4759', 'title': '아이키의 떰즈업', }, - 'playlist_mincount': 101 + 'playlist_mincount': 101, }, { 'url': 'https://now.naver.com/s/now.4759?shareReplayId=26331132#replay', 'info_dict': { @@ -341,7 +348,7 @@ def _extract_show_replays(self, show_id): show_vod_info = self._download_json( f'{self._API_URL}/vod-shows/now.{show_id}', show_id, query={'page': page, 'page_size': page_size}, - note=f'Downloading JSON vod list for show {show_id} - page {page}' + note=f'Downloading JSON vod list for show {show_id} - page {page}', )['response']['result'] for v in show_vod_info.get('vod_list') or []: yield self._extract_replay(show_id, v['id']) @@ -381,7 +388,7 @@ def _extract_highlight(self, show_id, highlight_id): def 
_real_extract(self, url): show_id = self._match_id(url) - qs = parse_qs(urlparse(url).query) + qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query) if not self._yes_playlist(show_id, qs.get('shareHightlight')): return self._extract_highlight(show_id, qs['shareHightlight'][0]) diff --git a/yt_dlp/extractor/nba.py b/yt_dlp/extractor/nba.py index d8fc82488d..91ae1d14c6 100644 --- a/yt_dlp/extractor/nba.py +++ b/yt_dlp/extractor/nba.py @@ -1,15 +1,12 @@ import functools import re +import urllib.parse from .turner import TurnerBaseIE -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) from ..utils import ( + OnDemandPagedList, int_or_none, merge_dicts, - OnDemandPagedList, parse_duration, parse_iso8601, parse_qs, @@ -22,7 +19,7 @@ class NBACVPBaseIE(TurnerBaseIE): def _extract_nba_cvp_info(self, path, video_id, fatal=False): return self._extract_cvp_info( - 'http://secure.nba.com/%s' % path, video_id, { + f'http://secure.nba.com/{path}', video_id, { 'default': { 'media_src': 'http://nba.cdn.turner.com/nba/big', }, @@ -97,7 +94,7 @@ def _extract_video(self, filter_key, filter_value): class NBAWatchEmbedIE(NBAWatchBaseIE): - IENAME = 'nba:watch:embed' + IE_NAME = 'nba:watch:embed' _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)' _TESTS = [{ 'url': 'http://watch.nba.com/embed?id=659395', @@ -185,7 +182,7 @@ def _fetch_page(self, collection_id, page): page += 1 videos = self._download_json( 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, - collection_id, 'Downloading page %d JSON metadata' % page, query={ + collection_id, f'Downloading page {page} JSON metadata', query={ 'count': self._PAGE_SIZE, 'page': page, })['results']['videos'] @@ -260,14 +257,14 @@ def _embed_url_result(self, team, content_id): def _call_api(self, team, content_id, query, resource): return self._download_json( - 'https://api.nba.net/2/%s/video,imported_video,wsc/' % team, - content_id, 'Download %s JSON metadata' % resource, + f'https://api.nba.net/2/{team}/video,imported_video,wsc/', + content_id, f'Download {resource} JSON metadata', query=query, headers={ 'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b', })['response']['result'] def _extract_video(self, video, team, extract_all=True): - video_id = compat_str(video['nid']) + video_id = str(video['nid']) team = video['brand'] info = { @@ -330,7 +327,7 @@ def _extract_video(self, video, team, extract_all=True): def _real_extract(self, url): team, display_id = self._match_valid_url(url).groups() if '/play#/' in url: - display_id = compat_urllib_parse_unquote(display_id) + display_id = urllib.parse.unquote(display_id) else: webpage = self._download_webpage(url, display_id) display_id = self._search_regex( @@ -339,7 +336,7 @@ def _real_extract(self, url): class NBAEmbedIE(NBABaseIE): - IENAME = 'nba:embed' + IE_NAME = 'nba:embed' _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)' _TESTS = [{ 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', @@ -361,8 +358,8 @@ def _real_extract(self, url): class NBAIE(NBABaseIE): - IENAME = 'nba' - _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + 
IE_NAME = 'nba' + _VALID_URL = NBABaseIE._VALID_URL_BASE + f'(?!{NBABaseIE._CHANNEL_PATH_REGEX})video/(?P<id>(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', 'info_dict': { @@ -388,8 +385,8 @@ def _extract_url_results(self, team, content_id): class NBAChannelIE(NBABaseIE): - IENAME = 'nba:channel' - _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + IE_NAME = 'nba:channel' + _VALID_URL = NBABaseIE._VALID_URL_BASE + f'(?:{NBABaseIE._CHANNEL_PATH_REGEX})/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.nba.com/blazers/video/channel/summer_league', 'info_dict': { @@ -408,7 +405,7 @@ def _fetch_page(self, team, channel, page): 'channels': channel, 'count': self._PAGE_SIZE, 'offset': page * self._PAGE_SIZE, - }, 'page %d' % (page + 1)) + }, f'page {page + 1}') for video in results: yield self._extract_video(video, team, False) diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index b9f65e9270..8f6fb22b17 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -1,20 +1,26 @@ import base64 import json import re +import urllib.parse +import xml.etree.ElementTree +from .adobepass import AdobePassIE from .common import InfoExtractor from .theplatform import ThePlatformIE, default_ns -from .adobepass import AdobePassIE -from ..compat import compat_urllib_parse_unquote +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, RegexNotFoundError, UserNotLive, clean_html, + determine_ext, + float_or_none, int_or_none, + join_nonempty, + mimetype2ext, parse_age_limit, parse_duration, + remove_end, smuggle_url, traverse_obj, try_get, @@ -22,7 +28,6 @@ unified_timestamp, update_url_query, url_basename, - xpath_attr, ) @@ -49,6 +54,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'chapters': 'count:1', 'tags': 'count:4', 'thumbnail': r're:https?://.+\.jpg', + 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'], + 'media_type': 'Full Episode', }, 'params': { 'skip_download': 'm3u8', @@ -127,8 +134,9 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'tags': 'count:10', 'age_limit': 0, 'thumbnail': r're:https?://.+\.jpg', + 'categories': ['Series/Quantum Leap 2022'], + 'media_type': 'Highlight', }, - 'expected_warnings': ['Ignoring subtitle tracks'], 'params': { 'skip_download': 'm3u8', }, @@ -141,12 +149,12 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE # Percent escaped url 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189', 'only_matching': True, - } + }, ] def _real_extract(self, url): permalink, video_id = self._match_valid_url(url).groups() - permalink = 'http' + compat_urllib_parse_unquote(permalink) + permalink = 'http' + urllib.parse.unquote(permalink) video_data = self._download_json( 'https://friendship.nbc.co/v2/graphql', video_id, query={ 'query': '''query bonanzaPage( @@ -194,7 +202,7 @@ def _real_extract(self, url): 'switch': 'HLSServiceSecure', } video_id = video_data['mpxGuid'] - tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id) + tp_path = 'NnzsPC/media/guid/{}/{}'.format(video_data.get('mpxAccountId') or '2410887629', video_id) tpm = self._download_theplatform_metadata(tp_path, video_id) title = tpm.get('title') or video_data.get('secondaryTitle') if video_data.get('locked'): @@ -204,7 +212,7 @@ def 
_real_extract(self, url): query['auth'] = self._extract_mvpd_auth( url, video_id, 'nbcentertainment', resource) theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id), + 'http://link.theplatform.com/s/NnzsPC/media/guid/{}/{}'.format(video_data.get('mpxAccountId') or '2410887629', video_id), query), {'force_smil_url': True}) # Empty string or 0 can be valid values for these. So the check must be `is None` @@ -246,7 +254,7 @@ def _real_extract(self, url): class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' - _EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % _VALID_URL_BASE] + _EMBED_REGEX = [rf'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>{_VALID_URL_BASE}[^\"]+)'] _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -260,8 +268,8 @@ class NBCSportsVPlayerIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', 'duration': 72.818, 'chapters': [], - 'thumbnail': r're:^https?://.*\.jpg$' - } + 'thumbnail': r're:^https?://.*\.jpg$', + }, }, { 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2', 'only_matching': True, @@ -282,7 +290,7 @@ class NBCSportsIE(InfoExtractor): _TESTS = [{ # iframe src - 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation', 'info_dict': { 'id': 'PHJSaFWbrTY9', 'ext': 'mp4', @@ -294,7 +302,7 @@ class NBCSportsIE(InfoExtractor): 'chapters': [], 'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg', 'duration': 528.395, - } + }, }, { # data-mpx-src 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', @@ -332,7 +340,7 @@ class NBCSportsStreamIE(AdobePassIE): def _real_extract(self, url): video_id = self._match_id(url) live_source = self._download_json( - 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, + f'http://stream.nbcsports.com/data/live_sources_{video_id}.json', video_id) video_source = live_source['videoSources'][0] title = video_source['title'] @@ -377,7 +385,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', + 'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610', # md5 tends to fluctuate 'info_dict': { 'id': '269389891880', 'ext': 'mp4', @@ -385,6 +393,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', 'timestamp': 1401363060, 'upload_date': '20140529', + 'duration': 46.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg', }, }, { @@ -400,7 +410,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '8eb831eca25bfa7d25ddd83e85946548', + 'md5': '40d0e48c68896359c80372306ece0fc3', 
'info_dict': { 'id': '394064451844', 'ext': 'mp4', @@ -408,11 +418,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, 'upload_date': '20150205', + 'duration': 1236.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', + 'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939', 'info_dict': { 'id': 'n431456', 'ext': 'mp4', @@ -420,11 +432,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, + 'duration': 37.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg', }, }, { 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', - 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'md5': '693d1fa21d23afcc9b04c66b227ed9ff', 'info_dict': { 'id': '669831235788', 'ext': 'mp4', @@ -432,6 +446,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'duration': 69.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg', }, }, { @@ -445,6 +461,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'duration': 940.0, }, }, { @@ -482,10 +499,8 @@ def _real_extract(self, url): m3u8_id=format_id, fatal=False)) continue tbr = int_or_none(va.get('bitrate'), 1000) - if tbr: - format_id += '-%d' % tbr formats.append({ - 'format_id': format_id, + 'format_id': join_nonempty(format_id, tbr), 'url': public_url, 'width': int_or_none(va.get('width')), 'height': int_or_none(va.get('height')), @@ -533,6 +548,7 @@ class NBCOlympicsIE(InfoExtractor): 'upload_date': '20160815', 'uploader': 'NBCU-SPORTS', }, + 'skip': '404 Not Found', } def _real_extract(self, url): @@ -551,7 +567,7 @@ def _real_extract(self, url): except RegexNotFoundError: theplatform_url = self._search_regex( r"([\"'])embedUrl\1: *([\"'])(?P<embedUrl>.+)\2", - webpage, 'embedding URL', group="embedUrl") + webpage, 'embedding URL', group='embedUrl') return { '_type': 'url_transparent', @@ -576,6 +592,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, { 'note': 'Plain m3u8 source URL', 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', @@ -587,6 +604,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, ] @@ -604,7 +622,7 @@ def _real_extract(self, url): source_url = self._download_json( f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging', - pid, 'Downloading leap config' + pid, 'Downloading leap config', )['videoSources'][0]['cdnSources']['primary'][0]['sourceUrl'] if event_config.get('cdnToken'): @@ -660,6 +678,7 @@ class NBCStationsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Large Structure Fire in Downtown LA 
Prompts Smoke Odor Advisory', 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', + 'duration': 112.513, 'timestamp': 1661135892, 'upload_date': '20220822', 'uploader': 'NBC 4', @@ -676,6 +695,7 @@ class NBCStationsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Huracán complica que televidente de Tucson reciba reembolso', 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'duration': 172.406, 'timestamp': 1660886507, 'upload_date': '20220819', 'uploader': 'Telemundo Arizona', @@ -685,6 +705,22 @@ class NBCStationsIE(InfoExtractor): 'params': { 'skip_download': 'm3u8', }, + }, { + # direct mp4 link + 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/', + 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85', + 'info_dict': { + 'id': '2961135', + 'ext': 'mp4', + 'title': 'Highs Near Freezing in Boston on Wednesday', + 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b', + 'duration': 235.669, + 'timestamp': 1675268656, + 'upload_date': '20230201', + 'uploader': '', + 'channel_id': 'WBTS', + 'channel': 'nbcboston', + }, }] _RESOLUTIONS = { @@ -711,7 +747,7 @@ def _real_extract(self, url): if not video_data: raise ExtractorError('No video metadata found in webpage', expected=True) - info, formats, subtitles = {}, [], {} + info, formats = {}, [] is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1 query = { 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3', @@ -747,13 +783,14 @@ def _real_extract(self, url): video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False) if video_url: + ext = determine_ext(video_url) height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None) formats.append({ 'url': video_url, - 'ext': 'mp4', + 'ext': ext, 'width': int_or_none(self._RESOLUTIONS.get(height)), 'height': int_or_none(height), - 'format_id': 'http-mp4', + 'format_id': f'http-{ext}', }) info.update({ @@ -770,14 +807,27 @@ def _real_extract(self, url): smil = self._download_xml( f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, note='Downloading SMIL data', query=query, fatal=is_live) - if smil: - manifest_url = xpath_attr(smil, f'.//{{{default_ns}}}video', 'src', fatal=is_live) - subtitles = self._parse_smil_subtitles(smil, default_ns) - fmts, subs = self._extract_m3u8_formats_and_subtitles( - manifest_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live, - live=is_live, errnote='No HLS formats found') - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) + if not isinstance(smil, xml.etree.ElementTree.Element): + smil = None + subtitles = self._parse_smil_subtitles(smil, default_ns) if smil is not None else {} + for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil is not None else []: + info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000) + video_src_url = video.get('src') + ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url)) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live, + live=is_live, errnote='No HLS formats found') + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif video_src_url: + formats.append({ + 'url': video_src_url, + 'format_id': f'https-{ext}', + 'ext': ext, + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + }) if not formats: self.raise_no_formats('No video content found in webpage', 
expected=True) diff --git a/yt_dlp/extractor/ndr.py b/yt_dlp/extractor/ndr.py index 41ea3629a9..5181c7f20c 100644 --- a/yt_dlp/extractor/ndr.py +++ b/yt_dlp/extractor/ndr.py @@ -1,10 +1,10 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, merge_dicts, parse_iso8601, @@ -125,13 +125,13 @@ def _extract_embed(self, webpage, display_id, url): # some more work needed if we only found sophoraID if re.match(r'^[a-z]+\d+$', embed_url): # get the initial part of the url path,. eg /panorama/archiv/2022/ - parsed_url = compat_urllib_parse_urlparse(url) - path = self._search_regex(r'(.+/)%s' % display_id, parsed_url.path or '', 'embed URL', default='') + parsed_url = urllib.parse.urlparse(url) + path = self._search_regex(rf'(.+/){display_id}', parsed_url.path or '', 'embed URL', default='') # find tell-tale image with the actual ID - ndr_id = self._search_regex(r'%s([a-z]+\d+)(?!\.)\b' % (path, ), webpage, 'embed URL', default=None) + ndr_id = self._search_regex(rf'{path}([a-z]+\d+)(?!\.)\b', webpage, 'embed URL', default=None) # or try to use special knowledge! NDR_INFO_URL_TPL = 'https://www.ndr.de/info/%s-player.html' - embed_url = 'ndr:%s' % (ndr_id, ) if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) + embed_url = f'ndr:{ndr_id}' if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) if not embed_url: raise ExtractorError('Unable to extract embedUrl') @@ -141,7 +141,7 @@ def _extract_embed(self, webpage, display_id, url): timestamp = parse_iso8601( self._search_regex( (r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="(?P<cont>[^"]+)"', - r'\bvar\s*pdt\s*=\s*(?P<q>["\'])(?P<cont>(?:(?!(?P=q)).)+)(?P=q)', ), + r'\bvar\s*pdt\s*=\s*(?P<q>["\'])(?P<cont>(?:(?!(?P=q)).)+)(?P=q)'), webpage, 'upload date', group='cont', default=None)) info = self._search_json_ld(webpage, display_id, default={}) return merge_dicts({ @@ -200,7 +200,7 @@ def _extract_embed(self, webpage, display_id, url=None): # find tell-tale URL with the actual ID, or ... 
video_id = self._search_regex( (r'''\bsrc\s*=\s*["']?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', - r'<iframe[^>]+id="pp_([\da-z]+)"', ), + r'<iframe[^>]+id="pp_([\da-z]+)"'), webpage, 'NDR id', default=None) description = ( @@ -211,7 +211,7 @@ def _extract_embed(self, webpage, display_id, url=None): return { '_type': 'url_transparent', 'ie_key': 'NDREmbedBase', - 'url': 'ndr:%s' % video_id, + 'url': f'ndr:{video_id}', 'display_id': display_id, 'description': description, 'title': display_id.replace('-', ' ').strip(), @@ -234,7 +234,7 @@ def _real_extract(self, url): video_id = mobj.group('id') or mobj.group('id_s') ppjson = self._download_json( - 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id) + f'http://www.ndr.de/{video_id}-ppjson.json', video_id) playlist = ppjson['playlist'] diff --git a/yt_dlp/extractor/ndtv.py b/yt_dlp/extractor/ndtv.py index bfe52f77de..c328bd4d23 100644 --- a/yt_dlp/extractor/ndtv.py +++ b/yt_dlp/extractor/ndtv.py @@ -5,6 +5,7 @@ class NDTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:[^/]+\.)?ndtv\.com/(?:[^/]+/)*videos?/?(?:[^/]+/)*[^/?^&]+-(?P<id>\d+)' _TESTS = [ @@ -14,12 +15,12 @@ class NDTVIE(InfoExtractor): 'info_dict': { 'id': '468818', 'ext': 'mp4', - 'title': "प्राइम टाइम: सिस्टम बीमार, स्कूल बदहाल", + 'title': 'प्राइम टाइम: सिस्टम बीमार, स्कूल बदहाल', 'description': 'md5:f410512f1b49672e5695dea16ef2731d', 'upload_date': '20170928', 'duration': 2218, 'thumbnail': r're:https?://.*\.jpg', - } + }, }, { # __filename is url @@ -28,45 +29,45 @@ class NDTVIE(InfoExtractor): 'info_dict': { 'id': '470304', 'ext': 'mp4', - 'title': "Cracker-Free Diwali Wishes From Karan Johar, Kriti Sanon & Other Stars", + 'title': 'Cracker-Free Diwali Wishes From Karan Johar, Kriti Sanon & Other Stars', 'description': 'md5:f115bba1adf2f6433fa7c1ade5feb465', 'upload_date': '20171019', 'duration': 137, 'thumbnail': r're:https?://.*\.jpg', - } + }, }, { 'url': 'https://www.ndtv.com/video/news/news/delhi-s-air-quality-status-report-after-diwali-is-very-poor-470372', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://auto.ndtv.com/videos/the-cnb-daily-october-13-2017-469935', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://sports.ndtv.com/cricket/videos/2nd-t20i-rock-thrown-at-australia-cricket-team-bus-after-win-over-india-469764', - 'only_matching': True + 'only_matching': True, }, { 'url': 'http://gadgets.ndtv.com/videos/uncharted-the-lost-legacy-review-465568', - 'only_matching': True + 'only_matching': True, }, { 'url': 'http://profit.ndtv.com/videos/news/video-indian-economy-on-very-solid-track-international-monetary-fund-chief-470040', - 'only_matching': True + 'only_matching': True, }, { 'url': 'http://food.ndtv.com/video-basil-seeds-coconut-porridge-419083', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://doctor.ndtv.com/videos/top-health-stories-of-the-week-467396', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://swirlster.ndtv.com/video/how-to-make-friends-at-work-469324', - 'only_matching': True - } + 'only_matching': True, + }, ] def _real_extract(self, url): diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 81e2f56e62..cb8f6a67d4 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,225 +1,372 @@ import itertools import json -import urllib.error +from .art19 import Art19IE from .common import InfoExtractor -from ..utils import ExtractorError, parse_iso8601 +from ..networking.exceptions import HTTPError +from ..utils 
import ( + ExtractorError, + int_or_none, + make_archive_id, + parse_iso8601, + smuggle_url, + try_call, + unsmuggle_url, + update_url_query, + url_or_none, + urljoin, +) +from ..utils.traversal import traverse_obj -_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' +_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' class NebulaBaseIE(InfoExtractor): _NETRC_MACHINE = 'watchnebula' + _token = _api_token = None - _nebula_api_token = None - _nebula_bearer_token = None - - def _perform_nebula_auth(self, username, password): - if not username or not password: - self.raise_login_required(method='password') - - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' - }, - note='Logging in to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required(method='password') - - return response['key'] - - def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): - assert method in ('GET', 'POST',) - assert auth_type in ('api', 'bearer',) - - def inner_call(): - authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' - return self._download_json( - url, video_id, note=note, headers={'Authorization': authorization}, - data=b'' if method == 'POST' else None) - + def _perform_login(self, username, password): try: - return inner_call() - except ExtractorError as exc: - # if 401 or 403, attempt credential re-auth and retry - if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): - self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') - self._perform_login() - return inner_call() - else: + response = self._download_json( + 'https://nebula.tv/auth/login/', None, + 'Logging in to Nebula', 'Login failed', + data=json.dumps({'email': username, 'password': password}).encode(), + headers={'content-type': 'application/json'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError('Login failed: Invalid username or password', expected=True) + raise + self._api_token = traverse_obj(response, ('key', {str})) + if not self._api_token: + raise ExtractorError('Login failed: No token') + + def _call_api(self, *args, **kwargs): + if self._token: + kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}' + try: + return self._download_json(*args, **kwargs) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403): + raise + self.to_screen( + f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}') + self._real_initialize() + if self._token: + kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}' + return self._download_json(*args, **kwargs) + + def _real_initialize(self): + if not self._api_token: + self._api_token = try_call( + lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value) + self._token = self._download_json( + 
'https://users.api.nebula.app/api/v1/authorization/', None, + headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None, + note='Authorizing to Nebula', data=b'')['token'] + + def _extract_formats(self, content_id, slug): + for retry in (False, True): + try: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8', + slug, 'mp4', query={ + 'token': self._token, + 'app_version': '23.10.0', + 'platform': 'ios', + }) + return {'formats': fmts, 'subtitles': subs} + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.raise_login_required() + if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error') + self._real_initialize() + continue raise - def _fetch_nebula_bearer_token(self): - """ - Get a Bearer token for the Nebula API. This will be required to fetch video meta data. - """ - response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', - method='POST', - note='Authorizing to Nebula') - return response['token'] - - def _fetch_video_formats(self, slug): - stream_info = self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/stream/', - video_id=slug, - auth_type='bearer', - note='Fetching video stream info') - manifest_url = stream_info['manifest'] - return self._extract_m3u8_formats_and_subtitles(manifest_url, slug) - - def _build_video_info(self, episode): - fmts, subs = self._fetch_video_formats(episode['slug']) - channel_slug = episode['channel_slug'] - channel_title = episode['channel_title'] + def _extract_video_metadata(self, episode): + channel_url = traverse_obj( + episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False) return { - 'id': episode['zype_id'], - 'display_id': episode['slug'], - 'formats': fmts, - 'subtitles': subs, - 'webpage_url': f'https://nebula.tv/{episode["slug"]}', - 'title': episode['title'], - 'description': episode['description'], - 'timestamp': parse_iso8601(episode['published_at']), - 'thumbnails': [{ - # 'id': tn.get('name'), # this appears to be null - 'url': tn['original'], - 'height': key, - } for key, tn in episode['assets']['thumbnail'].items()], - 'duration': episode['duration'], - 'channel': channel_title, - 'channel_id': channel_slug, - 'channel_url': f'https://nebula.tv/{channel_slug}', - 'uploader': channel_title, - 'uploader_id': channel_slug, - 'uploader_url': f'https://nebula.tv/{channel_slug}', - 'series': channel_title, - 'creator': channel_title, + 'id': episode['id'].partition(':')[2], + **traverse_obj(episode, { + 'display_id': 'slug', + 'title': 'title', + 'description': 'description', + 'timestamp': ('published_at', {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'channel_id': 'channel_slug', + 'uploader_id': 'channel_slug', + 'channel': 'channel_title', + 'uploader': 'channel_title', + 'series': 'channel_title', + 'creator': 'channel_title', + 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}), + 'episode_number': ('order', {int_or_none}), + # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE + '_old_archive_ids': ('zype_id', {lambda x: [ + make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}), + }), + 'channel_url': channel_url, + 'uploader_url': channel_url, } - def _perform_login(self, 
username=None, password=None): - self._nebula_api_token = self._perform_nebula_auth(username, password) - self._nebula_bearer_token = self._fetch_nebula_bearer_token() - class NebulaIE(NebulaBaseIE): - _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' - _TESTS = [ - { - 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': '14944cfee8c7beeea106320c47560efc', - 'info_dict': { - 'id': '5c271b40b13fd613090034fd', - 'ext': 'mp4', - 'title': 'That Time Disney Remade Beauty and the Beast', - 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', - 'upload_date': '20180731', - 'timestamp': 1533009600, - 'channel': 'Lindsay Ellis', - 'channel_id': 'lindsayellis', - 'uploader': 'Lindsay Ellis', - 'uploader_id': 'lindsayellis', - 'timestamp': 1533009600, - 'uploader_url': 'https://nebula.tv/lindsayellis', - 'series': 'Lindsay Ellis', - 'display_id': 'that-time-disney-remade-beauty-and-the-beast', - 'channel_url': 'https://nebula.tv/lindsayellis', - 'creator': 'Lindsay Ellis', - 'duration': 2212, - 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - }, + IE_NAME = 'nebula:video' + _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', + 'info_dict': { + 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', + 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', + 'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis', + 'series': 'Lindsay Ellis', + 'display_id': 'that-time-disney-remade-beauty-and-the-beast', + 'channel_url': r're:https://nebula\.(tv|app)/lindsayellis', + 'creator': 'Lindsay Ellis', + 'duration': 2212, + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'], }, - { - 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': 'd05739cf6c38c09322422f696b569c23', - 'info_dict': { - 'id': '5e7e78171aaf320001fbd6be', - 'ext': 'mp4', - 'title': 'Landing Craft - How The Allies Got Ashore', - 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', - 'upload_date': '20200327', - 'timestamp': 1585348140, - 'channel': 'Real Engineering', - 'channel_id': 'realengineering', - 'uploader': 'Real Engineering', - 'uploader_id': 'realengineering', - 'series': 'Real Engineering', - 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'creator': 'Real Engineering', - 'duration': 841, - 'channel_url': 'https://nebula.tv/realengineering', - 'uploader_url': 'https://nebula.tv/realengineering', - 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': 'd05739cf6c38c09322422f696b569c23', + 'info_dict': { + 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore 
the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'Real Engineering — The Logistics of D-Day', + 'channel_id': 'd-day', + 'uploader': 'Real Engineering — The Logistics of D-Day', + 'uploader_id': 'd-day', + 'series': 'Real Engineering — The Logistics of D-Day', + 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'creator': 'Real Engineering — The Logistics of D-Day', + 'duration': 841, + 'channel_url': 'https://nebula.tv/d-day', + 'uploader_url': 'https://nebula.tv/d-day', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + '_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'], }, - { - 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', - 'md5': 'ebe28a7ad822b9ee172387d860487868', - 'info_dict': { - 'id': '5e779ebdd157bc0001d1c75a', - 'ext': 'mp4', - 'title': 'Episode 1: The Draw', - 'description': r'contains:There’s free money on offer… if the players can all work together.', - 'upload_date': '20200323', - 'timestamp': 1584980400, - 'channel': 'Tom Scott Presents: Money', - 'channel_id': 'tom-scott-presents-money', - 'uploader': 'Tom Scott Presents: Money', - 'uploader_id': 'tom-scott-presents-money', - 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', - 'duration': 825, - 'channel_url': 'https://nebula.tv/tom-scott-presents-money', - 'series': 'Tom Scott Presents: Money', - 'display_id': 'money-episode-1-the-draw', - 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - 'creator': 'Tom Scott Presents: Money', - }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', + 'md5': 'ebe28a7ad822b9ee172387d860487868', + 'info_dict': { + 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r'contains:There’s free money on offer… if the players can all work together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', + 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', + 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', + 'duration': 825, + 'channel_url': 'https://nebula.tv/tom-scott-presents-money', + 'series': 'Tom Scott Presents: Money', + 'display_id': 'money-episode-1-the-draw', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + 'creator': 'Tom Scott Presents: Money', + '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'], }, - { - 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'only_matching': True, + }, { + 'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines', + 'info_dict': { + 'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d', + 'ext': 'mp4', + 'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines', + 'title': 'Did the US Really Blow Up the NordStream Pipelines?', + 'description': 'md5:b4e2a14e3ff08f546a3209c75261e789', + 'upload_date': '20230223', + 'timestamp': 1677144070, + 'channel': 'TLDR News EU', + 'channel_id': 'tldrnewseu', + 'uploader': 'TLDR News EU', + 'uploader_id': 'tldrnewseu', + 'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu', + 'duration': 524, + 
'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu', + 'series': 'TLDR News EU', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+', + 'creator': 'TLDR News EU', + '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'], }, - ] - - def _fetch_video_metadata(self, slug): - return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', - video_id=slug, - auth_type='bearer', - note='Fetching video meta data') + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw', + 'only_matching': True, + }] def _real_extract(self, url): slug = self._match_id(url) - video = self._fetch_video_metadata(slug) - return self._build_video_info(video) + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('id'): + return { + 'id': smuggled_data['id'], + 'display_id': slug, + 'title': '', + **self._extract_formats(smuggled_data['id'], slug), + } + + metadata = self._call_api( + f'https://content.api.nebula.app/content/videos/{slug}', + slug, note='Fetching video metadata') + return { + **self._extract_video_metadata(metadata), + **self._extract_formats(metadata['id'], slug), + } + + +class NebulaClassIE(NebulaBaseIE): + IE_NAME = 'nebula:media' + _VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P<id>[\w-]+)/(?P<ep>[\w-]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://nebula.tv/copyright-for-fun-and-profit/14', + 'info_dict': { + 'id': 'd7432cdc-c608-474d-942c-f74345daed7b', + 'ext': 'mp4', + 'display_id': '14', + 'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'episode_number': 14, + 'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9', + 'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'duration': 646, + 'episode': 'Episode 14', + 'title': 'Photos, Sculpture, and Video', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town', + 'info_dict': { + 'ext': 'mp3', + 'id': '018f65f0-0033-4021-8f87-2d132beb19aa', + 'description': 'md5:05d2b23ab780c955e2511a2b9127acff', + 'series_id': '335e8159-d663-491a-888f-1732285706ac', + 'modified_timestamp': 1599091504, + 'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa', + 'series': 'Extremities', + 'modified_date': '20200903', + 'upload_date': '20200902', + 'title': 'Pyramiden: The High-Arctic Soviet Ghost Town', + 'release_timestamp': 1571237958, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'duration': 1546.05714, + 'timestamp': 1599085608, + 'release_date': '20191016', + }, + }, { + 'url': 'https://nebula.tv/thelayover/the-layover-episode-1', + 'info_dict': { + 'ext': 'mp3', + 'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0', + 'episode_number': 1, + 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$', + 'release_date': '20230304', + 'modified_date': '20230403', + 'series': 'The Layover', + 'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0', + 'modified_timestamp': 1680554566, + 'duration': 3130.46401, + 'release_timestamp': 1677943800, + 'title': 'The Layover — Episode 1', + 'series_id': '874303a5-4900-4626-a4b6-2aacac34466a', + 'upload_date': '20230303', + 'episode': 'Episode 1', + 'timestamp': 1677883672, + 'description': 'md5:002cca89258e3bc7c268d5b8c24ba482', + }, + }] + + def _real_extract(self, url): + slug, episode = self._match_valid_url(url).group('id', 'ep') + url, smuggled_data = 
unsmuggle_url(url, {}) + if smuggled_data.get('id'): + return { + 'id': smuggled_data['id'], + 'display_id': slug, + 'title': '', + **self._extract_formats(smuggled_data['id'], slug), + } + + metadata = self._call_api( + f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons', + slug, note='Fetching class/podcast metadata') + content_type = metadata.get('type') + if content_type == 'lesson': + return { + **self._extract_video_metadata(metadata), + **self._extract_formats(metadata['id'], slug), + } + elif content_type == 'podcast_episode': + episode_url = metadata['episode_url'] + if not episode_url and metadata.get('premium'): + self.raise_login_required() + + if Art19IE.suitable(episode_url): + return self.url_result(episode_url, Art19IE) + return traverse_obj(metadata, { + 'id': ('id', {str}), + 'url': ('episode_url', {url_or_none}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('published_at', {parse_iso8601}), + 'duration': ('duration', {int_or_none}), + 'channel_id': ('channel_id', {str}), + 'chnanel': ('channel_title', {str}), + 'thumbnail': ('assets', 'regular', {url_or_none}), + }) + + raise ExtractorError(f'Unexpected content type {content_type!r}') class NebulaSubscriptionsIE(NebulaBaseIE): IE_NAME = 'nebula:subscriptions' - _VALID_URL = rf'{_BASE_URL_RE}/myshows' - _TESTS = [ - { - 'url': 'https://nebula.tv/myshows', - 'playlist_mincount': 1, - 'info_dict': { - 'id': 'myshows', - }, + _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://nebula.tv/myshows', + 'playlist_mincount': 1, + 'info_dict': { + 'id': 'myshows', }, - ] + }] def _generate_playlist_entries(self): - next_url = 'https://content.watchnebula.com/library/video/?page_size=100' - page_num = 1 - while next_url: - channel = self._call_nebula_api(next_url, 'myshows', auth_type='bearer', - note=f'Retrieving subscriptions page {page_num}') + next_url = update_url_query('https://content.api.nebula.app/video_episodes/', { + 'following': 'true', + 'include': 'engagement', + 'ordering': '-published_at', + }) + for page_num in itertools.count(1): + channel = self._call_api( + next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}') for episode in channel['results']: - yield self._build_video_info(episode) - next_url = channel['next'] - page_num += 1 + metadata = self._extract_video_metadata(episode) + yield self.url_result(smuggle_url( + f'https://nebula.tv/videos/{metadata["display_id"]}', + {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata) + next_url = channel.get('next') + if not next_url: + return def _real_extract(self, url): return self.playlist_result(self._generate_playlist_entries(), 'myshows') @@ -227,48 +374,95 @@ def _real_extract(self, url): class NebulaChannelIE(NebulaBaseIE): IE_NAME = 'nebula:channel' - _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' - _TESTS = [ - { - 'url': 'https://nebula.tv/tom-scott-presents-money', - 'info_dict': { - 'id': 'tom-scott-presents-money', - 'title': 'Tom Scott Presents: Money', - 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', - }, - 'playlist_count': 5, - }, { - 'url': 'https://nebula.tv/lindsayellis', - 'info_dict': { - 'id': 'lindsayellis', - 'title': 'Lindsay Ellis', - 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', - }, - 'playlist_mincount': 2, + _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P<id>[\w-]+)/?(?:$|[?#])' 
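Aside: the smuggled-'id' fast path above pairs with the playlist generators below, which call smuggle_url() to pass the content id along so the single-video extractors can skip a second metadata request. A minimal sketch of that round-trip, using the public yt_dlp.utils helpers (the slug and id here are made-up):

    from yt_dlp.utils import smuggle_url, unsmuggle_url

    # smuggle_url() stores the payload in the URL fragment as
    # '#__youtubedl_smuggle=<urlencoded JSON>'
    smuggled = smuggle_url('https://nebula.tv/videos/some-slug', {'id': 'video_episode:1234'})
    url, data = unsmuggle_url(smuggled, {})
    assert url == 'https://nebula.tv/videos/some-slug'
    assert data == {'id': 'video_episode:1234'}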
+ _TESTS = [{ + 'url': 'https://nebula.tv/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', }, - ] + 'playlist_count': 5, + }, { + 'url': 'https://nebula.tv/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://nebula.tv/johnnyharris', + 'info_dict': { + 'id': 'johnnyharris', + 'title': 'Johnny Harris', + 'description': 'I make videos about maps and many other things.', + }, + 'playlist_mincount': 90, + }, { + 'url': 'https://nebula.tv/copyright-for-fun-and-profit', + 'info_dict': { + 'id': 'copyright-for-fun-and-profit', + 'title': 'Copyright for Fun and Profit', + 'description': 'md5:6690248223eed044a9f11cd5a24f9742', + }, + 'playlist_count': 23, + }, { + 'url': 'https://nebula.tv/trussissuespodcast', + 'info_dict': { + 'id': 'trussissuespodcast', + 'title': 'The TLDR News Podcast', + 'description': 'md5:a08c4483bc0b705881d3e0199e721385', + }, + 'playlist_mincount': 80, + }] - def _generate_playlist_entries(self, collection_id, channel): - episodes = channel['episodes']['results'] - for page_num in itertools.count(2): - for episode in episodes: - yield self._build_video_info(episode) - next_url = channel['episodes']['next'] + def _generate_playlist_entries(self, collection_id, collection_slug): + next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at' + for page_num in itertools.count(1): + episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}') + for episode in episodes['results']: + metadata = self._extract_video_metadata(episode) + yield self.url_result(smuggle_url( + episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}', + {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata) + next_url = episodes.get('next') + if not next_url: + break + + def _generate_class_entries(self, channel): + for lesson in channel['lessons']: + metadata = self._extract_video_metadata(lesson) + yield self.url_result(smuggle_url( + lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}', + {'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata) + + def _generate_podcast_entries(self, collection_id, collection_slug): + next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true' + for page_num in itertools.count(1): + episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}') + + for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))): + yield self.url_result(episode['share_url'], NebulaClassIE) + next_url = episodes.get('next') if not next_url: break - channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', - note=f'Retrieving channel page {page_num}') - episodes = channel['episodes']['results'] def _real_extract(self, url): - collection_id = self._match_id(url) - channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' - channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') - channel_details = channel['details'] + collection_slug = self._match_id(url) + 
channel = self._call_api( + f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons', + collection_slug, note='Retrieving channel') + + if channel.get('type') == 'class': + entries = self._generate_class_entries(channel) + elif channel.get('type') == 'podcast_channel': + entries = self._generate_podcast_entries(channel['id'], collection_slug) + else: + entries = self._generate_playlist_entries(channel['id'], collection_slug) return self.playlist_result( - entries=self._generate_playlist_entries(collection_id, channel), - playlist_id=collection_id, - playlist_title=channel_details['title'], - playlist_description=channel_details['description'] - ) + entries=entries, + playlist_id=collection_slug, + playlist_title=channel.get('title'), + playlist_description=channel.get('description')) diff --git a/yt_dlp/extractor/nekohacker.py b/yt_dlp/extractor/nekohacker.py new file mode 100644 index 0000000000..537158e87b --- /dev/null +++ b/yt_dlp/extractor/nekohacker.py @@ -0,0 +1,213 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + extract_attributes, + get_element_by_class, + get_element_text_and_html_by_tag, + parse_duration, + traverse_obj, + try_call, + url_or_none, +) + + +class NekoHackerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nekohacker\.com/(?P<id>(?!free-dl)[\w-]+)' + _TESTS = [{ + 'url': 'https://nekohacker.com/nekoverse/', + 'info_dict': { + 'id': 'nekoverse', + 'title': 'Nekoverse', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/01-Spaceship.mp3', + 'md5': '44223701ebedba0467ebda4cc07fb3aa', + 'info_dict': { + 'id': '1712', + 'ext': 'mp3', + 'title': 'Spaceship', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Spaceship', + 'track_number': 1, + 'duration': 195.0, + }, + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/02-City-Runner.mp3', + 'md5': '8f853c71719389d32bbbd3f1a87b3f08', + 'info_dict': { + 'id': '1713', + 'ext': 'mp3', + 'title': 'City Runner', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'City Runner', + 'track_number': 2, + 'duration': 148.0, + }, + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/03-Nature-Talk.mp3', + 'md5': '5a8a8ae852720cee4c0ac95c7d1a7450', + 'info_dict': { + 'id': '1714', + 'ext': 'mp3', + 'title': 'Nature Talk', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Nature Talk', + 'track_number': 3, + 'duration': 174.0, + }, + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/04-Crystal-World.mp3', + 'md5': 'd8e59a48061764e50d92386a294abd50', + 'info_dict': { + 'id': '1715', + 'ext': 'mp3', + 'title': 'Crystal World', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Crystal World', + 'track_number': 4, + 'duration': 199.0, + }, + }, + ], + }, { + 
'url': 'https://nekohacker.com/susume/', + 'info_dict': { + 'id': 'susume', + 'title': '進め!むじなカンパニー', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-feat.-六科なじむ-CV_-日高里菜-割戶真友-CV_-金元寿子-軽井沢ユキ-CV_-上坂すみれ-出稼ぎガルシア-CV_-金子彩花-.mp3', + 'md5': 'fb13f008aa81f26ba48f91fd2d6186ce', + 'info_dict': { + 'id': '711', + 'ext': 'mp3', + 'title': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'track_number': 1, + }, + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-feat.-六科なじむ-CV_-日高里菜-.mp3', + 'md5': '028803f70241df512b7764e73396fdd1', + 'info_dict': { + 'id': '709', + 'ext': 'mp3', + 'title': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'track_number': 2, + }, + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-instrumental.mp3', + 'md5': 'adde9e9a16e1da5e602b579c247d0fb9', + 'info_dict': { + 'id': '710', + 'ext': 'mp3', + 'title': '進め!むじなカンパニー (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': '進め!むじなカンパニー (instrumental)', + 'track_number': 3, + }, + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-instrumental.mp3', + 'md5': 'ebb0443039cf5f9ff7fd557ed9b23599', + 'info_dict': { + 'id': '712', + 'ext': 'mp3', + 'title': 'むじな de なじむ (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ (instrumental)', + 'track_number': 4, + }, + }, + ], + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + playlist = get_element_by_class('playlist', webpage) + + if not playlist: + iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or '' + iframe_src = url_or_none(extract_attributes(iframe).get('src')) + if not iframe_src: + raise ExtractorError('No playlist or embed found in webpage') + elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src): + raise ExtractorError('Spotify embeds are not supported', expected=True) + return self.url_result(url, 'Generic') + + entries = [] + for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1): + entry = traverse_obj(extract_attributes(track), { + 'url': ('data-audiopath', {url_or_none}), + 'ext': ('data-audiopath', {determine_ext}), + 'id': 'data-trackid', + 'title': 'data-tracktitle', + 'track': 'data-tracktitle', + 'album': 'data-albumtitle', + 'duration': ('data-tracktime', {parse_duration}), + 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'thumbnail': ('data-albumart', {url_or_none}), + }) + entries.append({ + **entry, + 'track_number': track_number, + 'artist': 'Neko Hacker', + 
'vcodec': 'none', + 'acodec': 'mp3' if entry['ext'] == 'mp3' else None, + }) + + return self.playlist_result(entries, playlist_id, traverse_obj(entries, (0, 'album'))) diff --git a/yt_dlp/extractor/nerdcubed.py b/yt_dlp/extractor/nerdcubed.py index 7c801b5d38..5f5607a20b 100644 --- a/yt_dlp/extractor/nerdcubed.py +++ b/yt_dlp/extractor/nerdcubed.py @@ -1,33 +1,38 @@ -import datetime - from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj class NerdCubedFeedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json' + _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])' _TEST = { - 'url': 'http://www.nerdcubed.co.uk/feed.json', + 'url': 'http://www.nerdcubed.co.uk/', 'info_dict': { 'id': 'nerdcubed-feed', 'title': 'nerdcubed.co.uk feed', }, - 'playlist_mincount': 1300, + 'playlist_mincount': 5500, } + def _extract_video(self, feed_entry): + return self.url_result( + f'https://www.youtube.com/watch?v={feed_entry["id"]}', YoutubeIE, + **traverse_obj(feed_entry, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('publishedAt', {parse_iso8601}), + 'channel': ('source', 'name', {str}), + 'channel_id': ('source', 'id', {str}), + 'channel_url': ('source', 'url', {str}), + 'thumbnail': ('thumbnail', 'source', {url_or_none}), + }), url_transparent=True) + def _real_extract(self, url): - feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed') + video_id = 'nerdcubed-feed' + feed = self._download_json('https://www.nerdcubed.co.uk/_/cdn/videos.json', video_id) - entries = [{ - '_type': 'url', - 'title': feed_entry['title'], - 'uploader': feed_entry['source']['name'] if feed_entry['source'] else None, - 'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'), - 'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'], - } for feed_entry in feed] - - return { - '_type': 'playlist', - 'title': 'nerdcubed.co.uk feed', - 'id': 'nerdcubed-feed', - 'entries': entries, - } + return self.playlist_result( + map(self._extract_video, traverse_obj(feed, ('videos', lambda _, v: v['id']))), + video_id, 'nerdcubed.co.uk feed') diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 595709899a..a759da2147 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -1,184 +1,215 @@ +import hashlib import itertools import json +import random import re import time -from base64 import b64encode -from binascii import hexlify -from datetime import datetime -from hashlib import md5 -from random import randint from .common import InfoExtractor from ..aes import aes_ecb_encrypt, pkcs7_padding -from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, - bytes_to_intlist, - error_to_compat_str, - float_or_none, int_or_none, - intlist_to_bytes, - sanitized_Request, - try_get, + join_nonempty, + str_or_none, + strftime_or_none, + traverse_obj, + unified_strdate, + url_or_none, + urljoin, + variadic, ) class NetEaseMusicBaseIE(InfoExtractor): - _FORMATS = ['bMusic', 'mMusic', 'hMusic'] - _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' + # XXX: _extract_formats logic depends on the order of the levels in each tier + _LEVELS = ( + 'standard', # free tier; 标准; 128kbps mp3 or aac + 'higher', # free tier; 192kbps mp3 or aac + 'exhigh', # free tier; 极高 (HQ); 320kbps mp3 or aac + 'lossless', # VIP 
tier; 无损 (SQ); 48kHz/16bit flac + 'hires', # VIP tier; 高解析度无损 (Hi-Res); 192kHz/24bit flac + 'jyeffect', # VIP tier; 高清臻音 (Spatial Audio); 96kHz/24bit flac + 'jymaster', # SVIP tier; 超清母带 (Master); 192kHz/24bit flac + 'sky', # SVIP tier; 沉浸环绕声 (Surround Audio); flac + ) _API_BASE = 'http://music.163.com/api/' + _GEO_BYPASS = False - @classmethod - def _encrypt(cls, dfsid): - salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) - string_bytes = bytearray(str(dfsid).encode('ascii')) - salt_len = len(salt_bytes) - for i in range(len(string_bytes)): - string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] - m = md5() - m.update(bytes(string_bytes)) - result = b64encode(m.digest()).decode('ascii') - return result.replace('/', '_').replace('+', '-') + @staticmethod + def _kilo_or_none(value): + return int_or_none(value, scale=1000) - def make_player_api_request_data_and_headers(self, song_id, bitrate): - KEY = b'e82ckenh8dichen8' - URL = '/api/song/enhance/player/url' - now = int(time.time() * 1000) - rand = randint(0, 1000) - cookie = { - 'osver': None, - 'deviceId': None, + def _create_eapi_cipher(self, api_path, query_body, cookies): + request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':')) + + message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1') + msg_digest = hashlib.md5(message).hexdigest() + + data = pkcs7_padding(list(str.encode( + f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}'))) + encrypted = bytes(aes_ecb_encrypt(data, list(b'e82ckenh8dichen8'))) + return f'params={encrypted.hex().upper()}'.encode() + + def _download_eapi_json(self, path, video_id, query_body, headers={}, **kwargs): + cookies = { + 'osver': 'undefined', + 'deviceId': 'undefined', 'appver': '8.0.0', 'versioncode': '140', - 'mobilename': None, + 'mobilename': 'undefined', 'buildver': '1623435496', 'resolution': '1920x1080', '__csrf': '', 'os': 'pc', - 'channel': None, - 'requestId': '{0}_{1:04}'.format(now, rand), + 'channel': 'undefined', + 'requestId': f'{int(time.time() * 1000)}_{random.randint(0, 1000):04}', + **traverse_obj(self._get_cookies(self._API_BASE), { + 'MUSIC_U': ('MUSIC_U', {lambda i: i.value}), + }), } - request_text = json.dumps( - {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, - separators=(',', ':')) - message = 'nobody{0}use{1}md5forencrypt'.format( - URL, request_text).encode('latin1') - msg_digest = md5(message).hexdigest() + return self._download_json( + urljoin('https://interface3.music.163.com/', f'/eapi{path}'), video_id, + data=self._create_eapi_cipher(f'/api{path}', query_body, cookies), headers={ + 'Referer': 'https://music.163.com', + 'Cookie': '; '.join([f'{k}={v}' for k, v in cookies.items()]), + **headers, + }, **kwargs) - data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( - URL, request_text, msg_digest) - data = pkcs7_padding(bytes_to_intlist(data)) - encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) - encrypted_params = hexlify(encrypted).decode('ascii').upper() + def _call_player_api(self, song_id, level): + return self._download_eapi_json( + '/song/enhance/player/url/v1', song_id, + {'ids': f'[{song_id}]', 'level': level, 'encodeType': 'flac'}, + note=f'Downloading song URL info: level {level}') - cookie = '; '.join( - ['{0}={1}'.format(k, v if v is not None else 'undefined') - for [k, v] in cookie.items()]) - - headers = { - 'User-Agent': self.extractor.get_param('http_headers')['User-Agent'], - 'Content-Type': 'application/x-www-form-urlencoded', - 
'Referer': 'https://music.163.com', - 'Cookie': cookie, - } - return ('params={0}'.format(encrypted_params), headers) - - def _call_player_api(self, song_id, bitrate): - url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' - data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) - try: - msg = 'empty result' - result = self._download_json( - url, song_id, data=data.encode('ascii'), headers=headers) - if result: - return result - except ExtractorError as e: - if type(e.cause) in (ValueError, TypeError): - # JSON load failure - raise - except Exception as e: - msg = error_to_compat_str(e) - self.report_warning('%s API call (%s) failed: %s' % ( - song_id, bitrate, msg)) - return {} - - def extract_formats(self, info): - err = 0 + def _extract_formats(self, info): formats = [] song_id = info['id'] - for song_format in self._FORMATS: - details = info.get(song_format) - if not details: + for level in self._LEVELS: + song = traverse_obj( + self._call_player_api(song_id, level), ('data', lambda _, v: url_or_none(v['url']), any)) + if not song: + break # Media is not available due to removal or geo-restriction + actual_level = song.get('level') + if actual_level and actual_level != level: + if level in ('lossless', 'jymaster'): + break # We've already extracted the highest level of the user's account tier continue - - bitrate = int_or_none(details.get('bitrate')) or 999000 - data = self._call_player_api(song_id, bitrate) - for song in try_get(data, lambda x: x['data'], list) or []: - song_url = try_get(song, lambda x: x['url']) - if not song_url: - continue - if self._is_valid_url(song_url, info['id'], 'song'): - formats.append({ - 'url': song_url, - 'ext': details.get('extension'), - 'abr': float_or_none(song.get('br'), scale=1000), - 'format_id': song_format, - 'filesize': int_or_none(song.get('size')), - 'asr': int_or_none(details.get('sr')), - }) - elif err == 0: - err = try_get(song, lambda x: x['code'], int) - + formats.append({ + 'url': song['url'], + 'format_id': level, + 'vcodec': 'none', + **traverse_obj(song, { + 'ext': ('type', {str}), + 'abr': ('br', {self._kilo_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }) + if not actual_level: + break # Only 1 level is available if API does not return a value (netease:program) if not formats: - msg = 'No media links found' - if err != 0 and (err < 200 or err >= 400): - raise ExtractorError( - '%s (site code %d)' % (msg, err, ), expected=True) - else: - self.raise_geo_restricted( - msg + ': probably this video is not available from your location due to geo restriction.', - countries=['CN']) - + self.raise_geo_restricted( + 'No media links found; possibly due to geo restriction', countries=['CN']) return formats - @classmethod - def convert_milliseconds(cls, ms): - return int(round(ms / 1000.0)) + def _query_api(self, endpoint, video_id, note): + result = self._download_json( + f'{self._API_BASE}{endpoint}', video_id, note, headers={'Referer': self._API_BASE}) + code = traverse_obj(result, ('code', {int})) + message = traverse_obj(result, ('message', {str})) or '' + if code == -462: + self.raise_login_required(f'Login required to download: {message}') + elif code != 200: + raise ExtractorError(f'Failed to get meta info: {code} {message}') + return result - def query_api(self, endpoint, video_id, note): - req = sanitized_Request('%s%s' % (self._API_BASE, endpoint)) - req.add_header('Referer', self._API_BASE) - return self._download_json(req, video_id, note) + def _get_entries(self, songs_data, 
entry_keys=None, id_key='id', name_key='name'): + for song in traverse_obj(songs_data, ( + *variadic(entry_keys, (str, bytes, dict, set)), + lambda _, v: int_or_none(v[id_key]) is not None)): + song_id = str(song[id_key]) + yield self.url_result( + f'http://music.163.com/#/song?id={song_id}', NetEaseMusicIE, + song_id, traverse_obj(song, (name_key, {str}))) class NetEaseMusicIE(NetEaseMusicBaseIE): IE_NAME = 'netease:song' IE_DESC = '网易云音乐' - _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' _TESTS = [{ + 'url': 'https://music.163.com/#/song?id=550136151', + 'info_dict': { + 'id': '550136151', + 'ext': 'mp3', + 'title': 'It\'s Ok (Live)', + 'creators': 'count:10', + 'timestamp': 1522944000, + 'upload_date': '20180405', + 'description': 'md5:9fd07059c2ccee3950dc8363429a3135', + 'duration': 197, + 'thumbnail': r're:^http.*\.jpg', + 'album': '偶像练习生 表演曲目合集', + 'average_rating': int, + 'album_artists': ['偶像练习生'], + }, + }, { + 'url': 'http://music.163.com/song?id=17241424', + 'info_dict': { + 'id': '17241424', + 'ext': 'mp3', + 'title': 'Opus 28', + 'upload_date': '20080211', + 'timestamp': 1202745600, + 'duration': 263, + 'thumbnail': r're:^http.*\.jpg', + 'album': 'Piano Solos Vol. 2', + 'album_artist': 'Dustin O\'Halloran', + 'average_rating': int, + 'description': '[00:05.00]纯音乐,请欣赏\n', + 'album_artists': ['Dustin O\'Halloran'], + 'creators': ['Dustin O\'Halloran'], + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + }, + }, { + 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', + 'md5': 'b896be78d8d34bd7bb665b26710913ff', + 'info_dict': { + 'id': '95670', + 'ext': 'mp3', + 'title': '国际歌', + 'upload_date': '19911130', + 'timestamp': 691516800, + 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 268, + 'alt_title': '伴唱:现代人乐队 合唱:总政歌舞团', + 'thumbnail': r're:^http.*\.jpg', + 'average_rating': int, + 'album': '红色摇滚', + 'album_artist': '侯牧人', + 'creators': ['马备'], + 'album_artists': ['侯牧人'], + }, + }, { 'url': 'http://music.163.com/#/song?id=32102397', 'md5': '3e909614ce09b1ccef4a3eb205441190', 'info_dict': { 'id': '32102397', 'ext': 'mp3', 'title': 'Bad Blood', - 'creator': 'Taylor Swift / Kendrick Lamar', + 'creators': ['Taylor Swift', 'Kendrick Lamar'], 'upload_date': '20150516', 'timestamp': 1431792000, - 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', - }, - }, { - 'note': 'No lyrics.', - 'url': 'http://music.163.com/song?id=17241424', - 'info_dict': { - 'id': '17241424', - 'ext': 'mp3', - 'title': 'Opus 28', - 'creator': 'Dustin O\'Halloran', - 'upload_date': '20080211', - 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', - 'timestamp': 1202745600, + 'description': 'md5:21535156efb73d6d1c355f95616e285a', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 199, + 'thumbnail': r're:^http.*\.jpg', + 'album': 'Bad Blood', + 'average_rating': int, + 'album_artist': 'Taylor Swift', }, + 'skip': 'Blocked outside Mainland China', }, { 'note': 'Has translated name.', 'url': 'http://music.163.com/#/song?id=22735043', @@ -186,123 +217,136 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'id': '22735043', 'ext': 'mp3', 'title': '소원을 말해봐 (Genie)', - 'creator': '少女时代', - 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', + 'creators': ['少女时代'], 'upload_date': '20100127', 'timestamp': 1264608000, + 'description': 
'md5:03d1ffebec3139aa4bafe302369269c5', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 229, 'alt_title': '说出愿望吧(Genie)', + 'thumbnail': r're:^http.*\.jpg', + 'average_rating': int, + 'album': 'Oh!', + 'album_artist': '少女时代', }, - }, { - 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', - 'md5': '95826c73ea50b1c288b22180ec9e754d', - 'info_dict': { - 'id': '95670', - 'ext': 'mp3', - 'title': '国际歌', - 'creator': '马备', - 'upload_date': '19911130', - 'timestamp': 691516800, - 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', - }, + 'skip': 'Blocked outside Mainland China', }] def _process_lyrics(self, lyrics_info): - original = lyrics_info.get('lrc', {}).get('lyric') - translated = lyrics_info.get('tlyric', {}).get('lyric') + original = traverse_obj(lyrics_info, ('lrc', 'lyric', {str})) + translated = traverse_obj(lyrics_info, ('tlyric', 'lyric', {str})) + + if not original or original == '[99:00.00]纯音乐,请欣赏\n': + return None if not translated: - return original + return { + 'lyrics': [{'data': original, 'ext': 'lrc'}], + } lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = dict( - (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) - ) - lyrics = '\n'.join([ - '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) - for time_stamp, text in original_ts_texts - ]) - return lyrics + translation_ts_dict = dict(re.findall(lyrics_expr, translated)) + + merged = '\n'.join( + join_nonempty(f'{timestamp}{text}', translation_ts_dict.get(timestamp, ''), delim=' / ') + for timestamp, text in original_ts_texts) + + return { + 'lyrics_merged': [{'data': merged, 'ext': 'lrc'}], + 'lyrics': [{'data': original, 'ext': 'lrc'}], + 'lyrics_translated': [{'data': translated, 'ext': 'lrc'}], + } def _real_extract(self, url): song_id = self._match_id(url) - params = { - 'id': song_id, - 'ids': '[%s]' % song_id - } - info = self.query_api( - 'song/detail?' 
+ compat_urllib_parse_urlencode(params), - song_id, 'Downloading song info')['songs'][0] + info = self._query_api( + f'song/detail?id={song_id}&ids=%5B{song_id}%5D', song_id, 'Downloading song info')['songs'][0] - formats = self.extract_formats(info) + formats = self._extract_formats(info) - lyrics_info = self.query_api( - 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, - song_id, 'Downloading lyrics data') - lyrics = self._process_lyrics(lyrics_info) - - alt_title = None - if info.get('transNames'): - alt_title = '/'.join(info.get('transNames')) + lyrics = self._process_lyrics(self._query_api( + f'song/lyric?id={song_id}&lv=-1&tv=-1', song_id, 'Downloading lyrics data')) + lyric_data = { + 'description': traverse_obj(lyrics, (('lyrics_merged', 'lyrics'), 0, 'data'), get_all=False), + 'subtitles': lyrics, + } if lyrics else {} return { 'id': song_id, - 'title': info['name'], - 'alt_title': alt_title, - 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), - 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), - 'thumbnail': info.get('album', {}).get('picUrl'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), - 'description': lyrics, 'formats': formats, + 'alt_title': '/'.join(traverse_obj(info, (('transNames', 'alias'), ...))) or None, + 'creators': traverse_obj(info, ('artists', ..., 'name')) or None, + 'album_artists': traverse_obj(info, ('album', 'artists', ..., 'name')) or None, + **lyric_data, + **traverse_obj(info, { + 'title': ('name', {str}), + 'timestamp': ('album', 'publishTime', {self._kilo_or_none}), + 'thumbnail': ('album', 'picUrl', {url_or_none}), + 'duration': ('duration', {self._kilo_or_none}), + 'album': ('album', 'name', {str}), + 'average_rating': ('score', {int_or_none}), + }), } class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): IE_NAME = 'netease:album' IE_DESC = '网易云音乐 - 专辑' - _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://music\.163\.com/(?:#/)?album\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://music.163.com/#/album?id=133153666', + 'info_dict': { + 'id': '133153666', + 'title': '桃几的翻唱', + 'upload_date': '20210913', + 'description': '桃几2021年翻唱合集', + 'thumbnail': r're:^http.*\.jpg', + }, + 'playlist_mincount': 12, + }, { 'url': 'http://music.163.com/#/album?id=220780', 'info_dict': { 'id': '220780', - 'title': 'B\'day', + 'title': 'B\'Day', + 'upload_date': '20060904', + 'description': 'md5:71a74e1d8f392d88cf1bbe48879ad0b0', + 'thumbnail': r're:^http.*\.jpg', }, 'playlist_count': 23, - 'skip': 'Blocked outside Mainland China', - } + }] def _real_extract(self, url): album_id = self._match_id(url) + webpage = self._download_webpage(f'https://music.163.com/album?id={album_id}', album_id) - info = self.query_api( - 'album/%s?id=%s' % (album_id, album_id), - album_id, 'Downloading album data')['album'] - - name = info['name'] - desc = info.get('description') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['songs'] - ] - return self.playlist_result(entries, album_id, name, desc) + songs = self._search_json( + r'<textarea[^>]+\bid="song-list-pre-data"[^>]*>', webpage, 'metainfo', album_id, + end_pattern=r'</textarea>', contains_pattern=r'\[(?s:.+)\]') + metainfo = { + 'title': self._og_search_property('title', webpage, 'title', fatal=False), + 'description': self._html_search_regex( + (rf'<div[^>]+\bid="album-desc-{suffix}"[^>]*>(.*?)</div>' for suffix in ('more', 
'dot')), + webpage, 'description', flags=re.S, fatal=False), + 'thumbnail': self._og_search_property('image', webpage, 'thumbnail', fatal=False), + 'upload_date': unified_strdate(self._html_search_meta('music:release_date', webpage, 'date', fatal=False)), + } + return self.playlist_result(self._get_entries(songs), album_id, **metainfo) class NetEaseMusicSingerIE(NetEaseMusicBaseIE): IE_NAME = 'netease:singer' IE_DESC = '网易云音乐 - 歌手' - _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?artist\?id=(?P<id>[0-9]+)' _TESTS = [{ 'note': 'Singer has aliases.', 'url': 'http://music.163.com/#/artist?id=10559', 'info_dict': { 'id': '10559', - 'title': '张惠妹 - aMEI;阿密特', + 'title': '张惠妹 - aMEI;阿妹;阿密特', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Singer has translated name.', 'url': 'http://music.163.com/#/artist?id=124098', @@ -311,238 +355,292 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'title': '李昇基 - 이승기', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Singer with both translated and alias', + 'url': 'https://music.163.com/#/artist?id=159692', + 'info_dict': { + 'id': '159692', + 'title': '初音ミク - 初音未来;Hatsune Miku', + }, + 'playlist_count': 50, }] def _real_extract(self, url): singer_id = self._match_id(url) - info = self.query_api( - 'artist/%s?id=%s' % (singer_id, singer_id), - singer_id, 'Downloading singer data') + info = self._query_api( + f'artist/{singer_id}?id={singer_id}', singer_id, note='Downloading singer data') - name = info['artist']['name'] - if info['artist']['trans']: - name = '%s - %s' % (name, info['artist']['trans']) - if info['artist']['alias']: - name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) + name = join_nonempty( + traverse_obj(info, ('artist', 'name', {str})), + join_nonempty(*traverse_obj(info, ('artist', ('trans', ('alias', ...)), {str})), delim=';'), + delim=' - ') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['hotSongs'] - ] - return self.playlist_result(entries, singer_id, name) + return self.playlist_result(self._get_entries(info, 'hotSongs'), singer_id, name) class NetEaseMusicListIE(NetEaseMusicBaseIE): IE_NAME = 'netease:playlist' IE_DESC = '网易云音乐 - 歌单' - _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?(?:playlist|discover/toplist)\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/playlist?id=79177352', 'info_dict': { 'id': '79177352', 'title': 'Billboard 2007 Top 100', - 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' + 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022', + 'tags': ['欧美'], + 'uploader': '浑然破灭', + 'uploader_id': '67549805', + 'timestamp': int, + 'upload_date': r're:\d{8}', }, - 'playlist_count': 99, - 'skip': 'Blocked outside Mainland China', + 'playlist_mincount': 95, + }, { + 'note': 'Toplist/Charts sample', + 'url': 'https://music.163.com/#/discover/toplist?id=60198', + 'info_dict': { + 'id': '60198', + 'title': 're:美国Billboard榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'description': '美国Billboard排行榜', + 'tags': ['流行', '欧美', '榜单'], + 'uploader': 'Billboard公告牌', + 'uploader_id': '48171', + 'timestamp': int, + 'upload_date': r're:\d{8}', + }, + 'playlist_count': 100, }, { 'note': 'Toplist/Charts sample', 'url': 'http://music.163.com/#/discover/toplist?id=3733003', 'info_dict': { 'id': '3733003', 
- 'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'title': 're:韩国Melon排行榜周榜(?: [0-9]{4}-[0-9]{2}-[0-9]{2})?', 'description': 'md5:73ec782a612711cadc7872d9c1e134fc', + 'upload_date': '20200109', + 'uploader_id': '2937386', + 'tags': ['韩语', '榜单'], + 'uploader': 'Melon榜单', + 'timestamp': 1578569373, }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', }] def _real_extract(self, url): list_id = self._match_id(url) - info = self.query_api( - 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, - list_id, 'Downloading playlist data')['result'] + info = self._download_eapi_json( + '/v3/playlist/detail', list_id, + {'id': list_id, 't': '-1', 'n': '500', 's': '0'}, + note='Downloading playlist info') - name = info['name'] - desc = info.get('description') + metainfo = traverse_obj(info, ('playlist', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'tags': ('tags', ..., {str}), + 'uploader': ('creator', 'nickname', {str}), + 'uploader_id': ('creator', 'userId', {str_or_none}), + 'timestamp': ('updateTime', {self._kilo_or_none}), + })) + if traverse_obj(info, ('playlist', 'specialType')) == 10: + metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}' - if info.get('specialType') == 10: # is a chart/toplist - datestamp = datetime.fromtimestamp( - self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') - name = '%s %s' % (name, datestamp) - - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['tracks'] - ] - return self.playlist_result(entries, list_id, name, desc) + return self.playlist_result(self._get_entries(info, ('playlist', 'tracks')), list_id, **metainfo) class NetEaseMusicMvIE(NetEaseMusicBaseIE): IE_NAME = 'netease:mv' IE_DESC = '网易云音乐 - MV' - _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://music\.163\.com/(?:#/)?mv\?id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://music.163.com/#/mv?id=10958064', + 'info_dict': { + 'id': '10958064', + 'ext': 'mp4', + 'title': '交换余生', + 'description': 'md5:e845872cff28820642a2b02eda428fea', + 'creators': ['林俊杰'], + 'upload_date': '20200916', + 'thumbnail': r're:http.*\.jpg', + 'duration': 364, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }, { 'url': 'http://music.163.com/#/mv?id=415350', 'info_dict': { 'id': '415350', 'ext': 'mp4', 'title': '이럴거면 그러지말지', 'description': '白雅言自作曲唱甜蜜爱情', - 'creator': '白雅言', + 'creators': ['白娥娟'], 'upload_date': '20150520', + 'thumbnail': r're:http.*\.jpg', + 'duration': 216, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'skip': 'Blocked outside Mainland China', - } + }, { + 'note': 'This MV has multiple creators.', + 'url': 'https://music.163.com/#/mv?id=22593543', + 'info_dict': { + 'id': '22593543', + 'ext': 'mp4', + 'title': '老北京杀器', + 'creators': ['秃子2z', '辉子', 'Saber梁维嘉'], + 'duration': 206, + 'upload_date': '20240618', + 'like_count': int, + 'comment_count': int, + 'thumbnail': r're:http.*\.jpg', + 'view_count': int, + }, + }] def _real_extract(self, url): mv_id = self._match_id(url) - info = self.query_api( - 'mv/detail?id=%s&type=mp4' % mv_id, - mv_id, 'Downloading mv info')['data'] + info = self._query_api( + f'mv/detail?id={mv_id}&type=mp4', mv_id, 'Downloading mv info')['data'] formats = [ - {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} + {'url': mv_url, 'ext': 'mp4', 'format_id': f'{brs}p', 'height': 
int_or_none(brs)} for brs, mv_url in info['brs'].items() ] return { 'id': mv_id, - 'title': info['name'], - 'description': info.get('desc') or info.get('briefDesc'), - 'creator': info['artistName'], - 'upload_date': info['publishTime'].replace('-', ''), 'formats': formats, - 'thumbnail': info.get('cover'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), + 'creators': traverse_obj(info, ('artists', ..., 'name')) or [info.get('artistName')], + **traverse_obj(info, { + 'title': ('name', {str}), + 'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}), + 'upload_date': ('publishTime', {unified_strdate}), + 'thumbnail': ('cover', {url_or_none}), + 'duration': ('duration', {self._kilo_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'like_count': ('likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), } class NetEaseMusicProgramIE(NetEaseMusicBaseIE): IE_NAME = 'netease:program' IE_DESC = '网易云音乐 - 电台节目' - _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://music\.163\.com/(?:#/)?program\?id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/program?id=10109055', 'info_dict': { - 'id': '10109055', + 'id': '32593346', 'ext': 'mp3', 'title': '不丹足球背后的故事', 'description': '喜马拉雅人的足球梦 ...', - 'creator': '大话西藏', - 'timestamp': 1434179342, + 'creators': ['大话西藏'], + 'timestamp': 1434179287, 'upload_date': '20150613', + 'thumbnail': r're:http.*\.jpg', 'duration': 900, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { 'id': '10141022', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + 'creators': ['滚滚电台ORZ'], + 'timestamp': 1434450733, + 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', }, 'playlist_count': 4, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { - 'id': '10141022', + 'id': '32647209', 'ext': 'mp3', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'timestamp': 1434450841, + 'creators': ['滚滚电台ORZ'], + 'timestamp': 1434450733, 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', + 'duration': 1104, }, 'params': { - 'noplaylist': True + 'noplaylist': True, }, - 'skip': 'Blocked outside Mainland China', }] def _real_extract(self, url): program_id = self._match_id(url) - info = self.query_api( - 'dj/program/detail?id=%s' % program_id, - program_id, 'Downloading program info')['program'] + info = self._query_api( + f'dj/program/detail?id={program_id}', program_id, note='Downloading program info')['program'] - name = info['name'] - description = info['description'] + metainfo = traverse_obj(info, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'creator': ('dj', 'brand', {str}), + 'thumbnail': ('coverUrl', {url_or_none}), + 'timestamp': ('createTime', {self._kilo_or_none}), + }) - if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): - formats = self.extract_formats(info['mainSong']) + if not self._yes_playlist( + info['songs'] and program_id, info['mainSong']['id'], playlist_label='program', video_label='song'): + formats = self._extract_formats(info['mainSong']) return { - 'id': info['mainSong']['id'], - 'title': name, - 
'description': description,
-                'creator': info['dj']['brand'],
-                'timestamp': self.convert_milliseconds(info['createTime']),
-                'thumbnail': info['coverUrl'],
-                'duration': self.convert_milliseconds(info.get('duration', 0)),
+                'id': str(info['mainSong']['id']),
                 'formats': formats,
+                'duration': traverse_obj(info, ('mainSong', 'duration', {self._kilo_or_none})),
+                **metainfo,
             }

-        song_ids = [info['mainSong']['id']]
-        song_ids.extend([song['id'] for song in info['songs']])
-        entries = [
-            self.url_result('http://music.163.com/#/song?id=%s' % song_id,
-                            'NetEaseMusic', song_id)
-            for song_id in song_ids
-        ]
-        return self.playlist_result(entries, program_id, name, description)
+        songs = traverse_obj(info, (('mainSong', ('songs', ...)),))
+        return self.playlist_result(self._get_entries(songs), program_id, **metainfo)


 class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
     IE_NAME = 'netease:djradio'
     IE_DESC = '网易云音乐 - 电台'
-    _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://music\.163\.com/(?:#/)?djradio\?id=(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://music.163.com/#/djradio?id=42',
         'info_dict': {
             'id': '42',
             'title': '声音蔓延',
-            'description': 'md5:766220985cbd16fdd552f64c578a6b15'
+            'description': 'md5:c7381ebd7989f9f367668a5aee7d5f08',
         },
         'playlist_mincount': 40,
-        'skip': 'Blocked outside Mainland China',
     }
     _PAGE_SIZE = 1000

     def _real_extract(self, url):
         dj_id = self._match_id(url)

-        name = None
-        desc = None
+        metainfo = {}
         entries = []
         for offset in itertools.count(start=0, step=self._PAGE_SIZE):
-            info = self.query_api(
-                'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d'
-                % (self._PAGE_SIZE, dj_id, offset),
-                dj_id, 'Downloading dj programs - %d' % offset)
+            info = self._query_api(
+                f'dj/program/byradio?asc=false&limit={self._PAGE_SIZE}&radioId={dj_id}&offset={offset}',
+                dj_id, note=f'Downloading dj programs - {offset}')

-            entries.extend([
-                self.url_result(
-                    'http://music.163.com/#/program?id=%s' % program['id'],
-                    'NetEaseMusicProgram', program['id'])
-                for program in info['programs']
-            ])
+            entries.extend(self.url_result(
+                f'http://music.163.com/#/program?id={program["id"]}', NetEaseMusicProgramIE,
+                program['id'], program.get('name')) for program in info['programs'])
+            if not metainfo:
+                metainfo = traverse_obj(info, ('programs', 0, 'radio', {
+                    'title': ('name', {str}),
+                    'description': ('desc', {str}),
+                }))

             if not info['more']:
                 break

-        return self.playlist_result(entries, dj_id, name, desc)
+        return self.playlist_result(entries, dj_id, **metainfo)
diff --git a/yt_dlp/extractor/netverse.py b/yt_dlp/extractor/netverse.py
index 398198a1b0..2ddec5c0ae 100644
--- a/yt_dlp/extractor/netverse.py
+++ b/yt_dlp/extractor/netverse.py
@@ -63,7 +63,7 @@ class NetverseIE(NetverseBaseIE):
             'timestamp': 1626919804,
             'like_count': int,
             'uploader': 'Net Prime',
-        }
+        },
     }, {
         # series
         'url': 'https://www.netverse.id/watch/jadoo-seorang-model',
@@ -87,7 +87,7 @@ class NetverseIE(NetverseBaseIE):
             'uploader': 'Net Prime',
             'age_limit': 0,
         },
-        'skip': 'video get Geo-blocked for some country'
+        'skip': 'Video gets geo-blocked in some countries',
     }, {
         # non www host
         'url': 'https://netverse.id/watch/tetangga-baru',
@@ -135,7 +135,7 @@ class NetverseIE(NetverseBaseIE):
             'timestamp': 1645764984,
             'upload_date': '20220225',
         },
-        'skip': 'This video get Geo-blocked for some country'
+        'skip': 'This video gets geo-blocked in some countries',
     }, {
         # video with
comments 'url': 'https://netverse.id/video/episode-1-season-2016-ok-food', @@ -160,9 +160,9 @@ class NetverseIE(NetverseBaseIE): 'uploader': 'Net Prime', 'comment_count': int, }, - 'params':{ - 'getcomments': True - } + 'params': { + 'getcomments': True, + }, }, { # video with multiple page comment 'url': 'https://netverse.id/video/match-island-eps-1-fix', @@ -187,9 +187,9 @@ class NetverseIE(NetverseBaseIE): 'season': 'Season 1', 'comment_count': int, }, - 'params':{ - 'getcomments': True - } + 'params': { + 'getcomments': True, + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/netzkino.py b/yt_dlp/extractor/netzkino.py index 9c314e2233..c07b1715af 100644 --- a/yt_dlp/extractor/netzkino.py +++ b/yt_dlp/extractor/netzkino.py @@ -8,6 +8,7 @@ class NetzkinoIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)' _TESTS = [{ @@ -25,7 +26,7 @@ class NetzkinoIE(InfoExtractor): }, 'params': { 'skip_download': 'Download only works from Germany', - } + }, }, { 'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2', 'md5': 'c7728b2dadd04ff6727814847a51ef03', @@ -41,14 +42,14 @@ class NetzkinoIE(InfoExtractor): }, 'params': { 'skip_download': 'Download only works from Germany', - } + }, }] def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') - api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id + api_url = f'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/{video_id}.json?d=www' info = self._download_json(api_url, video_id) custom_fields = info['custom_fields'] diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 9e3286dfe7..9f5a464e65 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -2,7 +2,10 @@ import re from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, + OnDemandPagedList, clean_html, extract_attributes, get_element_by_id, @@ -10,12 +13,15 @@ parse_count, parse_duration, unified_timestamp, - OnDemandPagedList, - try_get, + url_or_none, + urlencode_postdata, + urljoin, ) +from ..utils.traversal import traverse_obj class NewgroundsIE(InfoExtractor): + _NETRC_MACHINE = 'newgrounds' _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>\d+)(?:/format/flash)?' 
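# Illustration, not part of the patch: the `_NETRC_MACHINE = 'newgrounds'`
# attribute added above ties this extractor into yt-dlp's standard credential
# lookup, so the `_perform_login` hook added further down can be fed either
# with `--username`/`--password` or from a ~/.netrc entry (placeholder values):
#
#   machine newgrounds login your_username password your_password
#
# A rough invocation using that entry would then be:
#
#   yt-dlp --netrc https://www.newgrounds.com/portal/view/823109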
_TESTS = [{ 'url': 'https://www.newgrounds.com/audio/listen/549479', @@ -25,11 +31,13 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp3', 'title': 'B7 - BusMode', 'uploader': 'Burn7', - 'timestamp': 1378878540, + 'timestamp': 1378892945, 'upload_date': '20130911', 'duration': 143, 'view_count': int, 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f', + 'age_limit': 0, + 'thumbnail': r're:^https://aicon\.ngfiles\.com/549/549479\.png', }, }, { 'url': 'https://www.newgrounds.com/portal/view/1', @@ -39,11 +47,12 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Scrotum 1', 'uploader': 'Brian-Beaton', - 'timestamp': 955064100, - 'upload_date': '20000406', + 'timestamp': 955078533, + 'upload_date': '20000407', 'view_count': int, 'description': 'Scrotum plays "catch."', 'age_limit': 17, + 'thumbnail': r're:^https://picon\.ngfiles\.com/0/flash_1_card\.png', }, }, { # source format unavailable, additional mp4 formats @@ -53,11 +62,12 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'ZTV News Episode 8', 'uploader': 'ZONE-SAMA', - 'timestamp': 1487965140, - 'upload_date': '20170224', + 'timestamp': 1487983183, + 'upload_date': '20170225', 'view_count': int, 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6', 'age_limit': 17, + 'thumbnail': r're:^https://picon\.ngfiles\.com/689000/flash_689400_card\.png', }, 'params': { 'skip_download': True, @@ -70,12 +80,13 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Metal Gear Awesome', 'uploader': 'Egoraptor', - 'timestamp': 1140663240, + 'timestamp': 1140681292, 'upload_date': '20060223', 'view_count': int, 'description': 'md5:9246c181614e23754571995104da92e0', 'age_limit': 13, - } + 'thumbnail': r're:^https://picon\.ngfiles\.com/297000/flash_297383_card\.png', + }, }, { 'url': 'https://www.newgrounds.com/portal/view/297383/format/flash', 'md5': '5d05585a9a0caca059f5abfbd3865524', @@ -86,9 +97,25 @@ class NewgroundsIE(InfoExtractor): 'description': 'Metal Gear Awesome', 'uploader': 'Egoraptor', 'upload_date': '20060223', - 'timestamp': 1140663240, + 'timestamp': 1140681292, + 'view_count': int, 'age_limit': 13, - } + 'thumbnail': r're:^https://picon\.ngfiles\.com/297000/flash_297383_card\.png', + }, + }, { + 'url': 'https://www.newgrounds.com/portal/view/823109', + 'info_dict': { + 'id': '823109', + 'ext': 'mp4', + 'title': 'Rouge Futa Fleshlight Fuck', + 'description': 'I made a fleshlight model and I wanted to use it in an animation. 
Based on a video by CDNaturally.', + 'uploader': 'DefaultUser12', + 'upload_date': '20211122', + 'timestamp': 1637611540, + 'view_count': int, + 'age_limit': 18, + 'thumbnail': r're:^https://picon\.ngfiles\.com/823000/flash_823109_card\.png', + }, }] _AGE_LIMIT = { 'e': 0, @@ -96,42 +123,59 @@ class NewgroundsIE(InfoExtractor): 'm': 17, 'a': 18, } + _LOGIN_URL = 'https://www.newgrounds.com/passport' + + def _perform_login(self, username, password): + login_webpage = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') + login_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form action="([^"]+)"', login_webpage, 'login endpoint', default=None)) + result = self._download_json(login_url, None, 'Logging in', headers={ + 'Accept': 'application/json', + 'Referer': self._LOGIN_URL, + 'X-Requested-With': 'XMLHttpRequest', + }, data=urlencode_postdata({ + **self._hidden_inputs(login_webpage), + 'username': username, + 'password': password, + })) + if errors := traverse_obj(result, ('errors', ..., {str})): + raise ExtractorError(', '.join(errors) or 'Unknown Error', expected=True) def _real_extract(self, url): media_id = self._match_id(url) - formats = [] - uploader = None - webpage = self._download_webpage(url, media_id) - - title = self._html_extract_title(webpage) + try: + webpage = self._download_webpage(url, media_id) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 401: + self.raise_login_required() + raise media_url_string = self._search_regex( - r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) - + r'embedController\(\[{"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) if media_url_string: - media_url = self._parse_json(media_url_string, media_id) + uploader = None formats = [{ - 'url': media_url, + 'url': self._parse_json(media_url_string, media_id), 'format_id': 'source', 'quality': 1, }] + else: - json_video = self._download_json('https://www.newgrounds.com/portal/video/' + media_id, media_id, headers={ + json_video = self._download_json(f'https://www.newgrounds.com/portal/video/{media_id}', media_id, headers={ 'Accept': 'application/json', 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest' + 'X-Requested-With': 'XMLHttpRequest', }) - uploader = json_video.get('author') - media_formats = json_video.get('sources', []) - for media_format in media_formats: - media_sources = media_formats[media_format] - for source in media_sources: - formats.append({ - 'format_id': media_format, - 'quality': int_or_none(media_format[:-1]), - 'url': source.get('src') - }) + formats = [] + uploader = traverse_obj(json_video, ('author', {str})) + for format_id, sources in traverse_obj(json_video, ('sources', {dict.items}, ...)): + quality = int_or_none(format_id[:-1]) + formats.extend({ + 'format_id': format_id, + 'quality': quality, + 'url': url, + } for url in traverse_obj(sources, (..., 'src', {url_or_none}))) if not uploader: uploader = self._html_search_regex( @@ -139,51 +183,35 @@ def _real_extract(self, url): r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader', fatal=False) - age_limit = self._html_search_regex( - r'<h2\s*class=["\']rated-([^"\'])["\'][^>]+>', webpage, 'age_limit', default='e') - age_limit = self._AGE_LIMIT.get(age_limit) - - timestamp = unified_timestamp(self._html_search_regex( - (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', - r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', - default=None)) - - duration = 
parse_duration(self._html_search_regex( - r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, - 'duration', default=None)) - - description = clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage) - - view_count = parse_count(self._html_search_regex( - r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage, - 'view count', default=None)) - - filesize = int_or_none(self._html_search_regex( - r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize', - default=None)) - - video_type_description = self._html_search_regex( - r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'filesize', - default=None) - if len(formats) == 1: - formats[0]['filesize'] = filesize + formats[0]['filesize'] = int_or_none(self._html_search_regex( + r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize', default=None)) + + video_type_description = self._html_search_regex( + r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'media type', default=None) + if video_type_description == 'Audio File': + formats[0]['vcodec'] = 'none' - if video_type_description == 'Audio File': - formats[0]['vcodec'] = 'none' self._check_formats(formats, media_id) - return { 'id': media_id, - 'title': title, + 'title': self._html_extract_title(webpage), 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, + 'timestamp': unified_timestamp(self._search_regex( + r'itemprop="(?:uploadDate|datePublished)"\s+content="([^"]+)"', + webpage, 'timestamp', default=None)), + 'duration': parse_duration(self._html_search_regex( + r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, 'duration', default=None)), 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': description, - 'age_limit': age_limit, - 'view_count': view_count, + 'description': ( + clean_html(get_element_by_id('author_comments', webpage)) + or self._og_search_description(webpage)), + 'age_limit': self._AGE_LIMIT.get(self._html_search_regex( + r'<h2\s+class=["\']rated-([etma])["\']', webpage, 'age_limit', default='e')), + 'view_count': parse_count(self._html_search_regex( + r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', + webpage, 'view count', default=None)), } @@ -263,19 +291,16 @@ class NewgroundsUserIE(InfoExtractor): def _fetch_page(self, channel_id, url, page): page += 1 posts_info = self._download_json( - f'{url}/page/{page}', channel_id, + f'{url}?page={page}', channel_id, note=f'Downloading page {page}', headers={ 'Accept': 'application/json, text/javascript, */*; q = 0.01', 'X-Requested-With': 'XMLHttpRequest', }) - sequence = posts_info.get('sequence', []) - for year in sequence: - posts = try_get(posts_info, lambda x: x['years'][str(year)]['items']) - for post in posts: - path, media_id = self._search_regex( - r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>', - post, 'url', group=(1, 2)) - yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id) + for post in traverse_obj(posts_info, ('items', ..., ..., {str})): + path, media_id = self._search_regex( + r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>', + post, 'url', group=(1, 2)) + yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id) def _real_extract(self, url): channel_id = self._match_id(url) diff --git a/yt_dlp/extractor/newspicks.py b/yt_dlp/extractor/newspicks.py index b6334dcbaf..4a1cb0a735 100644 --- a/yt_dlp/extractor/newspicks.py +++ 
b/yt_dlp/extractor/newspicks.py @@ -5,7 +5,7 @@ class NewsPicksIE(InfoExtractor): - _VALID_URL = r'https://newspicks\.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)' + _VALID_URL = r'https?://newspicks\.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)' _TESTS = [{ 'url': 'https://newspicks.com/movie-series/11?movieId=1813', diff --git a/yt_dlp/extractor/newstube.py b/yt_dlp/extractor/newstube.py deleted file mode 100644 index 820eb4ba7f..0000000000 --- a/yt_dlp/extractor/newstube.py +++ /dev/null @@ -1,75 +0,0 @@ -import base64 -import hashlib - -from .common import InfoExtractor -from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..utils import ( - int_or_none, - parse_codecs, - parse_duration, -) - - -class NewstubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)' - _TEST = { - 'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym', - 'md5': '9d10320ad473444352f72f746ccb8b8c', - 'info_dict': { - 'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6', - 'ext': 'mp4', - 'title': 'Телеканал CNN переместил город Славянск в Крым', - 'description': 'md5:419a8c9f03442bc0b0a794d689360335', - 'duration': 31.05, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - page = self._download_webpage(url, video_id) - title = self._html_search_meta(['og:title', 'twitter:title'], page, fatal=True) - - video_guid = self._html_search_regex( - r'<meta\s+property="og:video(?::(?:(?:secure_)?url|iframe))?"\s+content="https?://(?:www\.)?newstube\.ru/embed/(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - page, 'video GUID') - - enc_data = base64.b64decode(self._download_webpage( - 'https://www.newstube.ru/embed/api/player/getsources2', - video_guid, query={ - 'guid': video_guid, - 'ff': 3, - })) - key = hashlib.pbkdf2_hmac( - 'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16] - dec_data = unpad_pkcs7(aes_cbc_decrypt_bytes(enc_data[32:], key, enc_data[16:32])) - sources = self._parse_json(dec_data, video_guid) - - formats = [] - for source in sources: - source_url = source.get('Src') - if not source_url: - continue - height = int_or_none(source.get('Height')) - f = { - 'format_id': 'http' + ('-%dp' % height if height else ''), - 'url': source_url, - 'width': int_or_none(source.get('Width')), - 'height': height, - } - source_type = source.get('Type') - if source_type: - f.update(parse_codecs(self._search_regex( - r'codecs="([^"]+)"', source_type, 'codecs', fatal=False))) - formats.append(f) - - self._check_formats(formats, video_guid) - - return { - 'id': video_guid, - 'title': title, - 'description': self._html_search_meta(['description', 'og:description'], page), - 'thumbnail': self._html_search_meta(['og:image:secure_url', 'og:image', 'twitter:image'], page), - 'duration': parse_duration(self._html_search_meta('duration', page)), - 'formats': formats, - } diff --git a/yt_dlp/extractor/newsy.py b/yt_dlp/extractor/newsy.py index a5a7b168cd..941cb93311 100644 --- a/yt_dlp/extractor/newsy.py +++ b/yt_dlp/extractor/newsy.py @@ -19,9 +19,9 @@ class NewsyIE(InfoExtractor): 'timestamp': 1621339200, 'duration': 339630, 'thumbnail': 'https://cdn.newsy.com/images/videos/x/1620927824_xyrrP4.jpg', - 'upload_date': '20210518' + 'upload_date': '20210518', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/nextmedia.py b/yt_dlp/extractor/nextmedia.py index 0e47a4d45a..81da3ffde3 100644 --- 
a/yt_dlp/extractor/nextmedia.py +++ b/yt_dlp/extractor/nextmedia.py @@ -1,5 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( clean_html, get_element_by_class, @@ -24,7 +25,7 @@ class NextMediaIE(InfoExtractor): 'description': 'md5:28222b9912b6665a21011b034c70fcc7', 'timestamp': 1415456273, 'upload_date': '20141108', - } + }, }] _URL_PATTERN = r'\{ url: \'(.+)\' \}' @@ -39,7 +40,7 @@ def _extract_from_nextmedia_page(self, news_id, url, page): r'window\.location\.href\s*=\s*([\'"])(?P<url>(?!\1).+)\1', page, 'redirection URL', default=None, group='url') if redirection_url: - return self.url_result(compat_urlparse.urljoin(url, redirection_url)) + return self.url_result(urllib.parse.urljoin(url, redirection_url)) title = self._fetch_title(page) video_url = self._search_regex(self._URL_PATTERN, page, 'video url') @@ -67,8 +68,8 @@ def _fetch_thumbnail(self, page): return self._og_search_thumbnail(page) def _fetch_timestamp(self, page): - dateCreated = self._search_regex('"dateCreated":"([^"]+)"', page, 'created time') - return parse_iso8601(dateCreated) + date_created = self._search_regex('"dateCreated":"([^"]+)"', page, 'created time') + return parse_iso8601(date_created) def _fetch_upload_date(self, url): return self._search_regex(self._VALID_URL, url, 'upload date', group='date') @@ -91,7 +92,7 @@ class NextMediaActionNewsIE(NextMediaIE): # XXX: Do not subclass from concrete 'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659', 'timestamp': 1421791200, 'upload_date': '20150120', - } + }, }] def _real_extract(self, url): @@ -115,7 +116,7 @@ class AppleDailyIE(NextMediaIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', - } + }, }, { 'url': 'http://www.appledaily.com.tw/realtimenews/article/strange/20150128/550549/%E4%B8%8D%E6%BB%BF%E8%A2%AB%E8%B8%A9%E8%85%B3%E3%80%80%E5%B1%B1%E6%9D%B1%E5%85%A9%E5%A4%A7%E5%AA%BD%E4%B8%80%E8%B7%AF%E6%89%93%E4%B8%8B%E8%BB%8A', 'md5': '86b4e9132d158279c7883822d94ccc49', @@ -126,7 +127,7 @@ class AppleDailyIE(NextMediaIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', - } + }, }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', @@ -191,6 +192,8 @@ def _fetch_description(self, page): class NextTVIE(InfoExtractor): + _WORKING = False + _ENABLED = None # XXX: pass through to GenericIE IE_DESC = '壹電視' _VALID_URL = r'https?://(?:www\.)?nexttv\.com\.tw/(?:[^/]+/)+(?P<id>\d+)' diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index b4874c8f3e..cd32892fa0 100644 --- a/yt_dlp/extractor/nexx.py +++ b/yt_dlp/extractor/nexx.py @@ -4,7 +4,6 @@ import time from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -41,7 +40,7 @@ class NexxIE(InfoExtractor): 'timestamp': 1384264416, 'upload_date': '20131112', }, - 'skip': 'Spiegel nexx CDNs are now disabled' + 'skip': 'Spiegel nexx CDNs are now disabled', }, { # episode with captions 'url': 'https://api.nexx.cloud/v3.1/741/videos/byid/1701834', @@ -92,7 +91,7 @@ class NexxIE(InfoExtractor): 'timestamp': 1527874460, 'upload_date': '20180601', }, - 'skip': 'Spiegel nexx CDNs are now disabled' + 'skip': 'Spiegel nexx CDNs are now disabled', }, { 'url': 
'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, @@ -128,8 +127,7 @@ def _extract_embed_urls(cls, url, webpage): r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)', webpage): entries.append( - 'https://api.nexx.cloud/v3/%s/videos/byid/%s' - % (domain_id, video_id)) + f'https://api.nexx.cloud/v3/{domain_id}/videos/byid/{video_id}') # TODO: support more embed formats @@ -137,20 +135,20 @@ def _extract_embed_urls(cls, url, webpage): def _handle_error(self, response): if traverse_obj(response, ('metadata', 'notice'), expected_type=str): - self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice'])) + self.report_warning('{} said: {}'.format(self.IE_NAME, response['metadata']['notice'])) status = int_or_none(try_get( response, lambda x: x['metadata']['status']) or 200) if 200 <= status < 300: return raise ExtractorError( - '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']), + '{} said: {}'.format(self.IE_NAME, response['metadata']['errorhint']), expected=True) def _call_api(self, domain_id, path, video_id, data=None, headers={}): headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' result = self._download_json( - 'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id, - 'Downloading %s JSON' % path, data=urlencode_postdata(data), + f'https://api.nexx.cloud/v3/{domain_id}/{path}', video_id, + f'Downloading {path} JSON', data=urlencode_postdata(data), headers=headers) self._handle_error(result) return result['result'] @@ -160,20 +158,20 @@ def _extract_free_formats(self, video, video_id): cdn = stream_data['cdnType'] assert cdn == 'free' - hash = video['general']['hash'] + video_hash = video['general']['hash'] - ps = compat_str(stream_data['originalDomain']) + ps = str(stream_data['originalDomain']) if stream_data['applyFolderHierarchy'] == 1: s = ('%04d' % int(video_id))[::-1] - ps += '/%s/%s' % (s[0:2], s[2:4]) - ps += '/%s/%s_' % (video_id, hash) + ps += f'/{s[0:2]}/{s[2:4]}' + ps += f'/{video_id}/{video_hash}_' t = 'http://%s' + ps fd = stream_data['azureFileDistribution'].split(',') cdn_provider = stream_data['cdnProvider'] def p0(p): - return '_%s' % p if stream_data['applyAzureStructure'] == 1 else '' + return f'_{p}' if stream_data['applyAzureStructure'] == 1 else '' formats = [] if cdn_provider == 'ak': @@ -191,10 +189,10 @@ def p0(p): for i in fd: p = i.split(':') tbr = int(p[0]) - filename = '%s%s%s.mp4' % (h, p[1], p0(tbr)) + filename = f'{h}{p[1]}{p0(tbr)}.mp4' f = { 'url': http_base + '/' + filename, - 'format_id': '%s-http-%d' % (cdn, tbr), + 'format_id': f'{cdn}-http-{tbr}', 'tbr': tbr, } width_height = p[1].split('x') @@ -204,7 +202,7 @@ def p0(p): 'height': int_or_none(width_height[1]), }) formats.append(f) - a = filename + ':%s' % (tbr * 1000) + a = filename + f':{tbr * 1000}' t += a + ',' t = t[:-1] + '&audiostream=' + a.split(':')[0] else: @@ -213,10 +211,10 @@ def p0(p): if cdn_provider == 'ce': formats.extend(self._extract_mpd_formats( t % (stream_data['cdnPathDASH'], 'mpd'), video_id, - mpd_id='%s-dash' % cdn, fatal=False)) + mpd_id=f'{cdn}-dash', fatal=False)) formats.extend(self._extract_m3u8_formats( t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False)) + entry_protocol='m3u8_native', m3u8_id=f'{cdn}-hls', fatal=False)) return formats @@ -231,9 +229,9 @@ def _extract_3q_formats(self, video, video_id): def get_cdn_shield_base(shield_type=''): for secure in ('', 
's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + cdn_shield = stream_data.get(f'cdnShield{shield_type}HTTP{secure.upper()}') if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) + return f'http{secure}://{cdn_shield}' return f'http://sdn-global-{"prog" if shield_type.lower() == "prog" else "streaming"}-cache.3qsdn.com/' + (f's/{protection_key}/' if protection_key else '') stream_base = get_cdn_shield_base() @@ -256,7 +254,7 @@ def get_cdn_shield_base(shield_type=''): tbr = int_or_none(ss[1], scale=1000) formats.append({ 'url': f'{progressive_base}{q_acc}/uploads/{q_acc}-{ss[2]}.webm', - 'format_id': f'{cdn}-{ss[0]}{"-%s" % tbr if tbr else ""}', + 'format_id': f'{cdn}-{ss[0]}{f"-{tbr}" if tbr else ""}', 'tbr': tbr, }) @@ -270,7 +268,7 @@ def get_cdn_shield_base(shield_type=''): width, height = ss[1].split('x') if len(ss[1].split('x')) == 2 else (None, None) f = { 'url': f'{progressive_base}{q_acc}/files/{q_prefix}/{q_locator}/{ss[2]}.mp4', - 'format_id': f'{cdn}-http-{"-%s" % tbr if tbr else ""}', + 'format_id': f'{cdn}-http-{f"-{tbr}" if tbr else ""}', 'tbr': tbr, 'width': int_or_none(width), 'height': int_or_none(height), @@ -288,38 +286,37 @@ def _extract_azure_formats(self, video, video_id): def get_cdn_shield_base(shield_type='', static=False): for secure in ('', 's'): - cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper())) + cdn_shield = stream_data.get(f'cdnShield{shield_type}HTTP{secure.upper()}') if cdn_shield: - return 'http%s://%s' % (secure, cdn_shield) + return f'http{secure}://{cdn_shield}' + if 'fb' in stream_data['azureAccount']: + prefix = 'df' if static else 'f' else: - if 'fb' in stream_data['azureAccount']: - prefix = 'df' if static else 'f' - else: - prefix = 'd' if static else 'p' - account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) - return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) + prefix = 'd' if static else 'p' + account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', '')) + return 'http://nx-%s%02d.akamaized.net/' % (prefix, account) language = video['general'].get('language_raw') or '' azure_stream_base = get_cdn_shield_base() is_ml = ',' in language - azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % ( + azure_manifest_url = '{}{}/{}_src{}.ism/Manifest'.format( azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s' protection_token = try_get( - video, lambda x: x['protectiondata']['token'], compat_str) + video, lambda x: x['protectiondata']['token'], str) if protection_token: - azure_manifest_url += '?hdnts=%s' % protection_token + azure_manifest_url += f'?hdnts={protection_token}' formats = self._extract_m3u8_formats( azure_manifest_url % '(format=m3u8-aapl)', video_id, 'mp4', 'm3u8_native', - m3u8_id='%s-hls' % cdn, fatal=False) + m3u8_id=f'{cdn}-hls', fatal=False) formats.extend(self._extract_mpd_formats( azure_manifest_url % '(format=mpd-time-csf)', - video_id, mpd_id='%s-dash' % cdn, fatal=False)) + video_id, mpd_id=f'{cdn}-dash', fatal=False)) formats.extend(self._extract_ism_formats( - azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False)) + azure_manifest_url % '', video_id, ism_id=f'{cdn}-mss', fatal=False)) azure_progressive_base = get_cdn_shield_base('Prog', True) azure_file_distribution = stream_data.get('azureFileDistribution') @@ -332,9 +329,8 @@ def get_cdn_shield_base(shield_type='', static=False): tbr = int_or_none(ss[0]) if 
tbr: f = { - 'url': '%s%s/%s_src_%s_%d.mp4' % ( - azure_progressive_base, azure_locator, video_id, ss[1], tbr), - 'format_id': '%s-http-%d' % (cdn, tbr), + 'url': f'{azure_progressive_base}{azure_locator}/{video_id}_src_{ss[1]}_{tbr}.mp4', + 'format_id': f'{cdn}-http-{tbr}', 'tbr': tbr, } width_height = ss[1].split('x') @@ -365,7 +361,7 @@ def find_video(result): return None response = self._download_json( - 'https://arc.nexx.cloud/api/video/%s.json' % video_id, + f'https://arc.nexx.cloud/api/video/{video_id}.json', video_id, fatal=False) if response and isinstance(response, dict): result = response.get('result') @@ -375,9 +371,7 @@ def find_video(result): # not all videos work via arc, e.g. nexx:741:1269984 if not video: # Reverse engineered from JS code (see getDeviceID function) - device_id = '%d:%d:%d%d' % ( - random.randint(1, 4), int(time.time()), - random.randint(1e4, 99999), random.randint(1, 9)) + device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(1e4, 99999)}{random.randint(1, 9)}' result = self._call_api(domain_id, 'session/init', video_id, data={ 'nxp_devh': device_id, @@ -416,10 +410,10 @@ def find_video(result): # Reversed from JS code for _play.api.call function (search for # X-Request-Token) request_token = hashlib.md5( - ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() + ''.join((op, domain_id, secret)).encode()).hexdigest() result = self._call_api( - domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ + domain_id, f'videos/{op}/{video_id}', video_id, data={ 'additionalfields': 'language,channel,format,licenseby,slug,fileversion,episode,season', 'addInteractionOptions': '1', 'addStatusDetails': '1', @@ -460,13 +454,13 @@ def find_video(result): 'data': '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["fromms"] / 1000)} --> {srt_subtitles_timecode(line["toms"] / 1000)}\n{line["caption"]}' for i, line in enumerate(sub['data'])), - 'name': sub.get('language_long') or sub.get('title') + 'name': sub.get('language_long') or sub.get('title'), }) elif sub.get('url'): subtitles.setdefault(sub.get('language', 'en'), []).append({ 'url': sub['url'], 'ext': sub.get('format'), - 'name': sub.get('language_long') or sub.get('title') + 'name': sub.get('language_long') or sub.get('title'), }) return { @@ -477,7 +471,7 @@ def find_video(result): 'release_year': int_or_none(general.get('year')), 'creator': general.get('studio') or general.get('studio_adref') or None, 'thumbnail': try_get( - video, lambda x: x['imagedata']['thumb'], compat_str), + video, lambda x: x['imagedata']['thumb'], str), 'duration': parse_duration(general.get('runtime')), 'timestamp': int_or_none(general.get('uploaded')), 'episode_number': traverse_obj( diff --git a/yt_dlp/extractor/nfb.py b/yt_dlp/extractor/nfb.py index 38e068af41..968c9728b0 100644 --- a/yt_dlp/extractor/nfb.py +++ b/yt_dlp/extractor/nfb.py @@ -1,10 +1,52 @@ from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + join_nonempty, + merge_dicts, + parse_count, + url_or_none, +) +from ..utils.traversal import traverse_obj -class NFBIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)' +class NFBBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P<site>nfb|onf)\.ca' + _GEO_COUNTRIES = ['CA'] + + def _extract_ep_data(self, webpage, video_id, fatal=False): + return self._search_json( + r'episodesData\s*:', webpage, 'episode data', video_id, fatal=fatal) or {} + + def _extract_ep_info(self, data, video_id, 
slug=None): + info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], { + 'description': ('description', {str}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'uploader': ('data_layer', 'episodeMaker', {str}), + 'release_year': ('data_layer', 'episodeYear', {int_or_none}), + 'episode': ('data_layer', 'episodeTitle', {str}), + 'season': ('data_layer', 'seasonTitle', {str}), + 'season_number': ('data_layer', 'seasonTitle', {parse_count}), + 'series': ('data_layer', 'seriesTitle', {str}), + }), get_all=False) + + return { + **info, + 'id': video_id, + 'title': join_nonempty('series', 'episode', from_dict=info, delim=' - '), + 'episode_number': int_or_none(self._search_regex( + r'[/-]e(?:pisode)?-?(\d+)(?:[/-]|$)', slug or video_id, 'episode number', default=None)), + } + + +class NFBIE(NFBBaseIE): + IE_NAME = 'nfb' + IE_DESC = 'nfb.ca and onf.ca films and episodes' + _VALID_URL = [ + rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>film)/(?P<id>[^/?#&]+)', + rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+/s(?:ea|ai)son\d+/episode\d+)', + ] _TESTS = [{ + 'note': 'NFB film', 'url': 'https://www.nfb.ca/film/trafficopter/', 'info_dict': { 'id': 'trafficopter', @@ -14,45 +56,238 @@ class NFBIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Barrie Howells', 'release_year': 1972, + 'duration': 600.0, }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'ONF film', + 'url': 'https://www.onf.ca/film/mal-du-siecle/', + 'info_dict': { + 'id': 'mal-du-siecle', + 'ext': 'mp4', + 'title': 'Le mal du siècle', + 'description': 'md5:1abf774d77569ebe603419f2d344102b', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Catherine Lepage', + 'release_year': 2019, + 'duration': 300.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'NFB episode with English title', + 'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/season1/episode9/', + 'info_dict': { + 'id': 'true-north-episode9-true-north-finale-making-it', + 'ext': 'mp4', + 'title': 'True North: Inside the Rise of Toronto Basketball - Finale: Making It', + 'description': 'We catch up with each player in the midst of their journey as they reflect on their road ahead.', + 'series': 'True North: Inside the Rise of Toronto Basketball', + 'release_year': 2018, + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Finale: Making It', + 'episode_number': 9, + 'uploader': 'Ryan Sidhoo', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'ONF episode with French title', + 'url': 'https://www.onf.ca/serie/direction-nord-la-montee-du-basketball-a-toronto/saison1/episode9/', + 'info_dict': { + 'id': 'direction-nord-episode-9', + 'ext': 'mp4', + 'title': 'Direction nord – La montée du basketball à Toronto - Finale : Réussir', + 'description': 'md5:349a57419b71432b97bf6083d92b029d', + 'series': 'Direction nord – La montée du basketball à Toronto', + 'release_year': 2018, + 'season': 'Saison 1', + 'season_number': 1, + 'episode': 'Finale : Réussir', + 'episode_number': 9, + 'uploader': 'Ryan Sidhoo', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'NFB episode with French title (needs geo-bypass)', + 'url': 'https://www.nfb.ca/series/etoile-du-nord/saison1/episode1/', + 'info_dict': { + 'id': 'etoile-du-nord-episode-1-lobservation', + 'ext': 'mp4', + 'title': 'Étoile du Nord - L\'observation', + 'description': 'md5:161a4617260dee3de70f509b2c9dd21b', + 'series': 
'Étoile du Nord', + 'release_year': 2023, + 'season': 'Saison 1', + 'season_number': 1, + 'episode': 'L\'observation', + 'episode_number': 1, + 'uploader': 'Patrick Bossé', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'ONF episode with English title (needs geo-bypass)', + 'url': 'https://www.onf.ca/serie/north-star/season1/episode1/', + 'info_dict': { + 'id': 'north-star-episode-1-observation', + 'ext': 'mp4', + 'title': 'North Star - Observation', + 'description': 'md5:c727f370839d8a817392b9e3f23655c7', + 'series': 'North Star', + 'release_year': 2023, + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Observation', + 'episode_number': 1, + 'uploader': 'Patrick Bossé', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'NFB episode with /film/ URL and English title (needs geo-bypass)', + 'url': 'https://www.nfb.ca/film/north-star-episode-1-observation/', + 'info_dict': { + 'id': 'north-star-episode-1-observation', + 'ext': 'mp4', + 'title': 'North Star - Observation', + 'description': 'md5:c727f370839d8a817392b9e3f23655c7', + 'series': 'North Star', + 'release_year': 2023, + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Observation', + 'episode_number': 1, + 'uploader': 'Patrick Bossé', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'ONF episode with /film/ URL and French title (needs geo-bypass)', + 'url': 'https://www.onf.ca/film/etoile-du-nord-episode-1-lobservation/', + 'info_dict': { + 'id': 'etoile-du-nord-episode-1-lobservation', + 'ext': 'mp4', + 'title': 'Étoile du Nord - L\'observation', + 'description': 'md5:161a4617260dee3de70f509b2c9dd21b', + 'series': 'Étoile du Nord', + 'release_year': 2023, + 'season': 'Saison 1', + 'season_number': 1, + 'episode': 'L\'observation', + 'episode_number': 1, + 'uploader': 'Patrick Bossé', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Season 2 episode w/o episode num in id, extract from json ld', + 'url': 'https://www.onf.ca/film/liste-des-choses-qui-existent-saison-2-ours', + 'info_dict': { + 'id': 'liste-des-choses-qui-existent-saison-2-ours', + 'ext': 'mp4', + 'title': 'La liste des choses qui existent - L\'ours en peluche', + 'description': 'md5:d5e8d8fc5f3a7385a9cf0f509b37e28a', + 'series': 'La liste des choses qui existent', + 'release_year': 2022, + 'season': 'Saison 2', + 'season_number': 2, + 'episode': 'L\'ours en peluche', + 'episode_number': 12, + 'uploader': 'Francis Papillon', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'NFB film /embed/player/ page', + 'url': 'https://www.nfb.ca/film/afterlife/embed/player/', + 'info_dict': { + 'id': 'afterlife', + 'ext': 'mp4', + 'title': 'Afterlife', + 'description': 'md5:84951394f594f1fb1e62d9c43242fdf5', + 'release_year': 1978, + 'duration': 420.0, + 'uploader': 'Ishu Patel', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - video_id = self._match_id(url) + site, type_, slug = self._match_valid_url(url).group('site', 'type', 'id') + # Need to construct the URL since we match /embed/player/ URLs as well + webpage, urlh = self._download_webpage_handle(f'https://www.{site}.ca/{type_}/{slug}/', slug) + # type_ can change from film to serie(s) after redirect; new slug may have episode number + type_, slug = 
self._match_valid_url(urlh.url).group('type', 'id') - webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id) + player_data = self._search_json( + r'window\.PLAYER_OPTIONS\[[^\]]+\]\s*=', webpage, 'player data', slug) + video_id = self._match_id(player_data['overlay']['url']) # overlay url always has unique slug - iframe = self._html_search_regex( - r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)', - webpage, 'iframe', default=None, fatal=True) - if iframe.startswith('/'): - iframe = f'https://www.nfb.ca{iframe}' + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + player_data['source'], video_id, 'mp4', m3u8_id='hls') - player = self._download_webpage(iframe, video_id) + if dv_source := url_or_none(player_data.get('dvSource')): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False) + for fmt in fmts: + fmt['format_note'] = 'described video' + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) - source = self._html_search_regex( - r'source:\s*\'([^\']+)', - player, 'source', default=None, fatal=True) - - formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4') - - return { + info = { 'id': video_id, 'title': self._html_search_regex( - r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>', + r'["\']nfb_version_title["\']\s*:\s*["\']([^"\']+)', webpage, 'title', default=None), 'description': self._html_search_regex( r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)', webpage, 'description', default=None), - 'thumbnail': self._html_search_regex( - r'poster:\s*\'([^\']+)', - player, 'thumbnail', default=None), + 'thumbnail': url_or_none(player_data.get('poster')), 'uploader': self._html_search_regex( - r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - webpage, 'uploader', default=None), + r'<[^>]+\bitemprop=["\']director["\'][^>]*>([^<]+)', webpage, 'uploader', default=None), 'release_year': int_or_none(self._html_search_regex( - r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)', + r'["\']nfb_version_year["\']\s*:\s*["\']([^"\']+)', webpage, 'release_year', default=None)), + } if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id) + + return merge_dicts({ 'formats': formats, 'subtitles': subtitles, - } + }, info, self._search_json_ld(webpage, video_id, default={})) + + +class NFBSeriesIE(NFBBaseIE): + IE_NAME = 'nfb:series' + IE_DESC = 'nfb.ca and onf.ca series' + _VALID_URL = rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/', + 'playlist_mincount': 9, + 'info_dict': { + 'id': 'true-north-inside-the-rise-of-toronto-basketball', + }, + }, { + 'url': 'https://www.onf.ca/serie/la-liste-des-choses-qui-existent-serie/', + 'playlist_mincount': 26, + 'info_dict': { + 'id': 'la-liste-des-choses-qui-existent-serie', + }, + }] + + def _entries(self, episodes): + for episode in traverse_obj(episodes, lambda _, v: NFBIE.suitable(v['embed_url'])): + mobj = NFBIE._match_valid_url(episode['embed_url']) + yield self.url_result( + mobj[0], NFBIE, **self._extract_ep_info([episode], mobj.group('id'))) + + def _real_extract(self, url): + site, type_, series_id = self._match_valid_url(url).group('site', 'type', 'id') + season_path = 'saison' if type_ == 'serie' else 'season' + webpage = self._download_webpage( + 
f'https://www.{site}.ca/{type_}/{series_id}/{season_path}1/episode1', series_id) + episodes = self._extract_ep_data(webpage, series_id, fatal=True) + + return self.playlist_result(self._entries(episodes), series_id) diff --git a/yt_dlp/extractor/nfhsnetwork.py b/yt_dlp/extractor/nfhsnetwork.py index febad8fdf3..ec746ecb17 100644 --- a/yt_dlp/extractor/nfhsnetwork.py +++ b/yt_dlp/extractor/nfhsnetwork.py @@ -1,11 +1,5 @@ from .common import InfoExtractor - - -from ..utils import ( - try_get, - unified_strdate, - unified_timestamp -) +from ..utils import try_get, unified_strdate, unified_timestamp class NFHSNetworkIE(InfoExtractor): @@ -23,12 +17,12 @@ class NFHSNetworkIE(InfoExtractor): 'uploader_url': 'https://www.nfhsnetwork.com/schools/rockford-high-school-rockford-mi', 'location': 'Rockford, Michigan', 'timestamp': 1616859000, - 'upload_date': '20210327' + 'upload_date': '20210327', }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { # Non-sport activity with description 'url': 'https://www.nfhsnetwork.com/events/limon-high-school-limon-co/evt4a30e3726c', @@ -42,12 +36,12 @@ class NFHSNetworkIE(InfoExtractor): 'uploader_url': 'https://www.nfhsnetwork.com/schools/limon-high-school-limon-co', 'location': 'Limon, Colorado', 'timestamp': 1607893200, - 'upload_date': '20201213' + 'upload_date': '20201213', }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { # Postseason game 'url': 'https://www.nfhsnetwork.com/events/nfhs-network-special-events/dd8de71d45', @@ -60,12 +54,12 @@ class NFHSNetworkIE(InfoExtractor): 'uploader_url': 'https://www.nfhsnetwork.com/affiliates/socal-sports-productions', 'location': 'San Diego, California', 'timestamp': 1451187000, - 'upload_date': '20151226' + 'upload_date': '20151226', }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { # Video with no broadcasts object 'url': 'https://www.nfhsnetwork.com/events/wiaa-wi/9aa2f92f82', @@ -79,13 +73,13 @@ class NFHSNetworkIE(InfoExtractor): 'uploader_url': 'https://www.nfhsnetwork.com/associations/wiaa-wi', 'location': 'Stevens Point, Wisconsin', 'timestamp': 1421856000, - 'upload_date': '20150121' + 'upload_date': '20150121', }, 'params': { # m3u8 download 'skip_download': True, - } - } + }, + }, ] def _real_extract(self, url): @@ -97,17 +91,17 @@ def _real_extract(self, url): publisher = data.get('publishers')[0] # always exists broadcast = (publisher.get('broadcasts') or publisher.get('vods'))[0] # some (older) videos don't have a broadcasts object uploader = publisher.get('formatted_name') or publisher.get('name') - uploaderID = publisher.get('publisher_key') - pubType = publisher.get('type') - uploaderPrefix = ( - "schools" if pubType == "school" - else "associations" if "association" in pubType - else "affiliates" if (pubType == "publisher" or pubType == "affiliate") - else "schools") - uploaderPage = 'https://www.nfhsnetwork.com/%s/%s' % (uploaderPrefix, publisher.get('slug')) - location = '%s, %s' % (data.get('city'), data.get('state_name')) + uploader_id = publisher.get('publisher_key') + pub_type = publisher.get('type') + uploader_prefix = ( + 'schools' if pub_type == 'school' + else 'associations' if 'association' in pub_type + else 'affiliates' if (pub_type == 'publisher' or pub_type == 'affiliate') + else 'schools') + uploader_page = 'https://www.nfhsnetwork.com/{}/{}'.format(uploader_prefix, publisher.get('slug')) + location = '{}, {}'.format(data.get('city'), data.get('state_name')) description = broadcast.get('description') - isLive = 
broadcast.get('on_air') or broadcast.get('status') == 'on_air' or False + is_live = broadcast.get('on_air') or broadcast.get('status') == 'on_air' or False timestamp = unified_timestamp(data.get('local_start_time')) upload_date = unified_strdate(data.get('local_start_time')) @@ -117,13 +111,13 @@ def _real_extract(self, url): or self._html_search_regex(r'<h1 class="sr-hidden">(.*?)</h1>', webpage, 'title')) title = title.split('|')[0].strip() - video_type = 'broadcasts' if isLive else 'vods' - key = broadcast.get('key') if isLive else try_get(publisher, lambda x: x['vods'][0]['key']) + video_type = 'broadcasts' if is_live else 'vods' + key = broadcast.get('key') if is_live else try_get(publisher, lambda x: x['vods'][0]['key']) m3u8_url = self._download_json( - 'https://cfunity.nfhsnetwork.com/v2/%s/%s/url' % (video_type, key), + f'https://cfunity.nfhsnetwork.com/v2/{video_type}/{key}/url', video_id).get('video_url') - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=isLive) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=is_live) return { 'id': video_id, @@ -132,10 +126,10 @@ def _real_extract(self, url): 'description': description, 'timestamp': timestamp, 'uploader': uploader, - 'uploader_id': uploaderID, - 'uploader_url': uploaderPage, + 'uploader_id': uploader_id, + 'uploader_url': uploader_page, 'location': location, 'upload_date': upload_date, - 'is_live': isLive, + 'is_live': is_live, '_format_sort_fields': ('res', 'tbr'), } diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index cc3f4495c1..c537c1c47c 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -64,6 +64,85 @@ class NFLBaseIE(InfoExtractor): _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+});?\s*</script>' _ANVATO_PREFIX = 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + _CLIENT_DATA = { + 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', + 'clientSecret': 'CZuvCL49d9OwfGsR', + 'deviceId': str(uuid.uuid4()), + 'deviceInfo': base64.b64encode(json.dumps({ + 'model': 'desktop', + 'version': 'Chrome', + 'osName': 'Windows', + 'osVersion': '10.0', + }, separators=(',', ':')).encode()).decode(), + 'networkType': 'other', + 'nflClaimGroupsToAdd': [], + 'nflClaimGroupsToRemove': [], + } + _ACCOUNT_INFO = {} + _API_KEY = None + + _TOKEN = None + _TOKEN_EXPIRY = 0 + + def _get_account_info(self, url, slug): + if not self._API_KEY: + webpage = self._download_webpage(url, slug, fatal=False) or '' + self._API_KEY = self._search_regex( + r'window\.gigyaApiKey\s*=\s*["\'](\w+)["\'];', webpage, 'API key', + fatal=False) or '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f' + + cookies = self._get_cookies('https://auth-id.nfl.com/') + login_token = traverse_obj(cookies, ( + (f'glt_{self._API_KEY}', lambda k, _: k.startswith('glt_')), {lambda x: x.value}), get_all=False) + if not login_token: + self.raise_login_required() + if 'ucid' not in cookies: + raise ExtractorError( + 'Required cookies for the auth-id.nfl.com domain were not found among passed cookies. 
' + 'If using --cookies, these cookies must be exported along with .nfl.com cookies, ' + 'or else try using --cookies-from-browser instead', expected=True) + + account = self._download_json( + 'https://auth-id.nfl.com/accounts.getAccountInfo', slug, + note='Downloading account info', data=urlencode_postdata({ + 'include': 'profile,data', + 'lang': 'en', + 'APIKey': self._API_KEY, + 'sdk': 'js_latest', + 'login_token': login_token, + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': traverse_obj(cookies, ( + 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='15170'), + 'format': 'json', + }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + self._ACCOUNT_INFO = traverse_obj(account, { + 'signatureTimestamp': 'signatureTimestamp', + 'uid': 'UID', + 'uidSignature': 'UIDSignature', + }) + + if len(self._ACCOUNT_INFO) != 3: + raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) + + def _get_auth_token(self, url, slug): + if self._TOKEN and self._TOKEN_EXPIRY > int(time.time() + 30): + return + + if not self._ACCOUNT_INFO: + self._get_account_info(url, slug) + + token = self._download_json( + 'https://api.nfl.com/identity/v3/token%s' % ( + '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), + slug, headers={'Content-Type': 'application/json'}, note='Downloading access token', + data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) + + self._TOKEN = token['accessToken'] + self._TOKEN_EXPIRY = token['expiresIn'] + self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] + def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) item = video_config['playlist'][0] @@ -116,7 +195,7 @@ class NFLIE(NFLBaseIE): 'tags': 'count:6', 'duration': 157, 'categories': 'count:3', - } + }, }, { 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', 'md5': '6886b32c24b463038c760ceb55a34566', @@ -168,7 +247,7 @@ def _real_extract(self, url): class NFLPlusReplayIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:replay' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/[\w-]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/games/(?P<slug>[\w-]+)(?:/(?P<id>\d+))?' 
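The token handling hoisted into NFLBaseIE above caches the bearer token in class attributes, treats the API's expiresIn field as an absolute timestamp (it is compared directly against time.time()), and skips the network round-trip while the token is still valid for more than 30 seconds. A minimal sketch of that caching pattern under those assumptions; fetch_token is a hypothetical stand-in for the identity API call, not part of the patch:

    import time

    def fetch_token(refresh_token=None):
        # Hypothetical stand-in for the api.nfl.com identity call above;
        # a real call would use the refresh grant when refresh_token is set
        return {
            'accessToken': 'token123',
            'expiresIn': int(time.time()) + 3600,  # absolute expiry, as the extractor assumes
            'refreshToken': 'refresh123',
        }

    class TokenClient:
        # Class-level cache, shared across instances like the extractor attributes
        _TOKEN = None
        _TOKEN_EXPIRY = 0
        _REFRESH_TOKEN = None

        def get_auth_token(self):
            # Reuse the cached token unless it expires within the next 30 seconds
            if self._TOKEN and self._TOKEN_EXPIRY > int(time.time() + 30):
                return self._TOKEN
            payload = fetch_token(refresh_token=self._REFRESH_TOKEN)
            TokenClient._TOKEN = payload['accessToken']
            TokenClient._TOKEN_EXPIRY = payload['expiresIn']
            TokenClient._REFRESH_TOKEN = payload['refreshToken']
            return self._TOKEN

Writing to the class (TokenClient._TOKEN = ...) rather than to self is what lets NFLPlusReplayIE and NFLPlusEpisodeIE share one login session, mirroring how the diff moves these attributes onto the shared base class.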
_TESTS = [{ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', 'info_dict': { @@ -185,23 +264,92 @@ class NFLPlusReplayIE(NFLBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1', + 'playlist_count': 4, + 'info_dict': { + 'id': 'giants-at-vikings-2022-post-1', + }, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-patriots-2011-pre-4', + 'playlist_count': 2, + 'info_dict': { + 'id': 'giants-at-patriots-2011-pre-4', + }, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-patriots-2011-pre-4', + 'info_dict': { + 'id': '950701', + 'ext': 'mp4', + 'title': 'Giants @ Patriots', + 'description': 'Giants at Patriots on September 01, 2011', + 'uploader': 'NFL', + 'upload_date': '20210724', + 'timestamp': 1627085874, + 'duration': 1532, + 'categories': ['Game Highlights'], + 'tags': ['play-by-play'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + 'extractor_args': {'nflplusreplay': {'type': ['condensed_game']}}, + }, }] + _REPLAY_TYPES = { + 'full_game': 'Full Game', + 'full_game_spanish': 'Full Game - Spanish', + 'condensed_game': 'Condensed Game', + 'all_22': 'All-22', + } + def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + slug, video_id = self._match_valid_url(url).group('slug', 'id') + requested_types = self._configuration_arg('type', ['all']) + if 'all' in requested_types: + requested_types = list(self._REPLAY_TYPES.keys()) + requested_types = traverse_obj(self._REPLAY_TYPES, (None, requested_types)) + + if not video_id: + self._get_auth_token(url, slug) + headers = {'Authorization': f'Bearer {self._TOKEN}'} + game_id = self._download_json( + f'https://api.nfl.com/football/v2/games/externalId/slug/{slug}', slug, + 'Downloading game ID', query={'withExternalIds': 'true'}, headers=headers)['id'] + replays = self._download_json( + 'https://api.nfl.com/content/v1/videos/replays', slug, 'Downloading replays JSON', + query={'gameId': game_id}, headers=headers) + if len(requested_types) == 1: + video_id = traverse_obj(replays, ( + 'items', lambda _, v: v['subType'] == requested_types[0], 'mcpPlaybackId'), get_all=False) + + if video_id: + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + def entries(): + for replay in traverse_obj( + replays, ('items', lambda _, v: v['mcpPlaybackId'] and v['subType'] in requested_types), + ): + video_id = replay['mcpPlaybackId'] + yield self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + return self.playlist_result(entries(), slug) class NFLPlusEpisodeIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:episode' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P<id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?nfl\.com/plus/episodes/(?P<id>[\w-]+)' _TESTS = [{ - 'note': 'premium content', + 'note': 'Subscription required', 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', 'info_dict': { 'id': '1576832', 'ext': 'mp4', - 'title': 'Kurt\'s QB Insider: Conference Championships', + 'title': 'Conference Championships', 'description': 'md5:944f7fab56f7a37430bf8473f5473857', 'uploader': 'NFL', 'upload_date': '20230127', @@ -214,85 +362,9 @@ class NFLPlusEpisodeIE(NFLBaseIE): 'params': 
{'skip_download': 'm3u8'}, }] - _CLIENT_DATA = { - 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', - 'clientSecret': 'CZuvCL49d9OwfGsR', - 'deviceId': str(uuid.uuid4()), - 'deviceInfo': base64.b64encode(json.dumps({ - 'model': 'desktop', - 'version': 'Chrome', - 'osName': 'Windows', - 'osVersion': '10.0', - }, separators=(',', ':')).encode()).decode(), - 'networkType': 'other', - 'nflClaimGroupsToAdd': [], - 'nflClaimGroupsToRemove': [], - } - _ACCOUNT_INFO = {} - _API_KEY = None - - _TOKEN = None - _TOKEN_EXPIRY = 0 - - def _get_account_info(self, url, video_id): - cookies = self._get_cookies('https://www.nfl.com/') - login_token = traverse_obj(cookies, ( - (f'glt_{self._API_KEY}', f'gig_loginToken_{self._API_KEY}', - lambda k, _: k.startswith('glt_') or k.startswith('gig_loginToken_')), - {lambda x: x.value}), get_all=False) - if not login_token: - self.raise_login_required() - - account = self._download_json( - 'https://auth-id.nfl.com/accounts.getAccountInfo', video_id, - note='Downloading account info', data=urlencode_postdata({ - 'include': 'profile,data', - 'lang': 'en', - 'APIKey': self._API_KEY, - 'sdk': 'js_latest', - 'login_token': login_token, - 'authMode': 'cookie', - 'pageURL': url, - 'sdkBuild': traverse_obj(cookies, ( - 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='13642'), - 'format': 'json', - }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - self._ACCOUNT_INFO = traverse_obj(account, { - 'signatureTimestamp': 'signatureTimestamp', - 'uid': 'UID', - 'uidSignature': 'UIDSignature', - }) - - if len(self._ACCOUNT_INFO) != 3: - raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) - - def _get_auth_token(self, url, video_id): - if not self._ACCOUNT_INFO: - self._get_account_info(url, video_id) - - token = self._download_json( - 'https://api.nfl.com/identity/v3/token%s' % ( - '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), - video_id, headers={'Content-Type': 'application/json'}, note='Downloading access token', - data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) - - self._TOKEN = token['accessToken'] - self._TOKEN_EXPIRY = token['expiresIn'] - self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] - def _real_extract(self, url): slug = self._match_id(url) - - if not self._API_KEY: - webpage = self._download_webpage(url, slug, fatal=False) or '' - self._API_KEY = self._search_regex( - r'window\.gigyaApiKey=["\'](\w+)["\'];', webpage, 'API key', - default='3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f') - - if not self._TOKEN or self._TOKEN_EXPIRY <= int(time.time()): - self._get_auth_token(url, slug) - + self._get_auth_token(url, slug) video_id = self._download_json( f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={ 'Authorization': f'Bearer {self._TOKEN}', diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 59702b247e..0bd6edfcba 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -2,18 +2,26 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, + clean_html, + filter_dict, + get_element_by_class, + int_or_none, + join_nonempty, parse_duration, + remove_end, traverse_obj, + try_call, unescapeHTML, unified_timestamp, - urljoin + url_or_none, + urljoin, ) class NhkBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' - _BASE_URL_REGEX = 
r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' - _TYPE_REGEX = r'/(?P<type>video|audio)/' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/' def _call_api(self, m_id, lang, is_video, is_episode, is_clip): return self._download_json( @@ -24,21 +32,71 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip): m_id, lang, '/all' if is_video else ''), m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] + def _get_api_info(self, refresh=True): + if not refresh: + return self.cache.load('nhk', 'api_info') + + self.cache.store('nhk', 'api_info', {}) + movie_player_js = self._download_webpage( + 'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None, + note='Downloading stream API information') + api_info = { + 'url': self._search_regex( + r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'), + 'token': self._search_regex( + r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'), + } + self.cache.store('nhk', 'api_info', api_info) + return api_info + + def _extract_stream_info(self, vod_id): + for refresh in (False, True): + api_info = self._get_api_info(refresh) + if not api_info: + continue + + api_url = api_info.pop('url') + meta = traverse_obj( + self._download_json( + api_url, vod_id, 'Downloading stream url info', fatal=False, query={ + **api_info, + 'type': 'json', + 'optional_id': vod_id, + 'active_flg': 1, + }), ('meta', 0)) + stream_url = traverse_obj( + meta, ('movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) + + if stream_url: + formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + return { + **traverse_obj(meta, { + 'duration': ('duration', {int_or_none}), + 'timestamp': ('publication_date', {unified_timestamp}), + 'release_timestamp': ('insert_date', {unified_timestamp}), + 'modified_timestamp': ('update_date', {unified_timestamp}), + }), + 'formats': formats, + 'subtitles': subtitles, + } + raise ExtractorError('Unable to extract stream url') + def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None - lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if len(episode_id) == 7: + lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') + is_video = m_type != 'audio' + + if is_video: episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' if fetch_episode: episode = self._call_api( episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] - title = episode.get('sub_title_clean') or episode['sub_title'] def get_clean_field(key): - return episode.get(key + '_clean') or episode.get(key) + return clean_html(episode.get(key + '_clean') or episode.get(key)) + title = get_clean_field('sub_title') series = get_clean_field('title') thumbnails = [] @@ -47,33 +105,44 @@ def get_clean_field(key): if not img_path: continue thumbnails.append({ - 'id': '%dp' % h, + 'id': f'{h}p', 'height': h, 'width': w, 'url': 'https://www3.nhk.or.jp' + img_path, }) + episode_name = title + if series and title: + title = f'{series} - {title}' + elif series and not title: + title = series + series = None + episode_name = None + else: # title, no series + episode_name = None + info = { 'id': episode_id + '-' + lang, - 'title': '%s - %s' % (series, title) if series and title else title, + 'title': title, 'description': get_clean_field('description'), 'thumbnails': thumbnails, 'series': 
series, - 'episode': title, + 'episode': episode_name, } + if is_video: vod_id = episode['vod_id'] info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + **self._extract_stream_info(vod_id), 'id': vod_id, }) + else: if fetch_episode: - audio_path = episode['audio']['audio'] + # From https://www3.nhk.or.jp/nhkworld/common/player/radio/inline/rod.html + audio_path = remove_end(episode['audio']['audio'], '.m4a') info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + f'{urljoin("https://vod-stream.nhk.jp", audio_path)}/index.m3u8', episode_id, 'm4a', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) for f in info['formats']: @@ -88,34 +157,62 @@ def get_clean_field(key): class NhkVodIE(NhkBaseIE): - # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = [ + rf'{NhkBaseIE._BASE_URL_REGEX}shows/(?:(?P<type>video)/)?(?P<id>\d{{4}}[\da-z]\d+)/?(?:$|[?#])', + rf'{NhkBaseIE._BASE_URL_REGEX}(?:ondemand|shows)/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[\da-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}ondemand/(?P<type>video)/(?P<id>\d{{4}}[\da-z]\d+)', # deprecated + ] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # video clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2049126/', 'info_dict': { - 'id': 'a95j5iza', + 'id': 'nw_vod_v_en_2049_126_20230413233000_01_1681398302', 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", - 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', + 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', + 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', + 'thumbnail': r're:https://.+/.+\.jpg', + 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', + 'series': 'Japan Railway Journal', + 'modified_timestamp': 1707217907, + 'timestamp': 1681428600, + 'release_timestamp': 1693883728, + 'duration': 1679, + 'upload_date': '20230413', + 'modified_date': '20240206', + 'release_date': '20230905', }, }, { - # audio clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + # video clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', + 'md5': '153c3016dfd252ba09726588149cf0e7', 'info_dict': { - 'id': 'r_inventions-20201104-1-en', - 'ext': 'm4a', - 'title': "Japan's Top Inventions - Miniature Video Cameras", - 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + 'id': 'lpZXIwaDE6_Z-976CPsFdxyICyWUzlT5', + 'ext': 'mp4', + 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', + 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', + 'thumbnail': r're:https://.+/.+\.jpg', + 'series': 'Dining with the Chef', + 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', + 'duration': 148, + 'upload_date': '20190816', + 'release_date': '20230902', + 'release_timestamp': 1693619292, + 'modified_timestamp': 1707217907, + 'modified_date': '20240206', + 'timestamp': 1565997540, }, - 'params': { - # m3u8 download - 'skip_download': True, + }, { + # 
radio + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/livinginjapan-20231001-1/', + 'info_dict': { + 'id': 'livinginjapan-20231001-1-en', + 'ext': 'm4a', + 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', + 'series': 'Living in Japan', + 'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab', + 'thumbnail': r're:https://.+/.+\.jpg', + 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines', }, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', @@ -132,7 +229,6 @@ class NhkVodIE(NhkBaseIE): }, { # video, alphabetic character in ID #29670 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', - 'only_matching': True, 'info_dict': { 'id': 'qfjay6cg', 'ext': 'mp4', @@ -141,7 +237,121 @@ class NhkVodIE(NhkBaseIE): 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', 'upload_date': '20210615', 'timestamp': 1623722008, - } + }, + 'skip': '404 Not Found', + }, { + # japanese-language, longer id than english + 'url': 'https://www3.nhk.or.jp/nhkworld/ja/ondemand/video/0020271111/', + 'info_dict': { + 'id': 'nw_ja_v_jvod_ohayou_20231008', + 'ext': 'mp4', + 'title': 'おはよう日本(7時台) - 10月8日放送', + 'series': 'おはよう日本(7時台)', + 'episode': '10月8日放送', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', + }, + 'skip': 'expires 2023-10-15', + }, { + # a one-off (single-episode series). title from the api is just '<p></p>' + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/3004952/', + 'info_dict': { + 'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552', + 'ext': 'mp4', + 'title': 'Barakan Discovers - AMAMI OSHIMA: Isson\'s Treasure Isla', + 'description': 'md5:5db620c46a0698451cc59add8816b797', + 'thumbnail': r're:https://.+/.+\.jpg', + 'release_date': '20230905', + 'timestamp': 1690103400, + 'duration': 2939, + 'release_timestamp': 1693898699, + 'upload_date': '20230723', + 'modified_timestamp': 1707217907, + 'modified_date': '20240206', + 'episode': 'AMAMI OSHIMA: Isson\'s Treasure Isla', + 'series': 'Barakan Discovers', + }, + }, { + # /ondemand/video/ url with alphabetical character in 5th position of id + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a07/', + 'info_dict': { + 'id': 'nw_c_en_9999-a07', + 'ext': 'mp4', + 'episode': 'Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]', + 'series': 'Mini-Dramas on SDGs', + 'modified_date': '20240206', + 'title': 'Mini-Dramas on SDGs - Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]', + 'description': 'md5:3f9dcb4db22fceb675d90448a040d3f6', + 'timestamp': 1621962360, + 'duration': 189, + 'release_date': '20230903', + 'modified_timestamp': 1707217907, + 'upload_date': '20210525', + 'thumbnail': r're:https://.+/.+\.jpg', + 'release_timestamp': 1693713487, + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999d17/', + 'info_dict': { + 'id': 'nw_c_en_9999-d17', + 'ext': 'mp4', + 'title': 'Flowers of snow blossom - The 72 Pentads of Yamato', + 'description': 'Today’s focus: Snow', + 'release_timestamp': 1693792402, + 'release_date': '20230904', + 'upload_date': '20220128', + 'timestamp': 1643370960, + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 136, + 'series': '', + 'modified_date': '20240206', + 'modified_timestamp': 1707217907, + }, + }, { + # new /shows/ url format + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/', + 'info_dict': { + 'id': 'nw_vod_v_en_2032_307_20240321113000_01_1710990282', + 'ext': 'mp4', + 
'title': 'Japanology Plus - 20th Anniversary Special Part 1', + 'description': 'md5:817d41fc8e54339ad2a916161ea24faf', + 'episode': '20th Anniversary Special Part 1', + 'series': 'Japanology Plus', + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 1680, + 'timestamp': 1711020600, + 'upload_date': '20240321', + 'release_timestamp': 1711022683, + 'release_date': '20240321', + 'modified_timestamp': 1711031012, + 'modified_date': '20240321', + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/3020025/', + 'info_dict': { + 'id': 'nw_vod_v_en_3020_025_20230325144000_01_1679723944', + 'ext': 'mp4', + 'title': '100 Ideas to Save the World - Working Styles Evolve', + 'description': 'md5:9e6c7778eaaf4f7b4af83569649f84d9', + 'episode': 'Working Styles Evolve', + 'series': '100 Ideas to Save the World', + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 899, + 'upload_date': '20230325', + 'timestamp': 1679755200, + 'release_date': '20230905', + 'release_timestamp': 1693880540, + 'modified_date': '20240206', + 'modified_timestamp': 1707217907, + }, + }, { + # new /shows/audio/ url format + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/livinginjapan-20231001-1/', + 'only_matching': True, + }, { + # valid url even if can't be found in wild; support needed for clip entries extraction + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/9999o80/', + 'only_matching': True, }] def _real_extract(self, url): @@ -149,51 +359,91 @@ def _real_extract(self, url): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _VALID_URL = rf'''(?x) + {NhkBaseIE._BASE_URL_REGEX}(?:shows|tv)/ + (?:(?P<type>audio)/programs/)?(?P<id>\w+)/? 
+ (?:\?(?:[^#]+&)?type=(?P<episode_type>clip|(?:radio|tv)Episode))?''' _TESTS = [{ # video program episodes - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/', 'info_dict': { - 'id': 'japanrailway', - 'title': 'Japan Railway Journal', + 'id': 'sumo', + 'title': 'GRAND SUMO Highlights', + 'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf', }, 'playlist_mincount': 1, }, { - # video program clips - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', + 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, - 'playlist_mincount': 5, + 'playlist_mincount': 12, }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', - 'only_matching': True, + # video program clips + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/?type=clip', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', + }, + 'playlist_mincount': 12, }, { # audio program - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/programs/livinginjapan/', + 'info_dict': { + 'id': 'livinginjapan', + 'title': 'Living in Japan', + 'description': 'md5:665bb36ec2a12c5a7f598ee713fc2b54', + }, + 'playlist_mincount': 12, + }, { + # /tv/ program url + 'url': 'https://www3.nhk.or.jp/nhkworld/en/tv/designtalksplus/', + 'info_dict': { + 'id': 'designtalksplus', + 'title': 'DESIGN TALKS plus', + 'description': 'md5:47b3b3a9f10d4ac7b33b53b70a7d2837', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/10yearshayaomiyazaki/', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if NhkVodIE.suitable(url) else super().suitable(url) + + def _extract_meta_from_class_elements(self, class_values, html): + for class_value in class_values: + if value := clean_html(get_element_by_class(class_value, html)): + return value + def _real_extract(self, url): - lang, m_type, program_id, episode_type = self._match_valid_url(url).groups() - + lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( - program_id, lang, m_type == 'video', False, episode_type == 'clip') + program_id, lang, m_type != 'audio', False, episode_type == 'clip') - entries = [] - for episode in episodes: - episode_path = episode.get('url') - if not episode_path: - continue - entries.append(self._extract_episode_info( - urljoin(url, episode_path), episode)) + def entries(): + for episode in episodes: + if episode_path := episode.get('url'): + yield self._extract_episode_info(urljoin(url, episode_path), episode) - program_title = None - if entries: - program_title = entries[0].get('series') + html = self._download_webpage(url, program_id) + program_title = self._extract_meta_from_class_elements([ + 'p-programDetail__title', # /ondemand/program/ + 'pProgramHero__logoText', # /shows/ + 'tAudioProgramMain__title', # /shows/audio/programs/ + 'p-program-name'], html) # /tv/ + program_description = self._extract_meta_from_class_elements([ + 'p-programDetail__text', # /ondemand/program/ + 'pProgramHero__description', # /shows/ + 'tAudioProgramMain__info', # /shows/audio/programs/ + 
'p-program-description'], html) # /tv/ - return self.playlist_result(entries, program_id, program_title) + return self.playlist_result(entries(), program_id, program_title, program_description) class NhkForSchoolBangumiIE(InfoExtractor): @@ -209,7 +459,7 @@ class NhkForSchoolBangumiIE(InfoExtractor): 'upload_date': '20140402', 'ext': 'mp4', - 'chapters': 'count:12' + 'chapters': 'count:12', }, 'params': { # m3u8 download @@ -272,7 +522,8 @@ class NhkForSchoolSubjectIE(InfoExtractor): 'eigo', 'tokkatsu', 'tokushi', 'sonota', ) - _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS) + _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>{})/?(?:[\?#].*)?$'.format( + '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)) _TESTS = [{ 'url': 'https://www.nhk.or.jp/school/sougou/', @@ -302,9 +553,8 @@ def _real_extract(self, url): class NhkForSchoolProgramListIE(InfoExtractor): - _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % ( - '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS) - ) + _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:{})/[a-zA-Z0-9_-]+)'.format( + '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)) _TESTS = [{ 'url': 'https://www.nhk.or.jp/school/sougou/q/', 'info_dict': { @@ -334,3 +584,343 @@ def _real_extract(self, url): for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []] return self.playlist_result(bangumis, program_id, title, description) + + +class NhkRadiruIE(InfoExtractor): + _GEO_COUNTRIES = ['JP'] + IE_DESC = 'NHK らじる (Radiru/Rajiru)' + _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?' 
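NhkVodProgramIE above copes with four coexisting page layouts (/ondemand/program/, /shows/, /shows/audio/programs/, /tv/) by probing a list of CSS class names and keeping the first non-empty match, while its suitable() override keeps single-episode URLs routed to NhkVodIE. The probing helper is easy to exercise on its own; clean_html and get_element_by_class are the yt-dlp utilities the diff imports, and the HTML snippet here is illustrative:

    from yt_dlp.utils import clean_html, get_element_by_class

    def first_text_by_class(class_values, html):
        # Return tag-stripped text for the first candidate class that matches
        for class_value in class_values:
            if value := clean_html(get_element_by_class(class_value, html)):
                return value

    html = '<h1 class="pProgramHero__logoText"><span>Japan Railway Journal</span></h1>'
    print(first_text_by_class(['p-programDetail__title', 'pProgramHero__logoText'], html))
    # prints: Japan Railway Journal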
+ _TESTS = [{ + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_4003239', + 'skip': 'Episode expired on 2024-06-09', + 'info_dict': { + 'title': 'ジャズ・トゥナイト ジャズ「Night and Day」特集', + 'id': '0449_01_4003239', + 'ext': 'm4a', + 'uploader': 'NHK FM 東京', + 'description': 'md5:ad05f3c3f3f6e99b2e69f9b5e49551dc', + 'series': 'ジャズ・トゥナイト', + 'channel': 'NHK FM 東京', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg', + 'upload_date': '20240601', + 'series_id': '0449_01', + 'release_date': '20240601', + 'timestamp': 1717257600, + 'release_timestamp': 1717250400, + }, + }, { + # playlist, airs every weekday so it should _hopefully_ be okay forever + 'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01', + 'info_dict': { + 'id': '0458_01', + 'title': 'ベストオブクラシック', + 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', + 'series_id': '0458_01', + 'uploader': 'NHK FM', + 'channel': 'NHK FM', + 'series': 'ベストオブクラシック', + }, + 'playlist_mincount': 3, + }, { + # one with letters in the id + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F683_01_3910688', + 'note': 'Expires on 2025-03-31', + 'info_dict': { + 'id': 'F683_01_3910688', + 'ext': 'm4a', + 'title': '夏目漱石「文鳥」第1回', + 'series': '【らじる文庫】夏目漱石「文鳥」(全4回)', + 'series_id': 'F683_01', + 'description': '朗読:浅井理アナウンサー', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F683/img/roudoku_05_rod_640.jpg', + 'upload_date': '20240106', + 'release_date': '20240106', + 'uploader': 'NHK R1', + 'release_timestamp': 1704511800, + 'channel': 'NHK R1', + 'timestamp': 1704512700, + }, + 'expected_warnings': ['Unable to download JSON metadata', + 'Failed to get extended metadata. API returned Error 1: Invalid parameters'], + }, { + # news + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_4012173', + 'info_dict': { + 'id': 'F261_01_4012173', + 'ext': 'm4a', + 'channel': 'NHKラジオ第1', + 'uploader': 'NHKラジオ第1', + 'series': 'NHKラジオニュース', + 'title': '午前0時のNHKニュース', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'release_timestamp': 1718290800, + 'release_date': '20240613', + 'timestamp': 1718291400, + 'upload_date': '20240613', + }, + }, { + # fallback when extended metadata fails + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=2834_01_4009298', + 'skip': 'Expires on 2024-06-07', + 'info_dict': { + 'id': '2834_01_4009298', + 'title': 'まち☆キラ!開成町特集', + 'ext': 'm4a', + 'release_date': '20240531', + 'upload_date': '20240531', + 'series': 'はま☆キラ!', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/2834/g2834.jpg', + 'channel': 'NHK R1,FM', + 'description': '', + 'timestamp': 1717123800, + 'uploader': 'NHK R1,FM', + 'release_timestamp': 1717120800, + 'series_id': '2834_01', + }, + 'expected_warnings': ['Failed to get extended metadata. API returned empty list.'], + }] + + _API_URL_TMPL = None + + def _extract_extended_metadata(self, episode_id, aa_vinfo): + service, _, area = traverse_obj(aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')})) + detail_url = try_call( + lambda: self._API_URL_TMPL.format(area=area, service=service, dateid=aa_vinfo[3])) + if not detail_url: + return {} + + response = self._download_json( + detail_url, episode_id, 'Downloading extended metadata', + 'Failed to download extended metadata', fatal=False, expected_status=400) + if not response: + return {} + + if error := traverse_obj(response, ('error', {dict})): + self.report_warning( + 'Failed to get extended metadata. 
API returned ' + f'Error {join_nonempty("code", "message", from_dict=error, delim=": ")}') + return {} + + full_meta = traverse_obj(response, ('list', service, 0, {dict})) + if not full_meta: + self.report_warning('Failed to get extended metadata. API returned empty list.') + return {} + + station = ' '.join(traverse_obj(full_meta, (('service', 'area'), 'name', {str}))) or None + thumbnails = [{ + 'id': str(id_), + 'preference': 1 if id_.startswith('thumbnail') else -2 if id_.startswith('logo') else -1, + **traverse_obj(thumb, { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + } for id_, thumb in traverse_obj(full_meta, ('images', {dict.items}, lambda _, v: v[1]['url']))] + + return filter_dict({ + 'channel': station, + 'uploader': station, + 'description': join_nonempty( + 'subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta), + 'thumbnails': thumbnails, + **traverse_obj(full_meta, { + 'title': ('title', {str}), + 'timestamp': ('end_time', {unified_timestamp}), + 'release_timestamp': ('start_time', {unified_timestamp}), + }), + }) + + def _extract_episode_info(self, episode, programme_id, series_meta): + episode_id = f'{programme_id}_{episode["id"]}' + aa_vinfo = traverse_obj(episode, ('aa_contents_id', {lambda x: x.split(';')})) + extended_metadata = self._extract_extended_metadata(episode_id, aa_vinfo) + fallback_start_time, _, fallback_end_time = traverse_obj( + aa_vinfo, (4, {str}, {lambda x: (x or '').partition('_')})) + + return { + **series_meta, + 'id': episode_id, + 'formats': self._extract_m3u8_formats(episode.get('stream_url'), episode_id, fatal=False), + 'container': 'm4a_dash', # force fixup, AAC-only HLS + 'was_live': True, + 'title': episode.get('program_title'), + 'description': episode.get('program_sub_title'), # fallback + 'timestamp': unified_timestamp(fallback_end_time), + 'release_timestamp': unified_timestamp(fallback_start_time), + **extended_metadata, + } + + def _extract_news_info(self, headline, programme_id, series_meta): + episode_id = f'{programme_id}_{headline["headline_id"]}' + episode = traverse_obj(headline, ('file_list', 0, {dict})) + + return { + **series_meta, + 'id': episode_id, + 'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False), + 'container': 'm4a_dash', # force fixup, AAC-only HLS + 'was_live': True, + 'series': series_meta.get('title'), + 'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'), + **traverse_obj(episode, { + 'title': ('file_title', {str}), + 'description': ('file_title_sub', {str}), + 'timestamp': ('open_time', {unified_timestamp}), + 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}), + }), + } + + def _real_initialize(self): + if self._API_URL_TMPL: + return + api_config = self._download_xml( + 'https://www.nhk.or.jp/radio/config/config_web.xml', None, 'Downloading API config', fatal=False) + NhkRadiruIE._API_URL_TMPL = try_call(lambda: f'https:{api_config.find(".//url_program_detail").text}') + + def _real_extract(self, url): + site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline') + programme_id = f'{site_id}_{corner_id}' + + if site_id == 'F261': # XXX: News programmes use old API (for now?) 
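NhkRadiruIE leans heavily on traverse_obj with transform sets, for example splitting the service and area out of aa_contents_id with a partition step in _extract_extended_metadata above. A small self-contained illustration of that traversal idiom; the aa_vinfo value is made up to show the expected shape, while the path is the one used in the diff:

    from yt_dlp.utils.traversal import traverse_obj

    # Illustrative aa_vinfo: index 2 holds 'service,area', later indices hold date ids
    aa_vinfo = ['nw_id', 'prog_id', 'r1,130', '2024060112000']
    service, _, area = traverse_obj(
        aa_vinfo, (2, {str}, {lambda x: (x or '').partition(',')}))
    print(service, area)  # prints: r1 130

In traverse_obj paths, a set containing a type ({str}) filters values by isinstance and a set containing a callable applies it as a transform, so the chain yields the ('r1', ',', '130') partition tuple that the extractor unpacks.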
+ meta = self._download_json( + 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json', programme_id)['main'] + series_meta = traverse_obj(meta, { + 'title': ('program_name', {str}), + 'channel': ('media_name', {str}), + 'uploader': ('media_name', {str}), + 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), + }, get_all=False) + + if headline_id: + headline = traverse_obj( + meta, ('detail_list', lambda _, v: v['headline_id'] == headline_id, any)) + if not headline: + raise ExtractorError('Content not found; it has most likely expired', expected=True) + return self._extract_news_info(headline, programme_id, series_meta) + + def news_entries(): + for headline in traverse_obj(meta, ('detail_list', ..., {dict})): + yield self._extract_news_info(headline, programme_id, series_meta) + + return self.playlist_result( + news_entries(), programme_id, description=meta.get('site_detail'), **series_meta) + + meta = self._download_json( + 'https://www.nhk.or.jp/radio-api/app/v1/web/ondemand/series', programme_id, query={ + 'site_id': site_id, + 'corner_site_id': corner_id, + }) + + fallback_station = join_nonempty('NHK', traverse_obj(meta, ('radio_broadcast', {str})), delim=' ') + series_meta = { + 'series': join_nonempty('title', 'corner_name', delim=' ', from_dict=meta), + 'series_id': programme_id, + 'thumbnail': traverse_obj(meta, ('thumbnail_url', {url_or_none})), + 'channel': fallback_station, + 'uploader': fallback_station, + } + + if headline_id: + episode = traverse_obj(meta, ('episodes', lambda _, v: v['id'] == int(headline_id), any)) + if not episode: + raise ExtractorError('Content not found; it has most likely expired', expected=True) + return self._extract_episode_info(episode, programme_id, series_meta) + + def entries(): + for episode in traverse_obj(meta, ('episodes', ..., {dict})): + yield self._extract_episode_info(episode, programme_id, series_meta) + + return self.playlist_result( + entries(), programme_id, title=series_meta.get('series'), + description=meta.get('series_description'), **series_meta) + + +class NhkRadioNewsPageIE(InfoExtractor): + _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])' + _TESTS = [{ + # airs daily, on-the-hour most hours + 'url': 'https://www.nhk.or.jp/radionews/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'F261_01', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', + 'channel': 'NHKラジオ第1', + 'uploader': 'NHKラジオ第1', + 'title': 'NHKラジオニュース', + }, + }] + + def _real_extract(self, url): + return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE) + + +class NhkRadiruLiveIE(InfoExtractor): + _GEO_COUNTRIES = ['JP'] + _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)' + _TESTS = [{ + # radio 1, no area specified + 'url': 'https://www.nhk.or.jp/radio/player/?ch=r1', + 'info_dict': { + 'id': 'r1-tokyo', + 'title': 're:^NHKネットラジオ第1 東京.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png', + 'live_status': 'is_live', + }, + }, { + # radio 2, area specified + # (the area doesnt actually matter, r2 is national) + 'url': 'https://www.nhk.or.jp/radio/player/?ch=r2', + 'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}}, + 'info_dict': { + 'id': 'r2-fukuoka', + 'title': 're:^NHKネットラジオ第2 福岡.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png', + 'live_status': 'is_live', + }, + 
}, { + # fm, area specified + 'url': 'https://www.nhk.or.jp/radio/player/?ch=fm', + 'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}}, + 'info_dict': { + 'id': 'fm-sapporo', + 'title': 're:^NHKネットラジオFM 札幌.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png', + 'live_status': 'is_live', + }, + }] + + _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'} + + def _real_extract(self, url): + station = self._match_id(url) + area = self._configuration_arg('area', ['tokyo'])[0] + + config = self._download_xml( + 'https://www.nhk.or.jp/radio/config/config_web.xml', station, 'Downloading area information') + data = config.find(f'.//data//area[.="{area}"]/..') + + if not data: + raise ExtractorError('Invalid area. Valid areas are: {}'.format(', '.join( + [i.text for i in config.findall('.//data//area')])), expected=True) + + noa_info = self._download_json( + f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text), + station, note=f'Downloading {area} station metadata', fatal=False) + present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present')) + + return { + 'title': ' '.join(traverse_obj(present_info, (('service', 'area'), 'name', {str}))), + 'id': join_nonempty(station, area), + 'thumbnails': traverse_obj(present_info, ('service', 'images', ..., { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + })), + 'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station), + 'is_live': True, + } diff --git a/yt_dlp/extractor/nhl.py b/yt_dlp/extractor/nhl.py index 2521c40e08..ca47a81211 100644 --- a/yt_dlp/extractor/nhl.py +++ b/yt_dlp/extractor/nhl.py @@ -1,10 +1,10 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, int_or_none, - parse_iso8601, + join_nonempty, parse_duration, + parse_iso8601, ) @@ -12,8 +12,8 @@ class NHLBaseIE(InfoExtractor): def _real_extract(self, url): site, tmp_id = self._match_valid_url(url).groups() video_data = self._download_json( - 'https://%s/%s/%sid/v1/%s/details/web-v1.json' - % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + 'https://{}/{}/{}id/v1/{}/details/web-v1.json'.format( + self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) if video_data.get('type') != 'video': video_data = video_data['media'] video = video_data.get('video') @@ -24,7 +24,7 @@ def _real_extract(self, url): if videos: video_data = videos[0] - video_id = compat_str(video_data['id']) + video_id = str(video_data['id']) title = video_data['title'] formats = [] @@ -42,7 +42,7 @@ def _real_extract(self, url): else: height = int_or_none(playback.get('height')) formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'format_id': playback.get('name') or join_nonempty('http', height and f'{height}p'), 'url': playback_url, 'width': int_or_none(playback.get('width')), 'height': height, diff --git a/yt_dlp/extractor/nick.py b/yt_dlp/extractor/nick.py index de22cb8d64..653b10b9d0 100644 --- a/yt_dlp/extractor/nick.py +++ b/yt_dlp/extractor/nick.py @@ -22,7 +22,7 @@ class NickIE(MTVServicesInfoExtractor): 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S1', 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. 
Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', - } + }, }, { 'md5': '839a04f49900a1fcbf517020d94e0737', @@ -32,7 +32,7 @@ class NickIE(MTVServicesInfoExtractor): 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S2', 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', - } + }, }, { 'md5': 'f1145699f199770e2919ee8646955d46', @@ -42,7 +42,7 @@ class NickIE(MTVServicesInfoExtractor): 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S3', 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', - } + }, }, { 'md5': 'd463116875aee2585ee58de3b12caebd', @@ -52,7 +52,7 @@ class NickIE(MTVServicesInfoExtractor): 'title': 'SpongeBob SquarePants: "A Place for Pets/Lockdown for Love" S4', 'description': 'A Place for Pets/Lockdown for Love: When customers bring pets into the Krusty Krab, Mr. Krabs realizes pets are more profitable than owners. Plankton ruins another date with Karen, so she puts the Chum Bucket on lockdown until he proves his affection.', - } + }, }, ], }, { @@ -63,7 +63,7 @@ class NickIE(MTVServicesInfoExtractor): 'description': 'md5:9d65a66df38e02254852794b2809d1cf', 'title': 'Blue\'s Imagination Station', }, - 'skip': 'Not accessible?' + 'skip': 'Not accessible?', }] def _get_feed_query(self, uri): @@ -74,10 +74,10 @@ def _get_feed_query(self, uri): def _real_extract(self, url): domain, video_type, display_id = self._match_valid_url(url).groups() - if video_type.startswith("episodes"): + if video_type.startswith('episodes'): return super()._real_extract(url) video_data = self._download_json( - 'http://%s/data/video.endLevel.json' % domain, + f'http://{domain}/data/video.endLevel.json', display_id, query={ 'urlKey': display_id, }) @@ -184,30 +184,10 @@ class NickDeIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri, url=None): video_id = self._id_from_uri(uri) config = self._download_json( - 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge&ref=%s' % (uri, url), video_id) + f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge&ref={url}', video_id) return self._remove_template_parameter(config['feedWithQueryParams']) -class NickNightIE(NickDeIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'nicknight' - _VALID_URL = r'https?://(?:www\.)(?P<host>nicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://www.nicknight.at/shows/977-awkward/videos/85987-nimmer-beste-freunde', - 'only_matching': True, - }, { - 'url': 'http://www.nicknight.at/shows/977-awkward', - 'only_matching': True, - }, { - 'url': 'http://www.nicknight.at/shows/1900-faking-it', - 'only_matching': True, - }] - - def _extract_mrss_url(self, webpage, *args): - return self._search_regex( - r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, - 'mrss url', group='url') - - class NickRuIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeonru' _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' @@ -241,4 +221,4 @@ def 
_real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) mgid = self._extract_mgid(webpage, url) - return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) + return self.url_result(f'http://media.mtvnservices.com/embed/{mgid}') diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 9c3a5a4bc8..179e7a9b16 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -1,24 +1,22 @@ -import datetime +import datetime as dt import functools import itertools import json import re import time +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_HTTPError, -) +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, - bug_reports_message, clean_html, float_or_none, int_or_none, join_nonempty, parse_duration, - parse_filesize, parse_iso8601, parse_resolution, qualities, @@ -37,10 +35,11 @@ class NiconicoIE(InfoExtractor): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' + _GEO_COUNTRIES = ['JP'] + _GEO_BYPASS = False _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -54,25 +53,31 @@ class NiconicoIE(InfoExtractor): 'duration': 33, 'view_count': int, 'comment_count': int, + 'genres': ['未設定'], + 'tags': [], }, - 'skip': 'Requires an account', + 'params': {'skip_download': 'm3u8'}, }, { # File downloaded with and without credentials are different, so omit # the md5 field 'url': 'http://www.nicovideo.jp/watch/nm14296458', 'info_dict': { 'id': 'nm14296458', - 'ext': 'swf', - 'title': '【鏡音リン】Dance on media【オリジナル】take2!', - 'description': 'md5:689f066d74610b3b22e0f1739add0f58', + 'ext': 'mp4', + 'title': '【Kagamine Rin】Dance on media【Original】take2!', + 'description': 'md5:9368f2b1f4178de64f2602c2f3d6cbf5', 'thumbnail': r're:https?://.*', 'uploader': 'りょうた', 'uploader_id': '18822557', 'upload_date': '20110429', 'timestamp': 1304065916, - 'duration': 209, + 'duration': 208.0, + 'comment_count': int, + 'view_count': int, + 'genres': ['音楽・サウンド'], + 'tags': ['Translation_Request', 'Kagamine_Rin', 'Rin_Original'], }, - 'skip': 'Requires an account', + 'params': {'skip_download': 'm3u8'}, }, { # 'video exists but is marked as "deleted" # md5 is unstable @@ -106,60 +111,61 @@ class NiconicoIE(InfoExtractor): }, { # video not available via `getflv`; "old" HTML5 video 'url': 'http://www.nicovideo.jp/watch/sm1151009', - 'md5': '8fa81c364eb619d4085354eab075598a', 'info_dict': { 'id': 'sm1151009', 'ext': 'mp4', 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', - 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', + 'description': 'md5:f95a3d259172667b293530cc2e41ebda', 'thumbnail': r're:https?://.*', 'duration': 184, - 'timestamp': 1190868283, - 'upload_date': '20070927', + 'timestamp': 1190835883, + 'upload_date': '20070926', 'uploader': 'denden2', 'uploader_id': '1392194', 'view_count': int, 'comment_count': int, + 'genres': ['ゲーム'], + 'tags': [], }, - 'skip': 'Requires an account', + 'params': {'skip_download': 'm3u8'}, }, { # "New" HTML5 video - # md5 is unstable 'url': 'http://www.nicovideo.jp/watch/sm31464864', 'info_dict': { 'id': 'sm31464864', 'ext': 'mp4', 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', - 'timestamp': 1498514060, + 'timestamp': 1498481660, 'upload_date': '20170626', - 'uploader': 'ゲスト', + 'uploader': 
'no-namamae', 'uploader_id': '40826363', 'thumbnail': r're:https?://.*', 'duration': 198, 'view_count': int, 'comment_count': int, + 'genres': ['アニメ'], + 'tags': [], }, - 'skip': 'Requires an account', + 'params': {'skip_download': 'm3u8'}, }, { # Video without owner 'url': 'http://www.nicovideo.jp/watch/sm18238488', - 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', 'info_dict': { 'id': 'sm18238488', 'ext': 'mp4', 'title': '【実写版】ミュータントタートルズ', 'description': 'md5:15df8988e47a86f9e978af2064bf6d8e', - 'timestamp': 1341160408, + 'timestamp': 1341128008, 'upload_date': '20120701', - 'uploader': None, - 'uploader_id': None, 'thumbnail': r're:https?://.*', 'duration': 5271, 'view_count': int, 'comment_count': int, + 'genres': ['エンターテイメント'], + 'tags': [], }, - 'skip': 'Requires an account', + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -171,9 +177,6 @@ class NiconicoIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' - _COMMENT_API_ENDPOINTS = ( - 'https://nvcomment.nicovideo.jp/legacy/api.json', - 'https://nmsg.nicovideo.jp/api.json',) _API_HEADERS = { 'X-Frontend-ID': '6', 'X-Frontend-Version': '0', @@ -206,7 +209,7 @@ def _perform_login(self, username, password): urljoin('https://account.nicovideo.jp', post_url), None, note='Performing MFA', errnote='Unable to complete MFA', data=urlencode_postdata({ - 'otp': self._get_tfa_info('6 digits code') + 'otp': self._get_tfa_info('6 digits code'), }), headers={ 'Content-Type': 'application/x-www-form-urlencoded', }) @@ -257,7 +260,7 @@ def ping(): 'http_output_download_parameters': { 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), - } + }, } elif dmc_protocol == 'hls': protocol = 'm3u8' @@ -270,14 +273,14 @@ def ping(): 'transfer_preset': '', 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), - } + }, } if 'hls_encryption' in parsed_token and encryption: protocol_parameters['hls_parameters']['encryption'] = { parsed_token['hls_encryption']: { 'encrypted_key': encryption['encryptedKey'], 'key_uri': encryption['keyUri'], - } + }, } else: protocol = 'm3u8_native' @@ -288,7 +291,7 @@ def ping(): session_api_endpoint['url'], video_id, query={'_format': 'json'}, headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for %s' % info_dict['format_id'], + note='Downloading JSON metadata for {}'.format(info_dict['format_id']), data=json.dumps({ 'session': { 'client_info': { @@ -298,7 +301,7 @@ def ping(): 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]), 'content_key_timeout': session_api_data.get('contentKeyTimeout'), 'service_id': 'nicovideo', - 'service_user_id': session_api_data.get('serviceUserId') + 'service_user_id': session_api_data.get('serviceUserId'), }, 'content_id': session_api_data.get('contentId'), 'content_src_id_sets': [{ @@ -306,34 +309,34 @@ def ping(): 'src_id_to_mux': { 'audio_src_ids': [audio_src_id], 'video_src_ids': [video_src_id], - } - }] + }, + }], }], 'content_type': 'movie', 'content_uri': '', 'keep_method': { 'heartbeat': { - 'lifetime': session_api_data.get('heartbeatLifetime') - } + 'lifetime': session_api_data.get('heartbeatLifetime'), + }, }, 'priority': session_api_data['priority'], 'protocol': { 
'name': 'http', 'parameters': { 'http_parameters': { - 'parameters': protocol_parameters - } - } + 'parameters': protocol_parameters, + }, + }, }, 'recipe_id': session_api_data.get('recipeId'), 'session_operation_auth': { 'session_operation_auth_by_signature': { 'signature': session_api_data.get('signature'), 'token': session_api_data.get('token'), - } + }, }, - 'timing_constraint': 'unlimited' - } + 'timing_constraint': 'unlimited', + }, }).encode()) info_dict['url'] = session_response['data']['session']['content_uri'] @@ -345,7 +348,7 @@ def ping(): 'data': json.dumps(session_response['data']), # interval, convert milliseconds to seconds, then halve to make a buffer. 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000), - 'ping': ping + 'ping': ping, } return info_dict, heartbeat_info_dict @@ -355,36 +358,93 @@ def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dm if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): return None - def extract_video_quality(video_quality): - return parse_filesize('%sB' % self._search_regex( - r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default='')) - format_id = '-'.join( [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol]) vid_qual_label = traverse_obj(video_quality, ('metadata', 'label')) - vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate')) return { - 'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']), + 'url': 'niconico_dmc:{}/{}/{}'.format(video_id, video_quality['id'], audio_quality['id']), 'format_id': format_id, 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '), 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 'acodec': 'aac', 'vcodec': 'h264', - 'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000), - 'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000), - 'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')), - 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')), + **traverse_obj(audio_quality, ('metadata', { + 'abr': ('bitrate', {functools.partial(float_or_none, scale=1000)}), + 'asr': ('samplingRate', {int_or_none}), + })), + **traverse_obj(video_quality, ('metadata', { + 'vbr': ('bitrate', {functools.partial(float_or_none, scale=1000)}), + 'height': ('resolution', 'height', {int_or_none}), + 'width': ('resolution', 'width', {int_or_none}), + })), 'quality': -2 if 'low' in video_quality['id'] else None, 'protocol': 'niconico_dmc', 'expected_protocol': dmc_protocol, # XXX: This is not a documented field 'http_headers': { 'Origin': 'https://www.nicovideo.jp', 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, - } + }, } + def _yield_dmc_formats(self, api_data, video_id): + dmc_data = traverse_obj(api_data, ('media', 'delivery', 'movie')) + audios = traverse_obj(dmc_data, ('audios', ..., {dict})) + videos = traverse_obj(dmc_data, ('videos', ..., {dict})) + protocols = traverse_obj(dmc_data, ('session', 'protocols', ..., {str})) + if not all((audios, videos, protocols)): + return + + for audio_quality, video_quality, protocol in itertools.product(audios, videos, protocols): + if fmt := self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol): + yield fmt + + def _yield_dms_formats(self, api_data, video_id): + fmt_filter = lambda _, v: v['isAvailable'] and v['id'] + 
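A minimal sketch (editorial aside, not part of the patch) of the `traverse_obj` predicate idiom `_yield_dms_formats` relies on: a `lambda key, value` element in the traversal path branches over a list and keeps only the entries the callable accepts. The dict below is toy data shaped like the `domand` API response:

```python
from yt_dlp.utils.traversal import traverse_obj

# Toy stand-in for the real API payload
api_data = {'media': {'domand': {'videos': [
    {'id': 'video-h264-1080p', 'isAvailable': True},
    {'id': 'video-h264-360p', 'isAvailable': False},
]}}}

# Same predicate as in the patch: keep entries that are available and have an id
fmt_filter = lambda _, v: v['isAvailable'] and v['id']
videos = traverse_obj(api_data, ('media', 'domand', 'videos', fmt_filter))
assert [v['id'] for v in videos] == ['video-h264-1080p']
```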
videos = traverse_obj(api_data, ('media', 'domand', 'videos', fmt_filter)) + audios = traverse_obj(api_data, ('media', 'domand', 'audios', fmt_filter)) + access_key = traverse_obj(api_data, ('media', 'domand', 'accessRightKey', {str})) + track_id = traverse_obj(api_data, ('client', 'watchTrackId', {str})) + if not all((videos, audios, access_key, track_id)): + return + + dms_m3u8_url = self._download_json( + f'https://nvapi.nicovideo.jp/v1/watch/{video_id}/access-rights/hls', video_id, + data=json.dumps({ + 'outputs': list(itertools.product((v['id'] for v in videos), (a['id'] for a in audios))), + }).encode(), query={'actionTrackId': track_id}, headers={ + 'x-access-right-key': access_key, + 'x-frontend-id': 6, + 'x-frontend-version': 0, + 'x-request-with': 'https://www.nicovideo.jp', + })['data']['contentUrl'] + # Getting all audio formats results in duplicate video formats which we filter out later + dms_fmts = self._extract_m3u8_formats(dms_m3u8_url, video_id) + + # m3u8 extraction does not provide audio bitrates, so extract from the API data and fix + for audio_fmt in traverse_obj(dms_fmts, lambda _, v: v['vcodec'] == 'none'): + yield { + **audio_fmt, + **traverse_obj(audios, (lambda _, v: audio_fmt['format_id'].startswith(v['id']), { + 'format_id': ('id', {str}), + 'abr': ('bitRate', {functools.partial(float_or_none, scale=1000)}), + 'asr': ('samplingRate', {int_or_none}), + }), get_all=False), + 'acodec': 'aac', + 'ext': 'm4a', + } + + # Sort before removing dupes to keep the format dicts with the lowest tbr + video_fmts = sorted((fmt for fmt in dms_fmts if fmt['vcodec'] != 'none'), key=lambda f: f['tbr']) + self._remove_duplicate_formats(video_fmts) + # Calculate the true vbr/tbr by subtracting the lowest abr + min_abr = min(traverse_obj(audios, (..., 'bitRate', {float_or_none})), default=0) / 1000 + for video_fmt in video_fmts: + video_fmt['tbr'] -= min_abr + video_fmt['format_id'] = f'video-{video_fmt["tbr"]:.0f}' + yield video_fmt + def _real_extract(self, url): video_id = self._match_id(url) @@ -392,38 +452,50 @@ def _real_extract(self, url): webpage, handle = self._download_webpage_handle( 'https://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): - video_id = self._match_id(handle.geturl()) + video_id = self._match_id(handle.url) - api_data = self._parse_json(self._html_search_regex( - 'data-api-data="([^"]+)"', webpage, - 'API data', default='{}'), video_id) + api_data = traverse_obj( + self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id), + ('data', 'response', {dict})) + if not api_data: + raise ExtractorError('Server response data not found') except ExtractorError as e: try: api_data = self._download_json( - 'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id, + f'https://www.nicovideo.jp/api/watch/v3/{video_id}?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_{round(time.time() * 1000)}', video_id, note='Downloading API JSON', errnote='Unable to fetch data')['data'] except ExtractorError: - if not isinstance(e.cause, compat_HTTPError): + if not isinstance(e.cause, HTTPError): raise - webpage = e.cause.read().decode('utf-8', 'replace') + webpage = e.cause.response.read().decode('utf-8', 'replace') error_msg = self._html_search_regex( r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>', webpage, 'error reason', default=None) if not error_msg: raise - raise 
ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True) + raise ExtractorError(clean_html(error_msg), expected=True) - formats = [] - - def get_video_info(*items, get_first=True, **kwargs): - return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs) - - quality_info = api_data['media']['delivery']['movie'] - session_api_data = quality_info['session'] - for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']): - fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol) - if fmt: - formats.append(fmt) + availability = self._availability(**(traverse_obj(api_data, ('payment', 'video', { + 'needs_premium': ('isPremium', {bool}), + 'needs_subscription': ('isAdmission', {bool}), + })) or {'needs_auth': True})) + formats = [*self._yield_dmc_formats(api_data, video_id), + *self._yield_dms_formats(api_data, video_id)] + if not formats: + fail_msg = clean_html(self._html_search_regex( + r'<p[^>]+\bclass="fail-message"[^>]*>(?P<msg>.+?)</p>', + webpage, 'fail message', default=None, group='msg')) + if fail_msg: + self.to_screen(f'Niconico said: {fail_msg}') + if fail_msg and 'された地域と同じ地域からのみ視聴できます。' in fail_msg: + availability = None + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + elif availability == 'premium_only': + self.raise_login_required('This video requires premium', metadata_available=True) + elif availability == 'subscriber_only': + self.raise_login_required('This video is for members only', metadata_available=True) + elif availability == 'needs_auth': + self.raise_login_required(metadata_available=False) # Start extracting information tags = None @@ -442,11 +514,15 @@ def get_video_info(*items, get_first=True, **kwargs): thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp']) + def get_video_info(*items, get_first=True, **kwargs): + return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs) + return { 'id': video_id, '_api_data': api_data, 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None), 'formats': formats, + 'availability': availability, 'thumbnails': [{ 'id': key, 'url': url, @@ -469,81 +545,38 @@ def get_video_info(*items, get_first=True, **kwargs): parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None)) or get_video_info('duration')), 'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}', - 'subtitles': self.extract_subtitles(video_id, api_data, session_api_data), + 'subtitles': self.extract_subtitles(video_id, api_data), } - def _get_subtitles(self, video_id, api_data, session_api_data): - comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey')) - user_id_str = session_api_data.get('serviceUserId') - - thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive'])) - raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) - if not raw_danmaku: - self.report_warning(f'Failed to get comments. 
{bug_reports_message()}') + def _get_subtitles(self, video_id, api_data): + comments_info = traverse_obj(api_data, ('comment', 'nvComment', {dict})) or {} + if not comments_info.get('server'): return + + danmaku = traverse_obj(self._download_json( + f'{comments_info["server"]}/v1/threads', video_id, data=json.dumps({ + 'additionals': {}, + 'params': comments_info.get('params'), + 'threadKey': comments_info.get('threadKey'), + }).encode(), fatal=False, + headers={ + 'Referer': 'https://www.nicovideo.jp/', + 'Origin': 'https://www.nicovideo.jp', + 'Content-Type': 'text/plain;charset=UTF-8', + 'x-client-os-type': 'others', + 'x-frontend-id': '6', + 'x-frontend-version': '0', + }, + note='Downloading comments', errnote='Failed to download comments'), + ('data', 'threads', ..., 'comments', ...)) + return { 'comments': [{ 'ext': 'json', - 'data': json.dumps(raw_danmaku), + 'data': json.dumps(danmaku), }], } - def _extract_all_comments(self, video_id, threads, user_id, user_key): - auth_data = { - 'user_id': user_id, - 'userkey': user_key, - } if user_id and user_key else {'user_id': ''} - - # Request Start - post_data = [{'ping': {'content': 'rs:0'}}] - for i, thread in enumerate(threads): - thread_id = thread['id'] - thread_fork = thread['fork'] - # Post Start (2N) - post_data.append({'ping': {'content': f'ps:{i * 2}'}}) - post_data.append({'thread': { - 'fork': thread_fork, - 'language': 0, - 'nicoru': 3, - 'scores': 1, - 'thread': thread_id, - 'version': '20090904', - 'with_global': 1, - **auth_data, - }}) - # Post Final (2N) - post_data.append({'ping': {'content': f'pf:{i * 2}'}}) - - # Post Start (2N+1) - post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}}) - post_data.append({'thread_leaves': { - # format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments' - # unfortunately NND limits (deletes?) 
comment returns this way, so you're only able to grab the last 1000 per language - 'content': '0-999999:999999,999999,nicoru:999999', - 'fork': thread_fork, - 'language': 0, - 'nicoru': 3, - 'scores': 1, - 'thread': thread_id, - **auth_data, - }}) - # Post Final (2N+1) - post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}}) - # Request Final - post_data.append({'ping': {'content': 'rf:0'}}) - - for api_url in self._COMMENT_API_ENDPOINTS: - comments = self._download_json( - api_url, video_id, data=json.dumps(post_data).encode(), fatal=False, - headers={ - 'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id, - 'Origin': 'https://www.nicovideo.jp', - 'Content-Type': 'text/plain;charset=UTF-8', - }, - note='Downloading comments', errnote=f'Failed to access endpoint {api_url}') - if comments: - return comments - class NiconicoPlaylistBaseIE(InfoExtractor): _PAGE_SIZE = 100 @@ -551,7 +584,7 @@ class NiconicoPlaylistBaseIE(InfoExtractor): _API_HEADERS = { 'X-Frontend-ID': '6', 'X-Frontend-Version': '0', - 'X-Niconico-Language': 'en-us' + 'X-Niconico-Language': 'en-us', } def _call_api(self, list_id, resource, query): @@ -566,7 +599,7 @@ def _parse_owner(item): def _fetch_page(self, list_id, page): page += 1 - resp = self._call_api(list_id, 'page %d' % page, { + resp = self._call_api(list_id, f'page {page}', { 'page': page, 'pageSize': self._PAGE_SIZE, }) @@ -636,10 +669,10 @@ def _real_extract(self, url): class NiconicoSeriesIE(InfoExtractor): IE_NAME = 'niconico:series' - _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp(?:/user/\d+)?|nico\.ms)/series/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.nicovideo.jp/series/110226', + 'url': 'https://www.nicovideo.jp/user/44113208/series/110226', 'info_dict': { 'id': '110226', 'title': 'ご立派ァ!のシリーズ', @@ -659,7 +692,7 @@ class NiconicoSeriesIE(InfoExtractor): def _real_extract(self, url): list_id = self._match_id(url) - webpage = self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id) + webpage = self._download_webpage(url, list_id) title = self._search_regex( (r'<title>「(.+)(全', @@ -667,10 +700,9 @@ def _real_extract(self, url): webpage, 'title', fatal=False) if title: title = unescapeHTML(title) - playlist = [ - self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id) - for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)] - return self.playlist_result(playlist, list_id, title) + json_data = next(self._yield_json_ld(webpage, None, fatal=False)) + return self.playlist_from_matches( + traverse_obj(json_data, ('itemListElement', ..., 'url')), list_id, title, ie=NiconicoIE) class NiconicoHistoryIE(NiconicoPlaylistBaseIE): @@ -715,7 +747,7 @@ def _real_extract(self, url): try: mylist = self._call_api(list_id, 'list', {'pageSize': 1}) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_login_required('You have to be logged in to get your history') raise return self.playlist_result(self._entries(list_id), list_id, **self._parse_owner(mylist)) @@ -755,14 +787,14 @@ class NicovideoSearchURLIE(NicovideoSearchBaseIE): 'url': 'http://www.nicovideo.jp/search/sm9', 'info_dict': { 'id': 'sm9', - 'title': 'sm9' + 'title': 'sm9', }, 'playlist_mincount': 40, }, { 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', 'info_dict': 
{ 'id': 'sm9', - 'title': 'sm9' + 'title': 'sm9', }, 'playlist_count': 31, }] @@ -780,17 +812,17 @@ class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): 'url': 'nicosearchdateall:a', 'info_dict': { 'id': 'a', - 'title': 'a' + 'title': 'a', }, 'playlist_mincount': 1610, }] - _START_DATE = datetime.date(2007, 1, 1) + _START_DATE = dt.date(2007, 1, 1) _RESULTS_PER_PAGE = 32 _MAX_PAGES = 50 def _entries(self, url, item_id, start_date=None, end_date=None): - start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + start_date, end_date = start_date or self._START_DATE, end_date or dt.datetime.now().date() # If the last page has a full page of videos, we need to break down the query interval further last_page_len = len(list(self._get_entries_for_date( @@ -827,7 +859,7 @@ class NicovideoTagURLIE(NicovideoSearchBaseIE): 'url': 'https://www.nicovideo.jp/tag/ドキュメンタリー淫夢', 'info_dict': { 'id': 'ドキュメンタリー淫夢', - 'title': 'ドキュメンタリー淫夢' + 'title': 'ドキュメンタリー淫夢', }, 'playlist_mincount': 400, }] @@ -846,12 +878,12 @@ class NiconicoUserIE(InfoExtractor): }, 'playlist_mincount': 101, } - _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s" + _API_URL = 'https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s' _PAGE_SIZE = 100 _API_HEADERS = { 'X-Frontend-ID': '6', - 'X-Frontend-Version': '0' + 'X-Frontend-Version': '0', } def _entries(self, list_id): @@ -861,14 +893,166 @@ def _entries(self, list_id): json_parsed = self._download_json( self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id, headers=self._API_HEADERS, - note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) + note='Downloading JSON metadata%s' % (f' page {page_num}' if page_num else '')) if not page_num: total_count = int_or_none(json_parsed['data'].get('totalCount')) - for entry in json_parsed["data"]["items"]: + for entry in json_parsed['data']['items']: count += 1 - yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id']) + yield self.url_result('https://www.nicovideo.jp/watch/{}'.format(entry['id'])) page_num += 1 def _real_extract(self, url): list_id = self._match_id(url) return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key()) + + +class NiconicoLiveIE(InfoExtractor): + IE_NAME = 'niconico:live' + IE_DESC = 'ニコニコ生放送' + _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)' + _TESTS = [{ + 'note': 'this test case includes invisible characters for title, pasting them as-is', + 'url': 'https://live.nicovideo.jp/watch/lv339533123', + 'info_dict': { + 'id': 'lv339533123', + 'title': '激辛ペヤング食べます\u202a( ;ᯅ; )\u202c(歌枠オーディション参加中)', + 'view_count': 1526, + 'comment_count': 1772, + 'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます', + 'uploader': 'もか', + 'channel': 'ゲストさんのコミュニティ', + 'channel_id': 'co5776900', + 'channel_url': 'https://com.nicovideo.jp/community/co5776900', + 'timestamp': 1670677328, + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://live2.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }, { + 'url': 'https://sp.live.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }, { + 'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }] + + _KNOWN_LATENCY = ('high', 'low') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, urlh = 
self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) + + embedded_data = self._parse_json(unescapeHTML(self._search_regex( + r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id) + + ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl')) + if not ws_url: + raise ExtractorError('The live stream hasn\'t started yet or has already ended.', expected=True) + ws_url = update_url_query(ws_url, { + 'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9', + }) + + hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.') + latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) + if latency not in self._KNOWN_LATENCY: + latency = 'high' + + ws = self._request_webpage( + Request(ws_url, headers={'Origin': f'https://{hostname}'}), + video_id=video_id, note='Connecting to WebSocket server') + + self.write_debug('Sending HLS server request') + ws.send(json.dumps({ + 'type': 'startWatching', + 'data': { + 'stream': { + 'quality': 'abr', + 'protocol': 'hls+fmp4', + 'latency': latency, + 'chasePlay': False, + }, + 'room': { + 'protocol': 'webSocket', + 'commentable': True, + }, + 'reconnect': False, + }, + })) + + while True: + recv = ws.recv() + if not recv: + continue + data = json.loads(recv) + if not isinstance(data, dict): + continue + if data.get('type') == 'stream': + m3u8_url = data['data']['uri'] + qualities = data['data']['availableQualities'] + break + elif data.get('type') == 'disconnect': + self.write_debug(recv) + raise ExtractorError('Disconnected in the middle of extraction') + elif data.get('type') == 'error': + self.write_debug(recv) + message = traverse_obj(data, ('body', 'code')) or recv + raise ExtractorError(message) + elif self.get_param('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...'
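For context, a self-contained sketch of the handshake this loop implements: the client sends `startWatching`, then scans incoming messages until a `stream` payload carries the master playlist URL. The server replies below are simulated (the message shapes mirror the code above, but the URI is a made-up placeholder, not a protocol capture):

```python
import json

# Payload the extractor sends first (mirrors the 'startWatching' message above)
start_watching = json.dumps({
    'type': 'startWatching',
    'data': {
        'stream': {'quality': 'abr', 'protocol': 'hls+fmp4', 'latency': 'high', 'chasePlay': False},
        'room': {'protocol': 'webSocket', 'commentable': True},
        'reconnect': False,
    },
})
assert json.loads(start_watching)['type'] == 'startWatching'

# Simulated server replies; a real session reads these from the WebSocket
replies = [
    json.dumps({'type': 'ping'}),
    json.dumps({'type': 'stream', 'data': {
        'uri': 'https://example.invalid/master.m3u8',
        'availableQualities': ['abr', '1080p', '720p', '360p'],
    }}),
]

m3u8_url = qualities = None
for recv in replies:
    data = json.loads(recv)
    if data.get('type') == 'stream':  # ignore everything until the stream message
        m3u8_url = data['data']['uri']
        qualities = data['data']['availableQualities']
        break

assert m3u8_url and qualities[0] == 'abr'
```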
+ self.write_debug(f'Server said: {recv}') + + title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta( + ('og:title', 'twitter:title'), webpage, 'live title', fatal=False) + + raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {} + thumbnails = [] + for name, value in raw_thumbs.items(): + if not isinstance(value, dict): + thumbnails.append({ + 'id': name, + 'url': value, + **parse_resolution(value, lenient=True), + }) + continue + + for k, img_url in value.items(): + res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True) + width, height = res.get('width'), res.get('height') + + thumbnails.append({ + 'id': f'{name}_{width}x{height}', + 'url': img_url, + **res, + }) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) + for fmt, q in zip(formats, reversed(qualities[1:])): + fmt.update({ + 'format_id': q, + 'protocol': 'niconico_live', + 'ws': ws, + 'video_id': video_id, + 'live_latency': latency, + 'origin': hostname, + }) + + return { + 'id': video_id, + 'title': title, + **traverse_obj(embedded_data, { + 'view_count': ('program', 'statistics', 'watchCount'), + 'comment_count': ('program', 'statistics', 'commentCount'), + 'uploader': ('program', 'supplier', 'name'), + 'channel': ('socialGroup', 'name'), + 'channel_id': ('socialGroup', 'id'), + 'channel_url': ('socialGroup', 'socialGroupPageUrl'), + }), + 'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))), + 'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))), + 'is_live': True, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/yt_dlp/extractor/niconicochannelplus.py b/yt_dlp/extractor/niconicochannelplus.py new file mode 100644 index 0000000000..f39d0000dc --- /dev/null +++ b/yt_dlp/extractor/niconicochannelplus.py @@ -0,0 +1,426 @@ +import functools +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + filter_dict, + int_or_none, + parse_qs, + str_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class NiconicoChannelPlusBaseIE(InfoExtractor): + _WEBPAGE_BASE_URL = 'https://nicochannel.jp' + + def _call_api(self, path, item_id, **kwargs): + return self._download_json( + f'https://nfc-api.nicochannel.jp/fc/{path}', video_id=item_id, **kwargs) + + def _find_fanclub_site_id(self, channel_name): + fanclub_list_json = self._call_api( + 'content_providers/channels', item_id=f'channels/{channel_name}', + note='Fetching channel list', errnote='Unable to fetch channel list', + )['data']['content_providers'] + fanclub_id = traverse_obj(fanclub_list_json, ( + lambda _, v: v['domain'] == f'{self._WEBPAGE_BASE_URL}/{channel_name}', 'id'), + get_all=False) + if not fanclub_id: + raise ExtractorError(f'Channel {channel_name} does not exist', expected=True) + return fanclub_id + + def _get_channel_base_info(self, fanclub_site_id): + return traverse_obj(self._call_api( + f'fanclub_sites/{fanclub_site_id}/page_base_info', item_id=f'fanclub_sites/{fanclub_site_id}', + note='Fetching channel base info', errnote='Unable to fetch channel base info', fatal=False, + ), ('data', 'fanclub_site', {dict})) or {} + + def _get_channel_user_info(self, fanclub_site_id): + return traverse_obj(self._call_api( + f'fanclub_sites/{fanclub_site_id}/user_info', item_id=f'fanclub_sites/{fanclub_site_id}', + note='Fetching channel user info', errnote='Unable to fetch channel user info', fatal=False, + 
data=json.dumps('null').encode('ascii'), + ), ('data', 'fanclub_site', {dict})) or {} + + +class NiconicoChannelPlusIE(NiconicoChannelPlusBaseIE): + IE_NAME = 'NiconicoChannelPlus' + IE_DESC = 'ニコニコチャンネルプラス' + _VALID_URL = r'https?://nicochannel\.jp/(?P<channel>[\w.-]+)/(?:video|live)/(?P<code>sm\w+)' + _TESTS = [{ + 'url': 'https://nicochannel.jp/kaorin/video/smsDd8EdFLcVZk9yyAhD6H7H', + 'info_dict': { + 'id': 'smsDd8EdFLcVZk9yyAhD6H7H', + 'title': '前田佳織里はニコ生がしたい!', + 'ext': 'mp4', + 'channel': '前田佳織里の世界攻略計画', + 'channel_id': 'kaorin', + 'channel_url': 'https://nicochannel.jp/kaorin', + 'live_status': 'not_live', + 'thumbnail': 'https://nicochannel.jp/public_html/contents/video_pages/74/thumbnail_path', + 'description': '2021年11月に放送された\n「前田佳織里はニコ生がしたい!」アーカイブになります。', + 'timestamp': 1641360276, + 'duration': 4097, + 'comment_count': int, + 'view_count': int, + 'tags': [], + 'upload_date': '20220105', + }, + 'params': { + 'skip_download': True, + }, + }, { + # age limited video; test purpose channel. + 'url': 'https://nicochannel.jp/testman/video/smDXbcrtyPNxLx9jc4BW69Ve', + 'info_dict': { + 'id': 'smDXbcrtyPNxLx9jc4BW69Ve', + 'title': 'test oshiro', + 'ext': 'mp4', + 'channel': '本番チャンネルプラステストマン', + 'channel_id': 'testman', + 'channel_url': 'https://nicochannel.jp/testman', + 'age_limit': 18, + 'live_status': 'was_live', + 'timestamp': 1666344616, + 'duration': 86465, + 'comment_count': int, + 'view_count': int, + 'tags': [], + 'upload_date': '20221021', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + content_code, channel_id = self._match_valid_url(url).group('code', 'channel') + fanclub_site_id = self._find_fanclub_site_id(channel_id) + + data_json = self._call_api( + f'video_pages/{content_code}', item_id=content_code, headers={'fc_use_device': 'null'}, + note='Fetching video page info', errnote='Unable to fetch video page info', + )['data']['video_page'] + + live_status, session_id = self._get_live_status_and_session_id(content_code, data_json) + + release_timestamp_str = data_json.get('live_scheduled_start_at') + + formats = [] + + if live_status == 'is_upcoming': + if release_timestamp_str: + msg = f'This live event will begin at {release_timestamp_str} UTC' + else: + msg = 'This event has not started yet' + self.raise_no_formats(msg, expected=True, video_id=content_code) + else: + formats = self._extract_m3u8_formats( + # "authenticated_url" is a format string that contains "{session_id}". 
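Expanding on that comment with a toy illustration: `authenticated_url` is an ordinary `str.format` template, so resolving it is a single substitution of the session id fetched separately. The URL below is a made-up placeholder, not a real endpoint:

```python
# Hypothetical template as it might arrive in data_json['video_stream']
authenticated_url = 'https://vod.example.invalid/playlist.m3u8?session_id={session_id}'
m3u8_url = authenticated_url.format(session_id='abcdef123456')
assert m3u8_url == 'https://vod.example.invalid/playlist.m3u8?session_id=abcdef123456'
```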
+ m3u8_url=data_json['video_stream']['authenticated_url'].format(session_id=session_id), + video_id=content_code) + + return { + 'id': content_code, + 'formats': formats, + '_format_sort_fields': ('tbr', 'vcodec', 'acodec'), + 'channel': self._get_channel_base_info(fanclub_site_id).get('fanclub_site_name'), + 'channel_id': channel_id, + 'channel_url': f'{self._WEBPAGE_BASE_URL}/{channel_id}', + 'age_limit': traverse_obj(self._get_channel_user_info(fanclub_site_id), ('content_provider', 'age_limit')), + 'live_status': live_status, + 'release_timestamp': unified_timestamp(release_timestamp_str), + **traverse_obj(data_json, { + 'title': ('title', {str}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'description': ('description', {str}), + 'timestamp': ('released_at', {unified_timestamp}), + 'duration': ('active_video_filename', 'length', {int_or_none}), + 'comment_count': ('video_aggregate_info', 'number_of_comments', {int_or_none}), + 'view_count': ('video_aggregate_info', 'total_views', {int_or_none}), + 'tags': ('video_tags', ..., 'tag', {str}), + }), + '__post_extractor': self.extract_comments( + content_code=content_code, + comment_group_id=traverse_obj(data_json, ('video_comment_setting', 'comment_group_id'))), + } + + def _get_comments(self, content_code, comment_group_id): + item_id = f'{content_code}/comments' + + if not comment_group_id: + return None + + comment_access_token = self._call_api( + f'video_pages/{content_code}/comments_user_token', item_id, + note='Getting comment token', errnote='Unable to get comment token', + )['data']['access_token'] + + comment_list = self._download_json( + 'https://comm-api.sheeta.com/messages.history', video_id=item_id, + note='Fetching comments', errnote='Unable to fetch comments', + headers={'Content-Type': 'application/json'}, + query={ + 'sort_direction': 'asc', + 'limit': int_or_none(self._configuration_arg('max_comments', [''])[0]) or 120, + }, + data=json.dumps({ + 'token': comment_access_token, + 'group_id': comment_group_id, + }).encode('ascii')) + + for comment in traverse_obj(comment_list, ...): + yield traverse_obj(comment, { + 'author': ('nickname', {str}), + 'author_id': ('sender_id', {str_or_none}), + 'id': ('id', {str_or_none}), + 'text': ('message', {str}), + 'timestamp': (('updated_at', 'sent_at', 'created_at'), {unified_timestamp}), + 'author_is_uploader': ('sender_id', {lambda x: x == '-1'}), + }, get_all=False) + + def _get_live_status_and_session_id(self, content_code, data_json): + video_type = data_json.get('type') + live_finished_at = data_json.get('live_finished_at') + + payload = {} + if video_type == 'vod': + if live_finished_at: + live_status = 'was_live' + else: + live_status = 'not_live' + elif video_type == 'live': + if not data_json.get('live_started_at'): + return 'is_upcoming', '' + + if not live_finished_at: + live_status = 'is_live' + else: + live_status = 'was_live' + payload = {'broadcast_type': 'dvr'} + + video_allow_dvr_flg = traverse_obj(data_json, ('video', 'allow_dvr_flg')) + video_convert_to_vod_flg = traverse_obj(data_json, ('video', 'convert_to_vod_flg')) + + self.write_debug(f'allow_dvr_flg = {video_allow_dvr_flg}, convert_to_vod_flg = {video_convert_to_vod_flg}.') + + if not (video_allow_dvr_flg and video_convert_to_vod_flg): + raise ExtractorError( + 'The live stream has ended and there is no video available for download.', video_id=content_code, expected=True) + else: + raise ExtractorError(f'Unknown type: {video_type}', video_id=content_code, expected=False) + + self.write_debug(f'{content_code}:
video_type={video_type}, live_status={live_status}') + + session_id = self._call_api( + f'video_pages/{content_code}/session_ids', item_id=f'{content_code}/session', + data=json.dumps(payload).encode('ascii'), headers={ + 'Content-Type': 'application/json', + 'fc_use_device': 'null', + 'origin': 'https://nicochannel.jp', + }, + note='Getting session id', errnote='Unable to get session id', + )['data']['session_id'] + + return live_status, session_id + + +class NiconicoChannelPlusChannelBaseIE(NiconicoChannelPlusBaseIE): + _PAGE_SIZE = 12 + + def _fetch_paged_channel_video_list(self, path, query, channel_name, item_id, page): + response = self._call_api( + path, item_id, query={ + **query, + 'page': (page + 1), + 'per_page': self._PAGE_SIZE, + }, + headers={'fc_use_device': 'null'}, + note=f'Getting channel info (page {page + 1})', + errnote=f'Unable to get channel info (page {page + 1})') + + for content_code in traverse_obj(response, ('data', 'video_pages', 'list', ..., 'content_code')): + # "video/{content_code}" works for both VOD and live, but "live/{content_code}" doesn't work for VOD + yield self.url_result( + f'{self._WEBPAGE_BASE_URL}/{channel_name}/video/{content_code}', NiconicoChannelPlusIE) + + +class NiconicoChannelPlusChannelVideosIE(NiconicoChannelPlusChannelBaseIE): + IE_NAME = 'NiconicoChannelPlus:channel:videos' + IE_DESC = 'ニコニコチャンネルプラス - チャンネル - 動画リスト. nicochannel.jp/channel/videos' + _VALID_URL = r'https?://nicochannel\.jp/(?P<id>[a-z\d\._-]+)/videos(?:\?.*)?' + _TESTS = [{ + # query: None + 'url': 'https://nicochannel.jp/testman/videos', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 18, + }, { + # query: None + 'url': 'https://nicochannel.jp/testtarou/videos', + 'info_dict': { + 'id': 'testtarou-videos', + 'title': 'チャンネルプラステスト太郎-videos', + }, + 'playlist_mincount': 2, + }, { + # query: None + 'url': 'https://nicochannel.jp/testjirou/videos', + 'info_dict': { + 'id': 'testjirou-videos', + 'title': 'チャンネルプラステスト二郎-videos', + }, + 'playlist_mincount': 12, + }, { + # query: tag + 'url': 'https://nicochannel.jp/testman/videos?tag=%E6%A4%9C%E8%A8%BC%E7%94%A8', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 6, + }, { + # query: vodType + 'url': 'https://nicochannel.jp/testman/videos?vodType=1', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 18, + }, { + # query: sort + 'url': 'https://nicochannel.jp/testman/videos?sort=-released_at', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 18, + }, { + # query: tag, vodType + 'url': 'https://nicochannel.jp/testman/videos?tag=%E6%A4%9C%E8%A8%BC%E7%94%A8&vodType=1', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 6, + }, { + # query: tag, sort + 'url': 'https://nicochannel.jp/testman/videos?tag=%E6%A4%9C%E8%A8%BC%E7%94%A8&sort=-released_at', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 6, + }, { + # query: vodType, sort + 'url': 'https://nicochannel.jp/testman/videos?vodType=1&sort=-released_at', + 'info_dict': { + 'id': 'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 18, + }, { + # query: tag, vodType, sort + 'url': 'https://nicochannel.jp/testman/videos?tag=%E6%A4%9C%E8%A8%BC%E7%94%A8&vodType=1&sort=-released_at', + 'info_dict': { + 'id': 
'testman-videos', + 'title': '本番チャンネルプラステストマン-videos', + }, + 'playlist_mincount': 6, + }] + + def _real_extract(self, url): + """ + API parameters: + sort: + -released_at 公開日が新しい順 (newest to oldest) + released_at 公開日が古い順 (oldest to newest) + -number_of_vod_views 再生数が多い順 (most play count) + number_of_vod_views コメントが多い順 (most comments) + vod_type (is "vodType" in "url"): + 0 すべて (all) + 1 会員限定 (members only) + 2 一部無料 (partially free) + 3 レンタル (rental) + 4 生放送アーカイブ (live archives) + 5 アップロード動画 (uploaded videos) + """ + + channel_id = self._match_id(url) + fanclub_site_id = self._find_fanclub_site_id(channel_id) + channel_name = self._get_channel_base_info(fanclub_site_id).get('fanclub_site_name') + qs = parse_qs(url) + + return self.playlist_result( + OnDemandPagedList( + functools.partial( + self._fetch_paged_channel_video_list, f'fanclub_sites/{fanclub_site_id}/video_pages', + filter_dict({ + 'tag': traverse_obj(qs, ('tag', 0)), + 'sort': traverse_obj(qs, ('sort', 0), default='-released_at'), + 'vod_type': traverse_obj(qs, ('vodType', 0), default='0'), + }), + channel_id, f'{channel_id}/videos'), + self._PAGE_SIZE), + playlist_id=f'{channel_id}-videos', playlist_title=f'{channel_name}-videos') + + +class NiconicoChannelPlusChannelLivesIE(NiconicoChannelPlusChannelBaseIE): + IE_NAME = 'NiconicoChannelPlus:channel:lives' + IE_DESC = 'ニコニコチャンネルプラス - チャンネル - ライブリスト. nicochannel.jp/channel/lives' + _VALID_URL = r'https?://nicochannel\.jp/(?P<id>[a-z\d\._-]+)/lives' + _TESTS = [{ + 'url': 'https://nicochannel.jp/testman/lives', + 'info_dict': { + 'id': 'testman-lives', + 'title': '本番チャンネルプラステストマン-lives', + }, + 'playlist_mincount': 18, + }, { + 'url': 'https://nicochannel.jp/testtarou/lives', + 'info_dict': { + 'id': 'testtarou-lives', + 'title': 'チャンネルプラステスト太郎-lives', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://nicochannel.jp/testjirou/lives', + 'info_dict': { + 'id': 'testjirou-lives', + 'title': 'チャンネルプラステスト二郎-lives', + }, + 'playlist_mincount': 6, + }] + + def _real_extract(self, url): + """ + API parameters: + live_type: + 1 放送中 (on air) + 2 放送予定 (scheduled live streams, oldest to newest) + 3 過去の放送 - すべて (all ended live streams, newest to oldest) + 4 過去の放送 - 生放送アーカイブ (all archives for live streams, oldest to newest) + We use "4" instead of "3" because some recently ended live streams could not be downloaded. 
+ """ + + channel_id = self._match_id(url) + fanclub_site_id = self._find_fanclub_site_id(channel_id) + channel_name = self._get_channel_base_info(fanclub_site_id).get('fanclub_site_name') + + return self.playlist_result( + OnDemandPagedList( + functools.partial( + self._fetch_paged_channel_video_list, f'fanclub_sites/{fanclub_site_id}/live_pages', + { + 'live_type': 4, + }, + channel_id, f'{channel_id}/lives'), + self._PAGE_SIZE), + playlist_id=f'{channel_id}-lives', playlist_title=f'{channel_name}-lives') diff --git a/yt_dlp/extractor/ninaprotocol.py b/yt_dlp/extractor/ninaprotocol.py new file mode 100644 index 0000000000..c8063fbd12 --- /dev/null +++ b/yt_dlp/extractor/ninaprotocol.py @@ -0,0 +1,225 @@ +from .common import InfoExtractor +from ..utils import int_or_none, mimetype2ext, parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj + + +class NinaProtocolIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[^/#?]+)' + _TESTS = [{ + 'url': 'https://www.ninaprotocol.com/releases/3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ', + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ', + 'title': 'The Spatulas - March Chant', + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + 'channel': 'ppm', + 'description': 'md5:bb9f9d39d8f786449cd5d0ff7c5772db', + 'album': 'The Spatulas - March Chant', + 'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'timestamp': 1701417610, + 'uploader': 'ppmrecs', + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'display_id': 'the-spatulas-march-chant', + 'upload_date': '20231201', + 'album_artist': 'Post Present Medium ', + }, + 'playlist': [{ + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_1', + 'title': 'March Chant In April', + 'track': 'March Chant In April', + 'ext': 'mp3', + 'duration': 152, + 'track_number': 1, + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + 'uploader': 'ppmrecs', + 'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'timestamp': 1701417610, + 'channel': 'ppm', + 'album': 'The Spatulas - March Chant', + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'upload_date': '20231201', + 'album_artist': 'Post Present Medium ', + }, + }, { + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_2', + 'title': 'Rescue Mission', + 'track': 'Rescue Mission', + 'ext': 'mp3', + 'duration': 212, + 'track_number': 2, + 'album_artist': 'Post Present Medium ', + 'uploader': 'ppmrecs', + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'channel': 'ppm', + 'upload_date': '20231201', + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'timestamp': 1701417610, + 'album': 'The Spatulas - March Chant', + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + }, + }, { + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_3', + 'title': 'Slinger Style', + 'track': 'Slinger Style', + 'ext': 'mp3', + 'duration': 179, + 'track_number': 3, + 'timestamp': 1701417610, + 'upload_date': '20231201', + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + 'thumbnail': 
'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'album_artist': 'Post Present Medium ', + 'album': 'The Spatulas - March Chant', + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'uploader': 'ppmrecs', + 'channel': 'ppm', + }, + }, { + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_4', + 'title': 'Psychic Signal', + 'track': 'Psychic Signal', + 'ext': 'mp3', + 'duration': 220, + 'track_number': 4, + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'upload_date': '20231201', + 'album': 'The Spatulas - March Chant', + 'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'timestamp': 1701417610, + 'album_artist': 'Post Present Medium ', + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'channel': 'ppm', + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + 'uploader': 'ppmrecs', + }, + }, { + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_5', + 'title': 'Curvy Color', + 'track': 'Curvy Color', + 'ext': 'mp3', + 'duration': 148, + 'track_number': 5, + 'timestamp': 1701417610, + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + 'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'album': 'The Spatulas - March Chant', + 'album_artist': 'Post Present Medium ', + 'channel': 'ppm', + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'uploader': 'ppmrecs', + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'upload_date': '20231201', + }, + }, { + 'info_dict': { + 'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_6', + 'title': 'Caveman Star', + 'track': 'Caveman Star', + 'ext': 'mp3', + 'duration': 121, + 'track_number': 6, + 'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP', + 'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50', + 'tags': ['punk', 'postpresentmedium', 'cambridge'], + 'album_artist': 'Post Present Medium ', + 'uploader': 'ppmrecs', + 'timestamp': 1701417610, + 'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A', + 'album': 'The Spatulas - March Chant', + 'channel': 'ppm', + 'upload_date': '20231201', + }, + }], + }, { + 'url': 'https://www.ninaprotocol.com/releases/f-g-s-american-shield', + 'info_dict': { + 'id': '76PZnJwaMgViQHYfA4NYJXds7CmW6vHQKAtQUxGene6J', + 'description': 'md5:63f08d5db558b4b36e1896f317062721', + 'title': 'F.G.S. - American Shield', + 'uploader_id': 'Ej3rozs11wYqFk1Gs6oggGCkGLz8GzBhmJfnUxf6gPci', + 'channel_id': '6JuksCZPXuP16wJ1BUfwuukJzh42C7guhLrFPPkVJfyE', + 'channel': 'tinkscough', + 'tags': [], + 'album_artist': 'F.G.S.', + 'album': 'F.G.S. 
- American Shield', + 'thumbnail': 'https://www.arweave.net/YJpgImkXLT9SbpFb576KuZ5pm6bdvs452LMs3Rx6lm8', + 'display_id': 'f-g-s-american-shield', + 'uploader': 'flannerysilva', + 'timestamp': 1702395858, + 'upload_date': '20231212', + }, + 'playlist_count': 1, + }, { + 'url': 'https://www.ninaprotocol.com/releases/time-to-figure-things-out', + 'info_dict': { + 'id': '6Zi1nC5hj6b13NkpxVYwRhFy6mYA7oLBbe9DMrgGDcYh', + 'display_id': 'time-to-figure-things-out', + 'description': 'md5:960202ed01c3134bb8958f1008527e35', + 'timestamp': 1706283607, + 'title': 'DJ STEPDAD - time to figure things out', + 'album_artist': 'DJ STEPDAD', + 'uploader': 'tddvsss', + 'upload_date': '20240126', + 'album': 'time to figure things out', + 'uploader_id': 'AXQNRgTyYsySyAMFDwxzumuGjfmoXshorCesjpquwCBi', + 'thumbnail': 'https://www.arweave.net/O4i8bcKVqJVZvNeHHFp6r8knpFGh9ZwEgbeYacr4nss', + 'tags': [], + }, + 'playlist_count': 4, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + release = self._download_json( + f'https://api.ninaprotocol.com/v1/releases/{video_id}', video_id)['release'] + + video_id = release.get('publicKey') or video_id + + common_info = traverse_obj(release, { + 'album': ('metadata', 'properties', 'title', {str}), + 'album_artist': ((('hub', 'data'), 'publisherAccount'), 'displayName', {str}), + 'timestamp': ('datetime', {parse_iso8601}), + 'thumbnail': ('metadata', 'image', {url_or_none}), + 'uploader': ('publisherAccount', 'handle', {str}), + 'uploader_id': ('publisherAccount', 'publicKey', {str}), + 'channel': ('hub', 'handle', {str}), + 'channel_id': ('hub', 'publicKey', {str}), + }, get_all=False) + common_info['tags'] = traverse_obj(release, ('metadata', 'properties', 'tags', ..., {str})) + + entries = [] + for track_num, track in enumerate(traverse_obj(release, ( + 'metadata', 'properties', 'files', lambda _, v: url_or_none(v['uri']))), 1): + entries.append({ + 'id': f'{video_id}_{track_num}', + 'url': track['uri'], + **traverse_obj(track, { + 'title': ('track_title', {str}), + 'track': ('track_title', {str}), + 'ext': ('type', {mimetype2ext}), + 'track_number': ('track', {int_or_none}), + 'duration': ('duration', {int_or_none}), + }), + 'vcodec': 'none', + **common_info, + }) + + return { + '_type': 'playlist', + 'id': video_id, + 'entries': entries, + **traverse_obj(release, { + 'display_id': ('slug', {str}), + 'title': ('metadata', 'name', {str}), + 'description': ('metadata', 'description', {str}), + }), + **common_info, + } diff --git a/yt_dlp/extractor/ninecninemedia.py b/yt_dlp/extractor/ninecninemedia.py index 31df42f4f6..93e28624b7 100644 --- a/yt_dlp/extractor/ninecninemedia.py +++ b/yt_dlp/extractor/ninecninemedia.py @@ -3,6 +3,7 @@ float_or_none, int_or_none, parse_iso8601, + str_or_none, try_get, ) @@ -22,7 +23,7 @@ def _real_extract(self, url): title = content['Name'] content_package = content['ContentPackages'][0] package_id = content_package['Id'] - content_package_url = api_base_url + 'contentpackages/%s/' % package_id + content_package_url = api_base_url + f'contentpackages/{package_id}/' content_package = self._download_json( content_package_url, content_id, query={ '$include': '[HasClosedCaptions]', @@ -73,7 +74,7 @@ def _real_extract(self, url): 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), 'season_number': int_or_none(season.get('Number')), - 'season_id': season.get('Id'), + 'season_id': str_or_none(season.get('Id')), 'series': try_get(content, lambda x: x['Media']['Name']), 'tags': tags, 'categories': 
categories, @@ -90,7 +91,7 @@ def _real_extract(self, url): }, { 'url': manifest_base_url + 'srt', 'ext': 'srt', - }] + }], } return info @@ -109,22 +110,21 @@ class CPTwentyFourIE(InfoExtractor): 'title': 'WATCH: Truck rips ATM from Mississauga business', 'description': 'md5:cf7498480885f080a754389a2b2f7073', 'timestamp': 1637618377, - 'episode_number': None, 'season': 'Season 0', 'season_number': 0, - 'season_id': 57974, + 'season_id': '57974', 'series': 'CTV News Toronto', 'duration': 26.86, 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg', 'upload_date': '20211122', }, - 'params': {'skip_download': True, 'format': 'bv'} + 'params': {'skip_download': True, 'format': 'bv'}, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - id, destination = self._search_regex( + video_id, destination = self._search_regex( r'getAuthStates\("(?P<id>[^"]+)",\s?"(?P<destination>[^"]+)"\);', webpage, 'video id and destination', group=('id', 'destination')) - return self.url_result(f'9c9media:{destination}:{id}', ie=NineCNineMediaIE.ie_key(), video_id=id) + return self.url_result(f'9c9media:{destination}:{video_id}', NineCNineMediaIE, video_id) diff --git a/yt_dlp/extractor/ninegag.py b/yt_dlp/extractor/ninegag.py index 865ad99ac5..2979f3a50e 100644 --- a/yt_dlp/extractor/ninegag.py +++ b/yt_dlp/extractor/ninegag.py @@ -29,7 +29,7 @@ class NineGagIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'comment_count': int, - } + }, }, { # HTML escaped title 'url': 'https://9gag.com/gag/av5nvyb', @@ -53,14 +53,14 @@ class NineGagIE(InfoExtractor): 'uploader': 'Peter Klaus', 'uploader_id': 'peterklaus12', 'uploader_url': 'https://9gag.com/u/peterklaus12', - } + }, }] def _real_extract(self, url): post_id = self._match_id(url) post = self._download_json( 'https://9gag.com/v1/post', post_id, query={ - 'id': post_id + 'id': post_id, })['data']['post'] if post.get('type') != 'Animated': diff --git a/yt_dlp/extractor/ninenews.py b/yt_dlp/extractor/ninenews.py new file mode 100644 index 0000000000..08d7fe47df --- /dev/null +++ b/yt_dlp/extractor/ninenews.py @@ -0,0 +1,72 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils import ExtractorError +from ..utils.traversal import traverse_obj + + +class NineNewsIE(InfoExtractor): + IE_NAME = '9News' + _VALID_URL = r'https?://(?:www\.)?9news\.com\.au/(?:[\w-]+/){2,3}(?P<id>[\w-]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.9news.com.au/videos/national/fair-trading-pulls-dozens-of-toys-from-shelves/clqgc7dvj000y0jnvfism0w5m', + 'md5': 'd1a65b2e9d126e5feb9bc5cb96e62c80', + 'info_dict': { + 'id': '6343717246112', + 'ext': 'mp4', + 'title': 'Fair Trading pulls dozens of toys from shelves', + 'description': 'Fair Trading Australia have been forced to pull dozens of toys from shelves over hazard fears.', + 'thumbnail': 'md5:bdbe44294e2323b762d97acf8843f66c', + 'duration': 93.44, + 'timestamp': 1703231748, + 'upload_date': '20231222', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'christmas presents', 'toys', 'fair trading', 'au_news'], + }, + }, { + 'url': 'https://www.9news.com.au/world/tape-reveals-donald-trump-pressured-michigan-officials-not-to-certify-2020-vote-a-new-report-says/0b8b880e-7d3c-41b9-b2bd-55bc7e492259', + 'md5': 'a885c44d20898c3e70e9a53e8188cea1', + 'info_dict': { + 'id': '6343587450112', + 'ext': 'mp4', + 'title': 'Trump found 
ineligible to run for president by state court', + 'description': 'md5:40e6e7db7a4ac6be0e960569a5af6066', + 'thumbnail': 'md5:3e132c48c186039fd06c10787de9bff2', + 'duration': 104.64, + 'timestamp': 1703058034, + 'upload_date': '20231220', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'ineligible', 'presidential candidate', 'donald trump', 'au_news'], + }, + }, { + 'url': 'https://www.9news.com.au/national/outrage-as-parents-banned-from-giving-gifts-to-kindergarten-teachers/e19b49d4-a1a4-4533-9089-6e10e2d9386a', + 'info_dict': { + 'id': '6343716797112', + 'ext': 'mp4', + 'title': 'Outrage as parents banned from giving gifts to kindergarten teachers', + 'description': 'md5:7a8b0ed2f9e08875fd9a3e86e462bc46', + 'thumbnail': 'md5:5ee4d66717bdd0dee9fc9a705ef041b8', + 'duration': 91.307, + 'timestamp': 1703229584, + 'upload_date': '20231222', + 'uploader_id': '664969388001', + 'tags': ['networkclip', 'aunews_aunationalninenews', 'presents', 'teachers', 'kindergarten', 'au_news'], + }, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + initial_state = self._search_json( + r'var\s+__INITIAL_STATE__\s*=', webpage, 'initial state', article_id) + video_id = traverse_obj( + initial_state, ('videoIndex', 'currentVideo', 'brightcoveId', {str}), + ('article', ..., 'media', lambda _, v: v['type'] == 'video', 'urn', {str}), get_all=False) + account = traverse_obj(initial_state, ( + 'videoIndex', 'config', (None, 'video'), 'account', {str}), get_all=False) + + if not video_id or not account: + raise ExtractorError('Unable to get the required video data') + + return self.url_result( + f'https://players.brightcove.net/{account}/default_default/index.html?videoId={video_id}', + BrightcoveNewIE, video_id) diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py index b970f8ccb5..f17531e622 100644 --- a/yt_dlp/extractor/ninenow.py +++ b/yt_dlp/extractor/ninenow.py @@ -1,9 +1,8 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, smuggle_url, str_or_none, try_get, @@ -53,9 +52,9 @@ class NineNowIE(InfoExtractor): 'upload_date': '20210421', }, 'expected_warnings': ['Ignoring subtitle tracks'], - 'params':{ + 'params': { 'skip_download': True, - } + }, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s' @@ -72,15 +71,15 @@ def _real_extract(self, url): for kind in ('episode', 'clip'): current_key = page_data.get(kind, {}).get( - 'current%sKey' % kind.capitalize()) + f'current{kind.capitalize()}Key') if not current_key: continue - cache = page_data.get(kind, {}).get('%sCache' % kind, {}) + cache = page_data.get(kind, {}).get(f'{kind}Cache', {}) if not cache: continue common_data = { - 'episode': (cache.get(current_key) or list(cache.values())[0])[kind], - 'season': (cache.get(current_key) or list(cache.values())[0]).get('season', None) + 'episode': (cache.get(current_key) or next(iter(cache.values())))[kind], + 'season': (cache.get(current_key) or next(iter(cache.values()))).get('season', None), } break else: @@ -89,14 +88,14 @@ def _real_extract(self, url): if not self.get_param('allow_unplayable_formats') and try_get(common_data, lambda x: x['episode']['video']['drm'], bool): self.report_drm(display_id) brightcove_id = try_get( - common_data, lambda x: x['episode']['video']['brightcoveId'], compat_str) or 'ref:%s' % 
common_data['episode']['video']['referenceId'] + common_data, lambda x: x['episode']['video']['brightcoveId'], str) or 'ref:{}'.format(common_data['episode']['video']['referenceId']) video_id = str_or_none(try_get(common_data, lambda x: x['episode']['video']['id'])) or brightcove_id - title = try_get(common_data, lambda x: x['episode']['name'], compat_str) + title = try_get(common_data, lambda x: x['episode']['name'], str) season_number = try_get(common_data, lambda x: x['season']['seasonNumber'], int) episode_number = try_get(common_data, lambda x: x['episode']['episodeNumber'], int) - timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], compat_str)) - release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], compat_str)) + timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], str)) + release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], str)) thumbnails_data = try_get(common_data, lambda x: x['episode']['image']['sizes'], dict) or {} thumbnails = [{ 'id': thumbnail_id, @@ -111,7 +110,7 @@ def _real_extract(self, url): {'geo_countries': self._GEO_COUNTRIES}), 'id': video_id, 'title': title, - 'description': try_get(common_data, lambda x: x['episode']['description'], compat_str), + 'description': try_get(common_data, lambda x: x['episode']['description'], str), 'duration': float_or_none(try_get(common_data, lambda x: x['episode']['video']['duration'], float), 1000), 'thumbnails': thumbnails, 'ie_key': 'BrightcoveNew', diff --git a/yt_dlp/extractor/nintendo.py b/yt_dlp/extractor/nintendo.py index ed839af25d..d8eb85306d 100644 --- a/yt_dlp/extractor/nintendo.py +++ b/yt_dlp/extractor/nintendo.py @@ -1,57 +1,131 @@ -import re +import json +import urllib.parse from .common import InfoExtractor -from .ooyala import OoyalaIE +from ..utils import ( + ExtractorError, + make_archive_id, + unified_timestamp, + urljoin, +) +from ..utils.traversal import traverse_obj class NintendoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:(?P<locale>\w{2}(?:-\w{2})?)/)?nintendo-direct/(?P<slug>[^/?#]+)' _TESTS = [{ - 'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/', - 'info_dict': { - 'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW', - 'ext': 'flv', - 'title': 'Duck Hunt Wii U VC NES - Trailer', - 'duration': 60.326, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u', - 'info_dict': { - 'id': 'tokyo-mirage-sessions-fe-wii-u', - 'title': 'Tokyo Mirage Sessions ♯FE', - }, - 'playlist_count': 4, - }, { 'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/', 'info_dict': { - 'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V', 'ext': 'mp4', - 'title': 'Switch_ROS_ND0904-H264.mov', - 'duration': 2324.758, + 'id': '2oPmiviVePUA1IqAZzjuVh', + 'display_id': '09-04-2019', + 'title': 'Nintendo Direct 9.4.2019', + 'timestamp': 1567580400, + 'description': 'md5:8aac2780361d8cb772b6d1de66d7d6f4', + 'upload_date': '20190904', + 'age_limit': 17, + '_old_archive_ids': ['nintendo J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V'], }, - 'params': { - 'skip_download': True, + }, { + 'url': 'https://www.nintendo.com/en-ca/nintendo-direct/08-31-2023/', + 'info_dict': { + 'ext': 'mp4', + 'id': '2TB2w2rJhNYF84qQ9E57hU', + 'display_id': '08-31-2023', + 'title': 'Super Mario Bros. 
Wonder Direct 8.31.2023', + 'timestamp': 1693465200, + 'description': 'md5:3067c5b824bcfdae9090a7f38ab2d200', + 'tags': ['Mild Fantasy Violence', 'In-Game Purchases'], + 'upload_date': '20230831', + 'age_limit': 6, + }, + }, { + 'url': 'https://www.nintendo.com/us/nintendo-direct/50-fact-extravaganza/', + 'info_dict': { + 'ext': 'mp4', + 'id': 'j0BBGzfw0pQ', + 'channel_follower_count': int, + 'view_count': int, + 'description': 'Learn new details about Super Smash Bros. for Wii U, which launches on November 21.', + 'duration': 2123, + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/j0BBGzfw0pQ/maxresdefault.webp', + 'timestamp': 1414047600, + 'channel_id': 'UCGIY_O-8vW4rfX98KlMkvRg', + 'chapters': 'count:53', + 'heatmap': 'count:100', + 'upload_date': '20141023', + 'uploader_id': '@NintendoAmerica', + 'playable_in_embed': True, + 'categories': ['Gaming'], + 'display_id': '50-fact-extravaganza', + 'channel': 'Nintendo of America', + 'tags': ['Comic Mischief', 'Cartoon Violence', 'Mild Suggestive Themes'], + 'like_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCGIY_O-8vW4rfX98KlMkvRg', + 'age_limit': 10, + 'uploader_url': 'https://www.youtube.com/@NintendoAmerica', + 'comment_count': int, + 'live_status': 'not_live', + 'uploader': 'Nintendo of America', + 'title': '50-FACT Extravaganza', }, - 'add_ie': ['Ooyala'], }] + def _create_asset_url(self, path): + return urljoin('https://assets.nintendo.com/', urllib.parse.quote(path)) + def _real_extract(self, url): - page_id = self._match_id(url) + locale, slug = self._match_valid_url(url).group('locale', 'slug') - webpage = self._download_webpage(url, page_id) + language, _, country = (locale or 'US').rpartition('-') + parsed_locale = f'{language.lower() or "en"}_{country.upper()}' + self.write_debug(f'Using locale {parsed_locale} (from {locale})', only_once=True) - entries = [ - OoyalaIE._build_url_result(m.group('code')) - for m in re.finditer( - r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)] + response = self._download_json('https://graph.nintendo.com/', slug, query={ + 'operationName': 'NintendoDirect', + 'variables': json.dumps({ + 'locale': parsed_locale, + 'slug': slug, + }, separators=(',', ':')), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb', + }, + }, separators=(',', ':')), + }) + # API returns `{"data": {"direct": null}}` if no matching id + direct_info = traverse_obj(response, ('data', 'direct', {dict})) + if not direct_info: + raise ExtractorError(f'No Nintendo Direct with id {slug} exists', expected=True) - title = self._html_search_regex( - r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>', - webpage, 'title', fatal=False) + errors = ', '.join(traverse_obj(response, ('errors', ..., 'message'))) + if errors: + raise ExtractorError(f'GraphQL API error: {errors or "Unknown error"}') - return self.playlist_result( - entries, page_id, title) + result = traverse_obj(direct_info, { + 'id': ('id', {str}), + 'title': ('name', {str}), + 'timestamp': ('startDate', {unified_timestamp}), + 'description': ('description', 'text', {str}), + 'age_limit': ('contentRating', 'order', {int}), + 'tags': ('contentDescriptors', ..., 'label', {str}), + 'thumbnail': ('thumbnail', {self._create_asset_url}), + }) + result['display_id'] = slug + + asset_id = traverse_obj(direct_info, ('video', 'publicId', {str})) + if not asset_id: + youtube_id = 
traverse_obj(direct_info, ('liveStream', {str})) + if not youtube_id: + self.raise_no_formats('Could not find any video formats', video_id=slug) + + return self.url_result(youtube_id, **result, url_transparent=True) + + if asset_id.startswith('Legacy Videos/'): + result['_old_archive_ids'] = [make_archive_id(self, asset_id[14:])] + result['formats'] = self._extract_m3u8_formats( + self._create_asset_url(f'/video/upload/sp_full_hd/v1/{asset_id}.m3u8'), slug) + + return result diff --git a/yt_dlp/extractor/nitter.py b/yt_dlp/extractor/nitter.py index 5d1ca1f5d0..7609b40178 100644 --- a/yt_dlp/extractor/nitter.py +++ b/yt_dlp/extractor/nitter.py @@ -1,13 +1,14 @@ -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - parse_count, - unified_timestamp, - remove_end, - determine_ext, -) -import re import random +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_count, + remove_end, + unified_timestamp, +) class NitterIE(InfoExtractor): @@ -264,13 +265,33 @@ class NitterIE(InfoExtractor): 'like_count': int, 'repost_count': int, 'comment_count': int, - } - } + }, + }, { # no OpenGraph title + 'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m', + 'info_dict': { + 'id': '1678455464038735895', + 'ext': 'mp4', + 'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?', + 'description': 'Local man, what did Romanians ever do to you?', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Your Typical Local Man', + 'uploader_id': 'LocalBateman', + 'uploader_url': f'https://{current_instance}/LocalBateman', + 'upload_date': '20230710', + 'timestamp': 1689009900, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': {'skip_download': 'm3u8'}, + }, ] def _real_extract(self, url): video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') - parsed_url = compat_urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) base_url = f'{parsed_url.scheme}://{parsed_url.netloc}' self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') @@ -280,7 +301,7 @@ def _real_extract(self, url): if main_tweet_start > 0: webpage = full_webpage[main_tweet_start:] - video_url = '%s%s' % (base_url, self._html_search_regex( + video_url = '{}{}'.format(base_url, self._html_search_regex( r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')) ext = determine_ext(video_url) @@ -289,10 +310,10 @@ def _real_extract(self, url): else: formats = [{ 'url': video_url, - 'ext': ext + 'ext': ext, }] - title = description = self._og_search_description(full_webpage) or self._html_search_regex( + title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex( r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False) uploader_id = self._html_search_regex( @@ -313,12 +334,12 @@ def _real_extract(self, url): thumbnail = ( self._html_search_meta('og:image', full_webpage, 'thumbnail url') - or remove_end('%s%s' % (base_url, self._html_search_regex( + or remove_end('{}{}'.format(base_url, self._html_search_regex( r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall')) thumbnails = [ - {'id': id, 'url': f'{thumbnail}%3A{id}'} - for id in ('thumb', 'small', 'large', 'medium', 'orig') + {'id': id_, 'url': 
f'{thumbnail}%3A{id_}'} + for id_ in ('thumb', 'small', 'large', 'medium', 'orig') ] date = self._html_search_regex( diff --git a/yt_dlp/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py deleted file mode 100644 index 7b8a526f02..0000000000 --- a/yt_dlp/extractor/njpwworld.py +++ /dev/null @@ -1,82 +0,0 @@ -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - get_element_by_class, - urlencode_postdata, -) - - -class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P<id>[a-z0-9_]+)' - IE_DESC = '新日本プロレスワールド' - _NETRC_MACHINE = 'njpwworld' - - _TESTS = [{ - 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', - 'info_dict': { - 'id': 's_series_00155_1_9', - 'ext': 'mp4', - 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', - 'tags': list, - }, - 'params': { - 'skip_download': True, # AES-encrypted m3u8 - }, - 'skip': 'Requires login', - }, { - 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', - 'info_dict': { - 'id': 's_series_00563_16_bs', - 'ext': 'mp4', - 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', - 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], - }, - 'params': { - 'skip_download': True, - }, - }] - - _LOGIN_URL = 'https://front.njpwworld.com/auth/login' - - def _perform_login(self, username, password): - # Setup session (will set necessary cookies) - self._request_webpage( - 'https://njpwworld.com/', None, note='Setting up session') - - webpage, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, - note='Logging in', errnote='Unable to login', - data=urlencode_postdata({'login_id': username, 'pw': password}), - headers={'Referer': 'https://front.njpwworld.com/auth'}) - # /auth/login will return 302 for successful logins - if urlh.geturl() == self._LOGIN_URL: - self.report_warning('unable to login') - return False - - return True - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - formats = [] - for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): - player_path = '/intent?id=%s&type=url' % vid - player_url = compat_urlparse.urljoin(url, player_path) - formats += self._extract_m3u8_formats( - player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high')) - - tag_block = get_element_by_class('tag-block', webpage) - tags = re.findall( - r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block - ) if tag_block else None - - return { - 'id': video_id, - 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), - 'formats': formats, - 'tags': tags, - } diff --git a/yt_dlp/extractor/nobelprize.py b/yt_dlp/extractor/nobelprize.py index 1aa9705be4..536ca27f75 100644 --- a/yt_dlp/extractor/nobelprize.py +++ b/yt_dlp/extractor/nobelprize.py @@ -1,15 +1,16 @@ from .common import InfoExtractor from ..utils import ( - js_to_json, - mimetype2ext, determine_ext, - update_url_query, get_element_by_attribute, int_or_none, + js_to_json, + mimetype2ext, + update_url_query, ) class NobelPrizeIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?nobelprize\.org/mediaplayer.*?\bid=(?P<id>\d+)' _TEST = { 'url': 'http://www.nobelprize.org/mediaplayer/?id=2636', @@ -19,7 +20,7 @@ class NobelPrizeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Announcement of the 2016 Nobel Prize in Physics', 
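The Nitter changes above keep the common pattern of branching on the media URL's extension: an HLS manifest is expanded into multiple formats, anything else becomes one direct format. Below is a minimal standalone sketch of that dispatch using yt-dlp's real determine_ext() helper; the URLs and placeholder format dicts are invented, and a real extractor would call self._extract_m3u8_formats() in the HLS branch.

from yt_dlp.utils import determine_ext

def build_formats(video_url):
    # Sketch only: mirrors the ext dispatch in NitterIE._real_extract above.
    ext = determine_ext(video_url)
    if ext == 'm3u8':
        # Placeholder for self._extract_m3u8_formats(video_url, ...).
        return [{'url': video_url, 'ext': 'mp4', 'protocol': 'm3u8_native'}]
    return [{'url': video_url, 'ext': ext}]

assert build_formats('https://example.com/v.m3u8')[0]['protocol'] == 'm3u8_native'
assert build_formats('https://example.com/v.mp4') == [{'url': 'https://example.com/v.mp4', 'ext': 'mp4'}]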
'description': 'md5:05beba57f4f5a4bbd4cf2ef28fcff739', - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/noice.py b/yt_dlp/extractor/noice.py index e6e343303a..f413055b31 100644 --- a/yt_dlp/extractor/noice.py +++ b/yt_dlp/extractor/noice.py @@ -35,7 +35,7 @@ class NoicePodcastIE(InfoExtractor): 'comment_count': int, 'dislike_count': int, 'channel_follower_count': int, - } + }, }, { 'url': 'https://open.noice.id/content/222134e4-99f2-456f-b8a2-b8be404bf063', 'info_dict': { @@ -60,7 +60,7 @@ class NoicePodcastIE(InfoExtractor): 'comment_count': int, 'channel': 'Dear Jerome', 'channel_follower_count': int, - } + }, }] def _get_formats_and_subtitles(self, media_url, video_id): @@ -112,5 +112,5 @@ def _real_extract(self, url): 'dislike_count': 'dislikes', 'comment_count': 'comments', 'channel_follower_count': 'followers', - })) + })), } diff --git a/yt_dlp/extractor/nonktube.py b/yt_dlp/extractor/nonktube.py index f191be33b7..192e6bb52a 100644 --- a/yt_dlp/extractor/nonktube.py +++ b/yt_dlp/extractor/nonktube.py @@ -14,7 +14,7 @@ class NonkTubeIE(NuevoBaseIE): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'https://www.nonktube.com/embed/118636', 'only_matching': True, diff --git a/yt_dlp/extractor/noodlemagazine.py b/yt_dlp/extractor/noodlemagazine.py index e6208956fb..6414f46efb 100644 --- a/yt_dlp/extractor/noodlemagazine.py +++ b/yt_dlp/extractor/noodlemagazine.py @@ -1,9 +1,12 @@ from .common import InfoExtractor from ..utils import ( - parse_duration, + int_or_none, parse_count, - unified_strdate + parse_duration, + unified_strdate, + urljoin, ) +from ..utils.traversal import traverse_obj class NoodleMagazineIE(InfoExtractor): @@ -22,8 +25,8 @@ class NoodleMagazineIE(InfoExtractor): 'description': 'Aria alexander manojob', 'tags': ['aria', 'alexander', 'manojob'], 'upload_date': '20190218', - 'age_limit': 18 - } + 'age_limit': 18, + }, } def _real_extract(self, url): @@ -37,26 +40,41 @@ def _real_extract(self, url): like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None)) upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default='')) - key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key') - playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id) - thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') + def build_url(url_or_path): + return urljoin('https://adult.noodlemagazine.com', url_or_path) - formats = [{ - 'url': source.get('file'), - 'quality': source.get('label'), - 'ext': source.get('type'), - } for source in playlist_info.get('sources')] + headers = {'Referer': url} + player_path = self._html_search_regex( + r'<iframe[^>]+\bid="iplayer"[^>]+\bsrc="([^"]+)"', webpage, 'player path') + player_iframe = self._download_webpage( + build_url(player_path), video_id, 'Downloading iframe page', headers=headers) + playlist_url = self._search_regex( + r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url') + playlist_info = self._download_json(build_url(playlist_url), video_id, headers=headers) + + formats = [] + for source in traverse_obj(playlist_info, ('sources', lambda _, v: v['file'])): + if source.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats( + build_url(source['file']), video_id, 'mp4', fatal=False, m3u8_id='hls')) + else: + formats.append(traverse_obj(source, { + 'url': ('file', {build_url}), + 'format_id': 
'label', + 'height': ('label', {int_or_none}), + 'ext': 'type', + })) return { 'id': video_id, 'formats': formats, 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_property('image', webpage, default=None) or playlist_info.get('image'), 'duration': duration, 'description': description, 'tags': tags, 'view_count': view_count, 'like_count': like_count, 'upload_date': upload_date, - 'age_limit': 18 + 'age_limit': 18, } diff --git a/yt_dlp/extractor/noovo.py b/yt_dlp/extractor/noovo.py index acbb74c6eb..772d4ed9e0 100644 --- a/yt_dlp/extractor/noovo.py +++ b/yt_dlp/extractor/noovo.py @@ -1,6 +1,5 @@ from .brightcove import BrightcoveNewIE from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, js_to_json, @@ -65,7 +64,7 @@ def _real_extract(self, url): title = try_get( data, lambda x: x['video']['nom'], - compat_str) or self._html_search_meta( + str) or self._html_search_meta( 'dcterms.Title', webpage, 'title', fatal=True) description = self._html_search_meta( @@ -77,11 +76,11 @@ def _real_extract(self, url): webpage, 'series', default=None) season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} - season = try_get(season_el, lambda x: x['nom'], compat_str) + season = try_get(season_el, lambda x: x['nom'], str) season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} - episode = try_get(episode_el, lambda x: x['nom'], compat_str) + episode = try_get(episode_el, lambda x: x['nom'], str) episode_number = int_or_none(try_get(episode_el, lambda x: x['numero'])) return { diff --git a/yt_dlp/extractor/normalboots.py b/yt_dlp/extractor/normalboots.py deleted file mode 100644 index 07babcd2c5..0000000000 --- a/yt_dlp/extractor/normalboots.py +++ /dev/null @@ -1,51 +0,0 @@ -from .common import InfoExtractor -from .jwplatform import JWPlatformIE - -from ..utils import ( - unified_strdate, -) - - -class NormalbootsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' - _TEST = { - 'url': 'http://normalboots.com/video/home-alone-games-jontron/', - 'info_dict': { - 'id': 'home-alone-games-jontron', - 'ext': 'mp4', - 'title': 'Home Alone Games - JonTron - NormalBoots', - 'description': 'Jon is late for Christmas. Typical. 
Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/', - 'uploader': 'JonTron', - 'upload_date': '20140125', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['JWPlatform'], - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_uploader = self._html_search_regex( - r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>', - webpage, 'uploader', fatal=False) - video_upload_date = unified_strdate(self._html_search_regex( - r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', - webpage, 'date', fatal=False)) - - jwplatform_url = JWPlatformIE._extract_url(webpage) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': jwplatform_url, - 'ie_key': JWPlatformIE.ie_key(), - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': video_uploader, - 'upload_date': video_upload_date, - } diff --git a/yt_dlp/extractor/nosnl.py b/yt_dlp/extractor/nosnl.py index cea54c98e2..13f908c2c2 100644 --- a/yt_dlp/extractor/nosnl.py +++ b/yt_dlp/extractor/nosnl.py @@ -15,7 +15,7 @@ class NOSNLArticleIE(InfoExtractor): 'title': '\'We hebben een huis vol met scheuren\'', 'duration': 95.0, 'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg', - } + }, }, { # more than 1 video 'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten', @@ -64,7 +64,7 @@ class NOSNLArticleIE(InfoExtractor): 'categories': ['Buitenland'], }, 'playlist_mincount': 1, - } + }, ] def _entries(self, nextjs_json, display_id): @@ -82,7 +82,7 @@ def _entries(self, nextjs_json, display_id): 'thumbnails': [{ 'url': traverse_obj(image, ('url', ...), get_all=False), 'width': image.get('width'), - 'height': image.get('height') + 'height': image.get('height'), } for image in traverse_obj(item, ('imagesByRatio', ...))[0]], } diff --git a/yt_dlp/extractor/nosvideo.py b/yt_dlp/extractor/nosvideo.py deleted file mode 100644 index b6d3ea40c1..0000000000 --- a/yt_dlp/extractor/nosvideo.py +++ /dev/null @@ -1,72 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, - xpath_text, - xpath_with_ns, -) - -_x = lambda p: xpath_with_ns(p, {'xspf': 'http://xspf.org/ns/0/'}) - - -class NosVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nosvideo\.com/' + \ - r'(?:embed/|\?v=)(?P<id>[A-Za-z0-9]{12})/?' 
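The deleted NormalbootsIE above (and the new NTSLiveIE added later in this patch) delegate playback to another extractor through a `url_transparent` result: the returned URL is resolved by the target extractor, while metadata supplied alongside it overrides whatever that extractor finds. A hedged sketch of the result shape follows; the embed URL, id and metadata are invented placeholders.

def transparent_result(embed_url, video_id, **overrides):
    # Sketch only: '_type': 'url_transparent' tells YoutubeDL to re-dispatch
    # `url` to the matching extractor while keeping the overrides below.
    return {
        '_type': 'url_transparent',
        'id': video_id,
        'url': embed_url,
        **overrides,
    }

info = transparent_result(
    'https://content.jwplatform.com/players/abc123.js',  # hypothetical embed
    'home-alone-games-jontron',
    title='Home Alone Games - JonTron - NormalBoots', uploader='JonTron')
assert info['_type'] == 'url_transparent' and info['uploader'] == 'JonTron'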
- _PLAYLIST_URL = 'http://nosvideo.com/xml/{xml_id:s}.xml' - _FILE_DELETED_REGEX = r'<b>File Not Found</b>' - _TEST = { - 'url': 'http://nosvideo.com/?v=mu8fle7g7rpq', - 'md5': '6124ed47130d8be3eacae635b071e6b6', - 'info_dict': { - 'id': 'mu8fle7g7rpq', - 'ext': 'mp4', - 'title': 'big_buck_bunny_480p_surround-fix.avi.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - fields = { - 'id': video_id, - 'op': 'download1', - 'method_free': 'Continue to Video', - } - req = sanitized_Request(url, urlencode_postdata(fields)) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - webpage = self._download_webpage(req, video_id, - 'Downloading download page') - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - xml_id = self._search_regex(r'php\|([^\|]+)\|', webpage, 'XML ID') - playlist_url = self._PLAYLIST_URL.format(xml_id=xml_id) - playlist = self._download_xml(playlist_url, video_id) - - track = playlist.find(_x('.//xspf:track')) - if track is None: - raise ExtractorError( - 'XML playlist is missing the \'track\' element', - expected=True) - title = xpath_text(track, _x('./xspf:title'), 'title') - url = xpath_text(track, _x('./xspf:file'), 'URL', fatal=True) - thumbnail = xpath_text(track, _x('./xspf:image'), 'thumbnail') - if title is not None: - title = title.strip() - - formats = [{ - 'format_id': 'sd', - 'url': url, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py index 8bd3fd4725..e7b69e3706 100644 --- a/yt_dlp/extractor/nova.py +++ b/yt_dlp/extractor/nova.py @@ -6,7 +6,6 @@ determine_ext, int_or_none, js_to_json, - qualities, traverse_obj, unified_strdate, url_or_none, @@ -14,7 +13,7 @@ class NovaEmbedIE(InfoExtractor): - _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://media(?:tn)?\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', 'info_dict': { @@ -38,6 +37,16 @@ class NovaEmbedIE(InfoExtractor): 'duration': 114, }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://mediatn.cms.nova.cz/embed/EU5ELEsmOHt?autoplay=1', + 'info_dict': { + 'id': 'EU5ELEsmOHt', + 'ext': 'mp4', + 'title': 'Haptické křeslo, bionická ruka nebo roboti. 
Reportérka se podívala na Týden inovací', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1780, + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): @@ -49,77 +58,52 @@ def _real_extract(self, url): duration = None formats = [] - player = self._parse_json( - self._search_regex( - (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,', - r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), - webpage, 'player', default='{}', group='json'), video_id, fatal=False) - if player: - for format_id, format_list in player['tracks'].items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_dict in format_list: - if not isinstance(format_dict, dict): - continue - if (not self.get_param('allow_unplayable_formats') - and traverse_obj(format_dict, ('drm', 'keySystem'))): - has_drm = True - continue - format_url = url_or_none(format_dict.get('src')) - format_type = format_dict.get('type') - ext = determine_ext(format_url) - if (format_type == 'application/x-mpegURL' - or format_id == 'HLS' or ext == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - elif (format_type == 'application/dash+xml' - or format_id == 'DASH' or ext == 'mpd'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - }) - duration = int_or_none(player.get('duration')) - else: - # Old path, not actual as of 08.04.2020 - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { + def process_format_list(format_list, format_id=''): + nonlocal formats, has_drm + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), - }) - break - f['format_id'] = f_id - 
formats.append(f) + }) + + player = self._search_json( + r'player:', webpage, 'player', video_id, fatal=False, end_pattern=r';\s*</script>') + if player: + for src in traverse_obj(player, ('lib', 'source', 'sources', ...)): + process_format_list(src) + duration = traverse_obj(player, ('sourceInfo', 'duration', {int_or_none})) + if not formats and not has_drm: + # older code path, in use before August 2023 + player = self._parse_json( + self._search_regex( + (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,', + r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), + webpage, 'player', group='json'), video_id) + if player: + for format_id, format_list in player['tracks'].items(): + process_format_list(format_list, format_id) + duration = int_or_none(player.get('duration')) if not formats and has_drm: self.report_drm(video_id) @@ -151,15 +135,16 @@ class NovaIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', - 'md5': '249baab7d0104e186e78b0899c7d5f28', + 'md5': 'da8f3f1fcdaf9fb0f112a32a165760a3', 'info_dict': { - 'id': '1757139', - 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', + 'id': '8OvQqEvV3MW', + 'display_id': '8OvQqEvV3MW', 'ext': 'mp4', 'title': 'Podzemní nemocnice v pražské Krči', 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', 'thumbnail': r're:^https?://.*\.(?:jpg)', - } + 'duration': 151, + }, }, { 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 'info_dict': { @@ -226,16 +211,16 @@ def _real_extract(self, url): # novaplus embed_id = self._search_regex( - r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', + r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media(?:tn)?\.cms\.nova\.cz/embed/([^/?#&"\']+)', webpage, 'embed url', default=None) if embed_id: return { '_type': 'url_transparent', - 'url': 'https://media.cms.nova.cz/embed/%s' % embed_id, + 'url': f'https://media.cms.nova.cz/embed/{embed_id}', 'ie_key': NovaEmbedIE.ie_key(), 'id': embed_id, 'description': description, - 'upload_date': upload_date + 'upload_date': upload_date, } video_id = self._search_regex( diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py index 92d1d136c7..77d7ce1860 100644 --- a/yt_dlp/extractor/novaplay.py +++ b/yt_dlp/extractor/novaplay.py @@ -3,7 +3,7 @@ class NovaPlayIE(InfoExtractor): - _VALID_URL = r'https://play.nova\.bg/video/.*/(?P<id>\d+)' + _VALID_URL = r'https?://play\.nova\.bg/video/[^?#]+/(?P<id>\d+)' _TESTS = [ { 'url': 'https://play.nova.bg/video/ochakvaite/season-0/ochakvaite-2022-07-22-sybudi-se-sat/606627', @@ -18,7 +18,6 @@ class NovaPlayIE(InfoExtractor): 'upload_date': '20220722', 'thumbnail': 'https://nbg-img.fite.tv/img/606627_460x260.jpg', 'description': '29 сек', - 'view_count': False }, }, { @@ -34,9 +33,8 @@ class NovaPlayIE(InfoExtractor): 'upload_date': '20220722', 'thumbnail': 'https://nbg-img.fite.tv/img/606609_460x260.jpg', 'description': '29 сек', - 'view_count': False }, - } + }, ] _access_token = None @@ -52,7 +50,7 @@ def _real_extract(self, url): video_id, headers={ 'x-flipps-user-agent': 'Flipps/75/9.7', 
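The reworked NovaEmbedIE above now reads the `player:` object straight out of the page with _search_json() and walks it with traverse_obj(). The following self-contained sketch shows the same idea with the real js_to_json() and traverse_obj() helpers; only the sample blob and its values are made up.

import json
import re

from yt_dlp.utils import int_or_none, js_to_json
from yt_dlp.utils.traversal import traverse_obj

# Invented stand-in for the `player:` blob NovaEmbedIE pulls from the page.
page = ("var cfg = { player: { lib: { source: { sources: "
        "[{ src: 'https://example.com/v.m3u8', type: 'application/x-mpegURL' }] } }, "
        "sourceInfo: { duration: 151 } } };")
raw = re.search(r'player:\s*(\{.*\})\s*\}\s*;', page).group(1)
player = json.loads(js_to_json(raw))  # quotes bare keys, single -> double quotes
assert traverse_obj(player, ('lib', 'source', 'sources', ..., 'src')) == ['https://example.com/v.m3u8']
assert traverse_obj(player, ('sourceInfo', 'duration', {int_or_none})) == 151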
'x-flipps-version': '2022-05-17', - 'Authorization': f'Bearer {self._access_token}' + 'Authorization': f'Bearer {self._access_token}', })[0]['links']['play']['href'] formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') diff --git a/yt_dlp/extractor/nowness.py b/yt_dlp/extractor/nowness.py index fc9043bceb..c001a82e9f 100644 --- a/yt_dlp/extractor/nowness.py +++ b/yt_dlp/extractor/nowness.py @@ -3,11 +3,8 @@ BrightcoveNewIE, ) from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - sanitized_Request, -) +from ..networking import Request +from ..utils import ExtractorError class NownessBaseIE(InfoExtractor): @@ -19,7 +16,7 @@ def _extract_url_result(self, post): source = media['source'] if source == 'brightcove': player_code = self._download_webpage( - 'http://www.nowness.com/iframe?id=%s' % video_id, video_id, + f'http://www.nowness.com/iframe?id={video_id}', video_id, note='Downloading player JavaScript', errnote='Unable to download player JavaScript') bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) @@ -30,7 +27,7 @@ def _extract_url_result(self, post): return self.url_result(bc_url, BrightcoveNewIE.ie_key()) raise ExtractorError('Could not find player definition') elif source == 'vimeo': - return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo') + return self.url_result(f'http://vimeo.com/{video_id}', 'Vimeo') elif source == 'youtube': return self.url_result(video_id, 'Youtube') elif source == 'cinematique': @@ -40,7 +37,7 @@ def _extract_url_result(self, post): def _api_request(self, url, request_path): display_id = self._match_id(url) - request = sanitized_Request( + request = Request( 'http://api.nowness.com/api/' + request_path % display_id, headers={ 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us', @@ -141,4 +138,4 @@ def _real_extract(self, url): series_title = translations[0].get('title') or translations[0]['seoTitle'] series_description = translations[0].get('seoDescription') return self.playlist_result( - entries, compat_str(series['id']), series_title, series_description) + entries, str(series['id']), series_title, series_description) diff --git a/yt_dlp/extractor/noz.py b/yt_dlp/extractor/noz.py index 59d259f9df..8476a857ff 100644 --- a/yt_dlp/extractor/noz.py +++ b/yt_dlp/extractor/noz.py @@ -1,14 +1,16 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( - int_or_none, find_xpath_attr, - xpath_text, + int_or_none, update_url_query, + xpath_text, ) -from ..compat import compat_urllib_parse_unquote class NozIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?noz\.de/video/(?P<id>[0-9]+)/' _TESTS = [{ 'url': 'http://www.noz.de/video/25151/32-Deutschland-gewinnt-Badminton-Lnderspiel-in-Melle', @@ -34,9 +36,9 @@ def _real_extract(self, url): config_url_encoded = self._search_regex( r'so\.addVariable\("config_url","[^,]*,(.*?)"', - edge_content, 'config URL' + edge_content, 'config URL', ) - config_url = compat_urllib_parse_unquote(config_url_encoded) + config_url = urllib.parse.unquote(config_url_encoded) doc = self._download_xml(config_url, 'video configuration') title = xpath_text(doc, './/title') @@ -52,7 +54,7 @@ def _real_extract(self, url): formats.append({ 'url': http_url, 'format_name': xpath_text(qnode, './name'), - 'format_id': '%s-%s' % ('http', xpath_text(qnode, './id')), + 'format_id': '{}-{}'.format('http', xpath_text(qnode, './id')), 'height': int_or_none(xpath_text(qnode, './height')), 
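The noz.de loop above builds format dicts from per-quality XML nodes with yt-dlp's ElementTree helpers. A runnable sketch with an invented miniature config document: xpath_text() returns the first match's text (or None), and int_or_none(..., scale=1000) converts a bits/s sample to kbit/s, both as used in the surrounding code.

import xml.etree.ElementTree as ET

from yt_dlp.utils import int_or_none, xpath_text

# Invented miniature of the noz.de per-quality configuration XML.
qnode = ET.fromstring(
    '<quality><id>hq</id><height>720</height><bitrate>1500</bitrate></quality>')
fmt = {
    'format_id': 'http-{}'.format(xpath_text(qnode, './id')),
    'height': int_or_none(xpath_text(qnode, './height')),
    'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000),  # -> kbit/s
}
assert fmt == {'format_id': 'http-hq', 'height': 720, 'tbr': 1}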
'width': int_or_none(xpath_text(qnode, './width')), 'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000), diff --git a/yt_dlp/extractor/npo.py b/yt_dlp/extractor/npo.py index 40fee24d05..178fd98bf7 100644 --- a/yt_dlp/extractor/npo.py +++ b/yt_dlp/extractor/npo.py @@ -200,7 +200,7 @@ class NPOIE(InfoExtractor): def suitable(cls, url): return (False if any(ie.suitable(url) for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE)) - else super(NPOIE, cls).suitable(url)) + else super().suitable(url)) def _real_extract(self, url): video_id = self._match_id(url) @@ -228,7 +228,7 @@ def _real_extract(self, url): 'hasAdConsent': 0, }), headers={ 'x-xsrf-token': try_call(lambda: urllib.parse.unquote( - self._get_cookies('https://www.npostart.nl')['XSRF-TOKEN'].value)) + self._get_cookies('https://www.npostart.nl')['XSRF-TOKEN'].value)), }) player_token = player['token'] @@ -238,14 +238,14 @@ def _real_extract(self, url): formats = [] for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): streams = self._download_json( - 'https://start-player.npo.nl/video/%s/streams' % video_id, - video_id, 'Downloading %s profile JSON' % profile, fatal=False, + f'https://start-player.npo.nl/video/{video_id}/streams', + video_id, f'Downloading {profile} profile JSON', fatal=False, query={ 'profile': profile, 'quality': 'npoplus', 'tokenId': player_token, 'streamType': 'broadcast', - }) + }, data=b'') # endpoint requires POST if not streams: continue stream = streams.get('stream') @@ -339,7 +339,7 @@ class NPOLiveIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.npo.nl/live', 'only_matching': True, @@ -358,7 +358,7 @@ def _real_extract(self, url): return { '_type': 'url_transparent', - 'url': 'npo:%s' % live_id, + 'url': f'npo:{live_id}', 'ie_key': NPOIE.ie_key(), 'id': live_id, 'display_id': display_id, @@ -379,16 +379,16 @@ class NPORadioIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, } @classmethod def suitable(cls, url): - return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url) + return False if NPORadioFragmentIE.suitable(url) else super().suitable(url) @staticmethod def _html_get_attribute_regex(attribute): - return r'{0}\s*=\s*\'([^\']+)\''.format(attribute) + return rf'{attribute}\s*=\s*\'([^\']+)\'' def _real_extract(self, url): video_id = self._match_id(url) @@ -434,7 +434,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, audio_id) title = self._html_search_regex( - r'href="/radio/[^/]+/fragment/%s" title="([^"]+)"' % audio_id, + rf'href="/radio/[^/]+/fragment/{audio_id}" title="([^"]+)"', webpage, 'title') audio_url = self._search_regex( @@ -456,8 +456,8 @@ def _real_extract(self, url): return { '_type': 'url_transparent', 'ie_key': 'NPO', - 'url': 'npo:%s' % video_id, - 'display_id': display_id + 'url': f'npo:{video_id}', + 'display_id': display_id, } @@ -472,12 +472,12 @@ class SchoolTVIE(NPODataMidEmbedIE): 'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', 'title': 'Ademhaling: De hele dag haal je adem. 
Maar wat gebeurt er dan eigenlijk in je lichaam?', 'ext': 'mp4', - 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' + 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631', }, 'params': { # Skip because of m3u8 download - 'skip_download': True - } + 'skip_download': True, + }, } @@ -496,8 +496,8 @@ class HetKlokhuisIE(NPODataMidEmbedIE): 'upload_date': '20170223', }, 'params': { - 'skip_download': True - } + 'skip_download': True, + }, } @@ -508,7 +508,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) + self.url_result(f'npo:{video_id}' if not video_id.startswith('http') else video_id) for video_id in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, webpage)) ] @@ -574,9 +574,9 @@ class VPROIE(NPOPlaylistBaseIE): }, 'params': { # Skip because of m3u8 download - 'skip_download': True + 'skip_download': True, }, - } + }, ] diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py index 4b6855c93f..06103ff383 100644 --- a/yt_dlp/extractor/npr.py +++ b/yt_dlp/extractor/npr.py @@ -8,14 +8,14 @@ class NprIE(InfoExtractor): 'url': 'https://www.npr.org/sections/allsongs/2015/10/21/449974205/new-music-from-beach-house-chairlift-cmj-discoveries-and-more', 'info_dict': { 'id': '449974205', - 'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More' + 'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More', }, 'playlist_count': 7, }, { 'url': 'https://www.npr.org/sections/deceptivecadence/2015/10/09/446928052/music-from-the-shadows-ancient-armenian-hymns-and-piano-jazz', 'info_dict': { 'id': '446928052', - 'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'" + 'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'", }, 'playlist': [{ 'md5': '12fa60cb2d3ed932f53609d4aeceabf1', diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index 88d08e5e3a..658ae5f916 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -52,12 +52,12 @@ def _raise_error(self, data): msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + raise ExtractorError(f'{self.IE_NAME} said: {message}', expected=True) def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): return self._download_json( urljoin('https://psapi.nrk.no/', path), - video_id, note or 'Downloading %s JSON' % item, + video_id, note or f'Downloading {item} JSON', fatal=fatal, query=query) @@ -84,7 +84,7 @@ class NRKIE(NRKBaseIE): 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', 'duration': 262, - } + }, }, { # audio 'url': 'http://www.nrk.no/video/PS*154915', @@ -95,7 +95,7 @@ class NRKIE(NRKBaseIE): 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, - } + }, }, { 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', 'only_matching': True, @@ -148,14 +148,14 @@ def call_playback_api(item, query=None): try: return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query) except 
ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query) raise # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) - video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + video_id = try_get(manifest, lambda x: x['id'], str) or video_id if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -215,13 +215,13 @@ def call_playback_api(item, query=None): sub_key = str_or_none(sub.get('language')) or 'nb' sub_type = str_or_none(sub.get('type')) if sub_type: - sub_key += '-%s' % sub_type + sub_key += f'-{sub_type}' subtitles.setdefault(sub_key, []).append({ 'url': sub_url, }) legal_age = try_get( - data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + data, lambda x: x['legalAge']['body']['rating']['code'], str) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority age_limit = None if legal_age: @@ -242,13 +242,13 @@ def call_playback_api(item, query=None): 'age_limit': age_limit, 'formats': formats, 'subtitles': subtitles, - 'timestamp': parse_iso8601(try_get(manifest, lambda x: x['availability']['onDemand']['from'], str)) + 'timestamp': parse_iso8601(try_get(manifest, lambda x: x['availability']['onDemand']['from'], str)), } if is_series: series = season_id = season_number = episode = episode_number = None programs = self._call_api( - 'programs/%s' % video_id, video_id, 'programs', fatal=False) + f'programs/{video_id}', video_id, 'programs', fatal=False) if programs and isinstance(programs, dict): series = str_or_none(programs.get('seriesTitle')) season_id = str_or_none(programs.get('seasonId')) @@ -258,7 +258,7 @@ def call_playback_api(item, query=None): if not series: series = title if alt_title: - title += ' - %s' % alt_title + title += f' - {alt_title}' if not season_number: season_number = int_or_none(self._search_regex( r'Sesong\s+(\d+)', description or '', 'season number', @@ -288,7 +288,7 @@ def call_playback_api(item, query=None): class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE + _VALID_URL = rf'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*{_EPISODE_RE}' _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', @@ -305,7 +305,7 @@ class NRKTVIE(InfoExtractor): }], 'nb-ttv': [{ 'ext': 'vtt', - }] + }], }, }, }, { @@ -410,7 +410,7 @@ class NRKTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( - 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + f'nrk:{video_id}', ie=NRKIE.ie_key(), video_id=video_id) class NRKTVEpisodeIE(InfoExtractor): @@ -460,14 +460,14 @@ def _real_extract(self, url): info = self._search_json_ld(webpage, display_id, default={}) nrk_id = info.get('@id') or self._html_search_meta( 'nrk:program-id', webpage, default=None) or self._search_regex( - r'data-program-id=["\'](%s)' % NRKTVIE._EPISODE_RE, webpage, + rf'data-program-id=["\']({NRKTVIE._EPISODE_RE})', webpage, 'nrk id') assert re.match(NRKTVIE._EPISODE_RE, nrk_id) info.update({ '_type': 'url', 'id': nrk_id, - 'url': 'nrk:%s' % nrk_id, + 'url': f'nrk:{nrk_id}', 'ie_key': NRKIE.ie_key(), 'season_number': int(season_number), 
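call_playback_api() above retries the legacy endpoint only when the newer one answers HTTP 400. Here is a framework-free sketch of that fallback under an invented host and paths; inside an extractor the check would be on e.cause.status of the wrapped networking HTTPError, exactly as in the NRK code.

import json
import urllib.error
import urllib.request

def fetch_manifest(video_id):
    # Sketch only: api.example.com and both paths are placeholders.
    def get(path):
        with urllib.request.urlopen(f'https://api.example.com{path}') as resp:
            return json.load(resp)
    try:
        return get(f'/playback/manifest/program/{video_id}')
    except urllib.error.HTTPError as e:
        if e.code != 400:
            raise
        # Older item layout: drop the /program/ segment and retry once.
        return get(f'/playback/manifest/{video_id}')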
'episode_number': int(episode_number), @@ -482,13 +482,13 @@ def _extract_entries(self, entry_list): entries = [] for episode in entry_list: nrk_id = episode.get('prfId') or episode.get('episodeId') - if not nrk_id or not isinstance(nrk_id, compat_str): + if not nrk_id or not isinstance(nrk_id, str): continue entries.append(self.url_result( - 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) + f'nrk:{nrk_id}', ie=NRKIE.ie_key(), video_id=nrk_id)) return entries - _ASSETS_KEYS = ('episodes', 'instalments',) + _ASSETS_KEYS = ('episodes', 'instalments') def _extract_assets_key(self, embedded): for asset_key in self._ASSETS_KEYS: @@ -513,19 +513,18 @@ def _entries(self, data, display_id): (lambda x: x[assets_key]['_embedded'][assets_key], lambda x: x[assets_key]), list) - for e in self._extract_entries(entries): - yield e + yield from self._extract_entries(entries) # Find next URL next_url_path = try_get( data, (lambda x: x['_links']['next']['href'], lambda x: x['_embedded'][assets_key]['_links']['next']['href']), - compat_str) + str) if not next_url_path: break data = self._call_api( next_url_path, display_id, - note='Downloading %s JSON page %d' % (assets_key, page_num), + note=f'Downloading {assets_key} JSON page {page_num}', fatal=False) if not data: break @@ -592,7 +591,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): @classmethod def suitable(cls, url): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) - else super(NRKTVSeasonIE, cls).suitable(url)) + else super().suitable(url)) def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -600,14 +599,13 @@ def _real_extract(self, url): serie_kind = mobj.group('serie_kind') serie = mobj.group('serie') season_id = mobj.group('id') or mobj.group('id_2') - display_id = '%s/%s' % (serie, season_id) + display_id = f'{serie}/{season_id}' data = self._call_api( - '%s/catalog/%s/%s/seasons/%s' - % (domain, self._catalog_name(serie_kind), serie, season_id), + f'{domain}/catalog/{self._catalog_name(serie_kind)}/{serie}/seasons/{season_id}', display_id, 'season', query={'pageSize': 50}) - title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id + title = try_get(data, lambda x: x['titles']['title'], str) or display_id return self.playlist_result( self._entries(data, display_id), display_id, title) @@ -690,7 +688,7 @@ def suitable(cls, url): return ( False if any(ie.suitable(url) for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) - else super(NRKTVSeriesIE, cls).suitable(url)) + else super().suitable(url)) def _real_extract(self, url): site, serie_kind, series_id = self._match_valid_url(url).groups() @@ -699,8 +697,7 @@ def _real_extract(self, url): size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( - '%s/catalog/%s/%s' - % (domain, self._catalog_name(serie_kind), series_id), + f'{domain}/catalog/{self._catalog_name(serie_kind)}/{series_id}', series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], @@ -718,8 +715,8 @@ def _real_extract(self, url): season_url = urljoin(url, season.get('href')) if not season_url: season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): - season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_name and isinstance(season_name, str): + season_url = f'https://{domain}.nrk.no/serie/{series_id}/sesong/{season_name}' if season_url: entries.append(self.url_result( 
season_url, ie=NRKTVSeasonIE.ie_key(), @@ -776,7 +773,7 @@ class NRKRadioPodkastIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( - 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + f'nrk:{video_id}', ie=NRKIE.ie_key(), video_id=video_id) class NRKPlaylistBaseIE(InfoExtractor): @@ -789,7 +786,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('nrk:%s' % video_id, NRKIE.ie_key()) + self.url_result(f'nrk:{video_id}', NRKIE.ie_key()) for video_id in re.findall(self._ITEM_RE, webpage) ] @@ -830,7 +827,7 @@ def _extract_description(self, webpage): class NRKTVEpisodesIE(NRKPlaylistBaseIE): _VALID_URL = r'https?://tv\.nrk\.no/program/[Ee]pisodes/[^/]+/(?P<id>\d+)' - _ITEM_RE = r'data-episode=["\']%s' % NRKTVIE._EPISODE_RE + _ITEM_RE = rf'data-episode=["\']{NRKTVIE._EPISODE_RE}' _TESTS = [{ 'url': 'https://tv.nrk.no/program/episodes/nytt-paa-nytt/69031', 'info_dict': { @@ -868,7 +865,7 @@ def _real_extract(self, url): video_id = self._match_id(url) nrk_id = self._download_json( - 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + f'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/{video_id}', video_id)['psId'] - return self.url_result('nrk:%s' % nrk_id) + return self.url_result(f'nrk:{nrk_id}') diff --git a/yt_dlp/extractor/nrl.py b/yt_dlp/extractor/nrl.py index 798d03417b..1e8cf0b754 100644 --- a/yt_dlp/extractor/nrl.py +++ b/yt_dlp/extractor/nrl.py @@ -2,6 +2,7 @@ class NRLTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'https?://(?:www\.)?nrl\.com/tv(/[^/]+)*/(?P<id>[^/?&#]+)' _TEST = { 'url': 'https://www.nrl.com/tv/news/match-highlights-titans-v-knights-862805/', diff --git a/yt_dlp/extractor/nts.py b/yt_dlp/extractor/nts.py new file mode 100644 index 0000000000..a801740fa5 --- /dev/null +++ b/yt_dlp/extractor/nts.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import parse_iso8601, url_or_none +from ..utils.traversal import traverse_obj + + +class NTSLiveIE(InfoExtractor): + IE_NAME = 'nts.live' + _VALID_URL = r'https?://(?:www\.)?nts\.live/shows/[^/?#]+/episodes/(?P<id>[^/?#]+)' + _TESTS = [ + { + # embedded soundcloud + 'url': 'https://www.nts.live/shows/yu-su/episodes/yu-su-2nd-april-2024', + 'md5': 'b5444c04888c869d68758982de1a27d8', + 'info_dict': { + 'id': '1791563518', + 'ext': 'opus', + 'uploader_id': '995579326', + 'title': 'Pender Street Steppers & YU SU', + 'timestamp': 1712073600, + 'upload_date': '20240402', + 'thumbnail': 'https://i1.sndcdn.com/artworks-qKcNO0z0AQGGbv9s-GljJCw-original.jpg', + 'license': 'all-rights-reserved', + 'repost_count': int, + 'uploader_url': 'https://soundcloud.com/user-643553014', + 'uploader': 'NTS Latest', + 'description': 'md5:cd00ac535a63caaad722483ae3ff802a', + 'duration': 10784.157, + 'genres': ['Deep House', 'House', 'Leftfield Disco', 'Jazz Fusion', 'Dream Pop'], + 'modified_timestamp': 1712564687, + 'modified_date': '20240408', + }, + }, + { + # embedded mixcloud + 'url': 'https://www.nts.live/shows/absolute-fiction/episodes/absolute-fiction-23rd-july-2022', + 'info_dict': { + 'id': 'NTSRadio_absolute-fiction-23rd-july-2022', + 'ext': 'webm', + 'like_count': int, + 'title': 'Absolute Fiction', + 'comment_count': int, + 'uploader_url': 'https://www.mixcloud.com/NTSRadio/', + 'description': 'md5:ba49da971ae8d71ee45813c52c5e2a04', + 'tags': [], + 'duration': 3529, + 'timestamp': 1658588400, + 'repost_count': int, + 'upload_date': '20220723', + 'uploader_id': 
'NTSRadio', + 'thumbnail': 'https://thumbnailer.mixcloud.com/unsafe/1024x1024/extaudio/5/1/a/d/ae3e-1be9-4fd4-983e-9c3294226eac', + 'uploader': 'Mixcloud NTS Radio', + 'genres': ['Minimal Synth', 'Post Punk', 'Industrial '], + 'modified_timestamp': 1658842165, + 'modified_date': '20220726', + }, + 'params': {'skip_download': 'm3u8'}, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data = self._search_json(r'window\._REACT_STATE_\s*=', webpage, 'react state', video_id) + + return { + '_type': 'url_transparent', + **traverse_obj(data, ('episode', { + 'url': ('audio_sources', ..., 'url', {url_or_none}, any), + 'title': ('name', {str}), + 'description': ('description', {str}), + 'genres': ('genres', ..., 'value', {str}), + 'timestamp': ('broadcast', {parse_iso8601}), + 'modified_timestamp': ('updated', {parse_iso8601}), + })), + } diff --git a/yt_dlp/extractor/ntvde.py b/yt_dlp/extractor/ntvde.py index 6d7ea3d184..9f3a498ab3 100644 --- a/yt_dlp/extractor/ntvde.py +++ b/yt_dlp/extractor/ntvde.py @@ -1,21 +1,21 @@ import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, - parse_duration, + url_or_none, ) +from ..utils.traversal import traverse_obj class NTVDeIE(InfoExtractor): IE_NAME = 'n-tv.de' - _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/videos/[^/?#]+/[^/?#]+-article(?P<id>.+)\.html' + _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/(?:videos|magazine)/[^/?#]+/[^/?#]+-article(?P<id>[^/?#]+)\.html' _TESTS = [{ 'url': 'http://www.n-tv.de/mediathek/videos/panorama/Schnee-und-Glaette-fuehren-zu-zahlreichen-Unfaellen-und-Staus-article14438086.html', - 'md5': '6ef2514d4b1e8e03ca24b49e2f167153', + 'md5': '6bcf2a6638cb83f45d5561659a1cb498', 'info_dict': { 'id': '14438086', 'ext': 'mp4', @@ -23,51 +23,61 @@ class NTVDeIE(InfoExtractor): 'title': 'Schnee und Glätte führen zu zahlreichen Unfällen und Staus', 'alt_title': 'Winterchaos auf deutschen Straßen', 'description': 'Schnee und Glätte sorgen deutschlandweit für einen chaotischen Start in die Woche: Auf den Straßen kommt es zu kilometerlangen Staus und Dutzenden Glätteunfällen. In Düsseldorf und München wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende Flüge landen zu spät, einige fallen ganz aus.', - 'duration': 4020, + 'duration': 67, 'timestamp': 1422892797, 'upload_date': '20150202', }, + }, { + 'url': 'https://www.n-tv.de/mediathek/magazine/auslandsreport/Juedische-Siedler-wollten-Rache-die-wollten-nur-toeten-article24523089.html', + 'md5': 'c5c6014c014ccc3359470e1d34472bfd', + 'info_dict': { + 'id': '24523089', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Jüdische Siedler "wollten Rache, die wollten nur töten"', + 'alt_title': 'Israelische Gewalt fern von Gaza', + 'description': 'Vier Tage nach dem Massaker der Hamas greifen jüdische Siedler das Haus einer palästinensischen Familie im Westjordanland an. Die Überlebenden berichten, sie waren unbewaffnet, die Angreifer seien nur auf "Rache und Töten" aus gewesen. 
Als die Toten beerdigt werden sollen, eröffnen die Siedler erneut das Feuer.', + 'duration': 326, + 'timestamp': 1699688294, + 'upload_date': '20231111', + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - info = self._parse_json(self._search_regex( - r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'), - video_id, transform_source=js_to_json) - timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) - vdata = self._parse_json(self._search_regex( - r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', - webpage, 'player data'), video_id, - transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) - duration = parse_duration(vdata.get('duration')) + info = self._search_json( + r'article:', webpage, 'info', video_id, transform_source=js_to_json) + + vdata = self._search_json( + r'\$\(\s*"#playerwrapper"\s*\)\s*\.data\(\s*"player",', + webpage, 'player data', video_id, + transform_source=lambda s: js_to_json(re.sub(r'ivw:[^},]+', '', s)))['setup']['source'] formats = [] - if vdata.get('video'): + if vdata.get('progressive'): formats.append({ - 'format_id': 'flash', - 'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'], + 'format_id': 'http', + 'url': vdata['progressive'], }) - if vdata.get('videoMp4'): - formats.append({ - 'format_id': 'mobile', - 'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']), - 'tbr': 400, # estimation - }) - if vdata.get('videoM3u8'): - m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8']) + if vdata.get('hls'): formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - quality=1, m3u8_id='hls', fatal=False)) + vdata['hls'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + if vdata.get('dash'): + formats.extend(self._extract_mpd_formats(vdata['dash'], video_id, fatal=False, mpd_id='dash')) return { 'id': video_id, - 'title': info['headline'], - 'description': info.get('intro'), - 'alt_title': info.get('kicker'), - 'timestamp': timestamp, - 'thumbnail': vdata.get('html5VideoPoster'), - 'duration': duration, + **traverse_obj(info, { + 'title': 'headline', + 'description': 'intro', + 'alt_title': 'kicker', + 'timestamp': ('publishedDateAsUnixTimeStamp', {int_or_none}), + }), + **traverse_obj(vdata, { + 'thumbnail': ('poster', {url_or_none}), + 'duration': ('length', {int_or_none}), + }), 'formats': formats, } diff --git a/yt_dlp/extractor/ntvru.py b/yt_dlp/extractor/ntvru.py index 8d5877daa0..1ab1be0f60 100644 --- a/yt_dlp/extractor/ntvru.py +++ b/yt_dlp/extractor/ntvru.py @@ -21,6 +21,7 @@ class NTVRuIE(InfoExtractor): 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', 'thumbnail': r're:^http://.*\.jpg', 'duration': 136, + 'view_count': int, }, }, { 'url': 'http://www.ntv.ru/video/novosti/750370/', @@ -32,7 +33,9 @@ class NTVRuIE(InfoExtractor): 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', 'thumbnail': r're:^http://.*\.jpg', 'duration': 172, + 'view_count': int, }, + 'skip': '404 Not Found', }, { 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', 'md5': '82dbd49b38e3af1d00df16acbeab260c', @@ -43,6 +46,7 @@ class NTVRuIE(InfoExtractor): 'description': '«Сегодня». 21 марта 2014 года. 
16:00', 'thumbnail': r're:^http://.*\.jpg', 'duration': 1496, + 'view_count': int, }, }, { 'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/', @@ -54,6 +58,7 @@ class NTVRuIE(InfoExtractor): 'description': 'Остросюжетный фильм «Кома»', 'thumbnail': r're:^http://.*\.jpg', 'duration': 5592, + 'view_count': int, }, }, { 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', @@ -65,6 +70,7 @@ class NTVRuIE(InfoExtractor): 'description': '«Дело врачей»: «Деревце жизни»', 'thumbnail': r're:^http://.*\.jpg', 'duration': 2590, + 'view_count': int, }, }, { # Schemeless file URL @@ -73,7 +79,8 @@ class NTVRuIE(InfoExtractor): }] _VIDEO_ID_REGEXES = [ - r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)', + r'<meta property="og:url" content="https?://www\.ntv\.ru/video/(\d+)', + r'<meta property="og:video:(?:url|iframe)" content="https?://www\.ntv\.ru/embed/(\d+)', r'<video embed=[^>]+><id>(\d+)</id>', r'<video restriction[^>]+><key>(\d+)</key>', ] @@ -95,7 +102,7 @@ def _real_extract(self, url): self._VIDEO_ID_REGEXES, webpage, 'video id') player = self._download_xml( - 'http://www.ntv.ru/vi%s/' % video_id, + f'http://www.ntv.ru/vi{video_id}/', video_id, 'Downloading video XML') title = strip_or_none(unescapeHTML(xpath_text(player, './data/title', 'title', fatal=True))) @@ -104,7 +111,7 @@ def _real_extract(self, url): formats = [] for format_id in ['', 'hi', 'webm']: - file_ = xpath_text(video, './%sfile' % format_id) + file_ = xpath_text(video, f'./{format_id}file') if not file_: continue if file_.startswith('//'): @@ -113,8 +120,16 @@ def _real_extract(self, url): file_ = 'http://media.ntv.ru/vod/' + file_ formats.append({ 'url': file_, - 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)), + 'filesize': int_or_none(xpath_text(video, f'./{format_id}size')), }) + hls_manifest = xpath_text(video, './playback/hls') + if hls_manifest: + formats.extend(self._extract_m3u8_formats( + hls_manifest, video_id, m3u8_id='hls', fatal=False)) + dash_manifest = xpath_text(video, './playback/dash') + if dash_manifest: + formats.extend(self._extract_mpd_formats( + dash_manifest, video_id, mpd_id='dash', fatal=False)) return { 'id': xpath_text(video, './id'), diff --git a/yt_dlp/extractor/nubilesporn.py b/yt_dlp/extractor/nubilesporn.py new file mode 100644 index 0000000000..c2079d8b07 --- /dev/null +++ b/yt_dlp/extractor/nubilesporn.py @@ -0,0 +1,99 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + format_field, + get_element_by_class, + get_element_by_id, + get_element_html_by_class, + get_elements_by_class, + int_or_none, + try_call, + unified_timestamp, + urlencode_postdata, +) + + +class NubilesPornIE(InfoExtractor): + _NETRC_MACHINE = 'nubiles-porn' + _VALID_URL = r'''(?x) + https://members\.nubiles-porn\.com/video/watch/(?P<id>\d+) + (?:/(?P<display_id>[\w\-]+-s(?P<season>\d+)e(?P<episode>\d+)))? 
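+        # optional display-id slug, e.g. trying-to-focus-my-one-track-mind-s3e1;
+        # its s<season>e<episode> suffix supplies season_number/episode_number below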
+ ''' + + _TESTS = [{ + 'url': 'https://members.nubiles-porn.com/video/watch/165320/trying-to-focus-my-one-track-mind-s3e1', + 'md5': 'fa7f09da8027c35e4bdf0f94f55eac82', + 'info_dict': { + 'id': '165320', + 'title': 'Trying To Focus My One Track Mind - S3:E1', + 'ext': 'mp4', + 'display_id': 'trying-to-focus-my-one-track-mind-s3e1', + 'thumbnail': 'https://images.nubiles-porn.com/videos/trying_to_focus_my_one_track_mind/samples/cover1280.jpg', + 'description': 'md5:81f3d4372e0e39bff5c801da277a5141', + 'timestamp': 1676160000, + 'upload_date': '20230212', + 'channel': 'Younger Mommy', + 'channel_id': '64', + 'channel_url': 'https://members.nubiles-porn.com/video/website/64', + 'like_count': int, + 'average_rating': float, + 'age_limit': 18, + 'categories': ['Big Boobs', 'Big Naturals', 'Blowjob', 'Brunette', 'Cowgirl', 'Girl Orgasm', 'Girl-Boy', + 'Glasses', 'Hardcore', 'Milf', 'Shaved Pussy', 'Tattoos', 'YoungerMommy.com'], + 'tags': list, + 'cast': ['Kenzie Love'], + 'availability': 'needs_auth', + 'series': 'Younger Mommy', + 'series_id': '64', + 'season': 'Season 3', + 'season_number': 3, + 'episode': 'Episode 1', + 'episode_number': 1, + }, + }] + + def _perform_login(self, username, password): + login_webpage = self._download_webpage('https://nubiles-porn.com/login', video_id=None) + inputs = self._hidden_inputs(login_webpage) + inputs.update({'username': username, 'password': password}) + self._request_webpage('https://nubiles-porn.com/authentication/login', None, data=urlencode_postdata(inputs)) + + def _real_extract(self, url): + url_match = self._match_valid_url(url) + video_id = url_match.group('id') + page = self._download_webpage(url, video_id) + + media_entries = self._parse_html5_media_entries( + url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0] + + channel_id, channel_name = self._search_regex( + r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page), + 'channel', fatal=False, group=('id', 'name')) or (None, None) + channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) + + return { + 'id': video_id, + 'title': self._search_regex('<h2>([^<]+)</h2>', page, 'title', fatal=False), + 'formats': media_entries.get('formats'), + 'display_id': url_match.group('display_id'), + 'thumbnail': media_entries.get('thumbnail'), + 'description': clean_html(get_element_html_by_class('content-pane-description', page)), + 'timestamp': unified_timestamp(get_element_by_class('date', page)), + 'channel': channel_name, + 'channel_id': channel_id, + 'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'), + 'like_count': int_or_none(get_element_by_id('likecount', page)), + 'average_rating': float_or_none(get_element_by_class('score', page)), + 'age_limit': 18, + 'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))), + 'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))), + 'cast': get_elements_by_class('content-pane-performer', page), + 'availability': 'needs_auth', + 'series': channel_name, + 'series_id': channel_id, + 'season_number': int_or_none(url_match.group('season')), + 'episode_number': int_or_none(url_match.group('episode')), + } diff --git a/yt_dlp/extractor/nuevo.py b/yt_dlp/extractor/nuevo.py index ec54041f12..945fd0c9ce 100644 --- a/yt_dlp/extractor/nuevo.py +++ b/yt_dlp/extractor/nuevo.py @@ -1,9 +1,5 @@ from .common import 
InfoExtractor - -from ..utils import ( - float_or_none, - xpath_text -) +from ..utils import float_or_none, xpath_text class NuevoBaseIE(InfoExtractor): @@ -32,5 +28,5 @@ def _extract_nuevo(self, config_url, video_id, headers={}): 'title': title, 'thumbnail': thumbnail, 'duration': duration, - 'formats': formats + 'formats': formats, } diff --git a/yt_dlp/extractor/nuum.py b/yt_dlp/extractor/nuum.py new file mode 100644 index 0000000000..697fc6b32e --- /dev/null +++ b/yt_dlp/extractor/nuum.py @@ -0,0 +1,201 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + UserNotLive, + filter_dict, + int_or_none, + parse_iso8601, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class NuumBaseIE(InfoExtractor): + def _call_api(self, path, video_id, description, query={}): + response = self._download_json( + f'https://nuum.ru/api/v2/{path}', video_id, query=query, + note=f'Downloading {description} metadata', + errnote=f'Unable to download {description} metadata') + if error := response.get('error'): + raise ExtractorError(f'API returned error: {error!r}') + return response['result'] + + def _get_channel_info(self, channel_name): + return self._call_api( + 'broadcasts/public', video_id=channel_name, description='channel', + query={ + 'with_extra': 'true', + 'channel_name': channel_name, + 'with_deleted': 'true', + }) + + def _parse_video_data(self, container, extract_formats=True): + stream = traverse_obj(container, ('media_container_streams', 0, {dict})) or {} + media = traverse_obj(stream, ('stream_media', 0, {dict})) or {} + media_url = traverse_obj(media, ( + 'media_meta', ('media_archive_url', 'media_url'), {url_or_none}), get_all=False) + + video_id = str(container['media_container_id']) + is_live = media.get('media_status') == 'RUNNING' + + formats, subtitles = None, None + headers = {'Referer': 'https://nuum.ru/'} + if extract_formats: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + media_url, video_id, 'mp4', live=is_live, headers=headers) + + return filter_dict({ + 'id': video_id, + 'is_live': is_live, + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': headers, + **traverse_obj(container, { + 'title': ('media_container_name', {str}), + 'description': ('media_container_description', {str}), + 'timestamp': ('created_at', {parse_iso8601}), + 'channel': ('media_container_channel', 'channel_name', {str}), + 'channel_id': ('media_container_channel', 'channel_id', {str_or_none}), + }), + **traverse_obj(stream, { + 'view_count': ('stream_total_viewers', {int_or_none}), + 'concurrent_view_count': ('stream_current_viewers', {int_or_none}), + }), + **traverse_obj(media, { + 'duration': ('media_duration', {int_or_none}), + 'thumbnail': ('media_meta', ('media_preview_archive_url', 'media_preview_url'), {url_or_none}), + }, get_all=False), + }) + + +class NuumMediaIE(NuumBaseIE): + IE_NAME = 'nuum:media' + _VALID_URL = r'https?://nuum\.ru/(?:streams|videos|clips)/(?P<id>[\d]+)' + _TESTS = [{ + 'url': 'https://nuum.ru/streams/1592713-7-days-to-die', + 'only_matching': True, + }, { + 'url': 'https://nuum.ru/videos/1567547-toxi-hurtz', + 'md5': 'ce28837a5bbffe6952d7bfd3d39811b0', + 'info_dict': { + 'id': '1567547', + 'ext': 'mp4', + 'title': 'Toxi$ - Hurtz', + 'description': '', + 'timestamp': 1702631651, + 'upload_date': '20231215', + 'thumbnail': r're:^https?://.+\.jpg', + 'view_count': int, + 'concurrent_view_count': int, + 'channel_id': '6911', + 'channel': 'toxis', + 
'duration': 116, + }, + }, { + 'url': 'https://nuum.ru/clips/1552564-pro-misu', + 'md5': 'b248ae1565b1e55433188f11beeb0ca1', + 'info_dict': { + 'id': '1552564', + 'ext': 'mp4', + 'title': 'Про Мису 🙃', + 'timestamp': 1701971828, + 'upload_date': '20231207', + 'thumbnail': r're:^https?://.+\.jpg', + 'view_count': int, + 'concurrent_view_count': int, + 'channel_id': '3320', + 'channel': 'Misalelik', + 'duration': 41, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api(f'media-containers/{video_id}', video_id, 'media') + + return self._parse_video_data(video_data) + + +class NuumLiveIE(NuumBaseIE): + IE_NAME = 'nuum:live' + _VALID_URL = r'https?://nuum\.ru/channel/(?P<id>[^/#?]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://nuum.ru/channel/mts_live', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel = self._match_id(url) + channel_info = self._get_channel_info(channel) + if traverse_obj(channel_info, ('channel', 'channel_is_live')) is False: + raise UserNotLive(video_id=channel) + + info = self._parse_video_data(channel_info['media_container']) + return { + 'webpage_url': f'https://nuum.ru/streams/{info["id"]}', + 'extractor_key': NuumMediaIE.ie_key(), + 'extractor': NuumMediaIE.IE_NAME, + **info, + } + + +class NuumTabIE(NuumBaseIE): + IE_NAME = 'nuum:tab' + _VALID_URL = r'https?://nuum\.ru/channel/(?P<id>[^/#?]+)/(?P<type>streams|videos|clips)' + _TESTS = [{ + 'url': 'https://nuum.ru/channel/dankon_/clips', + 'info_dict': { + 'id': 'dankon__clips', + 'title': 'Dankon_', + }, + 'playlist_mincount': 29, + }, { + 'url': 'https://nuum.ru/channel/dankon_/videos', + 'info_dict': { + 'id': 'dankon__videos', + 'title': 'Dankon_', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://nuum.ru/channel/dankon_/streams', + 'info_dict': { + 'id': 'dankon__streams', + 'title': 'Dankon_', + }, + 'playlist_mincount': 1, + }] + + _PAGE_SIZE = 50 + + def _fetch_page(self, channel_id, tab_type, tab_id, page): + CONTAINER_TYPES = { + 'clips': ['SHORT_VIDEO', 'REVIEW_VIDEO'], + 'videos': ['LONG_VIDEO'], + 'streams': ['SINGLE'], + } + + media_containers = self._call_api( + 'media-containers', video_id=tab_id, description=f'{tab_type} tab page {page + 1}', + query={ + 'limit': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + 'channel_id': channel_id, + 'media_container_status': 'STOPPED', + 'media_container_type': CONTAINER_TYPES[tab_type], + }) + for container in traverse_obj(media_containers, (..., {dict})): + metadata = self._parse_video_data(container, extract_formats=False) + yield self.url_result(f'https://nuum.ru/videos/{metadata["id"]}', NuumMediaIE, **metadata) + + def _real_extract(self, url): + channel_name, tab_type = self._match_valid_url(url).group('id', 'type') + tab_id = f'{channel_name}_{tab_type}' + channel_data = self._get_channel_info(channel_name)['channel'] + + return self.playlist_result(OnDemandPagedList(functools.partial( + self._fetch_page, channel_data['channel_id'], tab_type, tab_id), self._PAGE_SIZE), + playlist_id=tab_id, playlist_title=channel_data.get('channel_name')) diff --git a/yt_dlp/extractor/nuvid.py b/yt_dlp/extractor/nuvid.py index 6ac351cb03..99a426b25f 100644 --- a/yt_dlp/extractor/nuvid.py +++ b/yt_dlp/extractor/nuvid.py @@ -2,8 +2,8 @@ from .common import InfoExtractor from ..utils import ( - parse_duration, int_or_none, + parse_duration, strip_or_none, traverse_obj, url_or_none, @@ -22,7 +22,7 @@ class NuvidIE(InfoExtractor): 'duration': 321.0, 'age_limit': 18, 'thumbnail': 
r're:https?://.+\.jpg', - } + }, }, { 'url': 'https://m.nuvid.com/video/6523263', 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52', @@ -34,7 +34,7 @@ class NuvidIE(InfoExtractor): 'age_limit': 18, 'thumbnail': r're:https?://.+\.jpg', 'thumbnails': list, - } + }, }, { 'url': 'http://m.nuvid.com/video/6415801/', 'md5': '638d5ececb138d5753593f751ae3f697', @@ -45,7 +45,7 @@ class NuvidIE(InfoExtractor): 'duration': 1882, 'age_limit': 18, 'thumbnail': r're:https?://.+\.jpg', - } + }, }] def _real_extract(self, url): @@ -64,7 +64,7 @@ def _real_extract(self, url): }) webpage = self._download_webpage( - 'http://m.nuvid.com/video/%s' % (video_id, ), + f'http://m.nuvid.com/video/{video_id}', video_id, 'Downloading video page', fatal=False) or '' title = strip_or_none(video_data.get('title') or self._html_search_regex( diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index 2e21edbb41..5ec3cdd675 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -1,50 +1,93 @@ -import hmac -import hashlib -import base64 +import json +import uuid from .common import InfoExtractor from ..utils import ( + ExtractorError, + clean_html, determine_ext, + extract_attributes, float_or_none, + get_elements_html_by_class, int_or_none, - js_to_json, + merge_dicts, mimetype2ext, parse_iso8601, + remove_end, remove_start, + str_or_none, + traverse_obj, + url_or_none, ) class NYTimesBaseIE(InfoExtractor): - _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v' + _DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d') + _TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB' + _GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2' + _GRAPHQL_QUERY = '''query VideoQuery($id: String!) { + video(id: $id) { + ... on Video { + bylines { + renderedRepresentation + } + duration + firstPublished + promotionalHeadline + promotionalMedia { + ... 
on Image { + crops { + name + renditions { + name + width + height + url + } + } + } + } + renditions { + type + width + height + url + bitrate + } + summary + } + } +}''' - def _extract_video_from_id(self, video_id): - # Authorization generation algorithm is reverse engineered from `signer` in - # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js - path = '/svc/video/api/v3/video/' + video_id - hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest() - video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={ - 'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(), - 'X-NYTV': 'vhs', - }, fatal=False) - if not video_data: - video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id, - video_id, 'Downloading video JSON') + def _call_api(self, media_id): + # reference: `id-to-uri.js` + video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video') + media_uuid = uuid.uuid5(video_uuid, media_id) - title = video_data['headline'] + return traverse_obj(self._download_json( + self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API', data=json.dumps({ + 'query': self._GRAPHQL_QUERY, + 'variables': {'id': f'nyt://video/{media_uuid}'}, + }, separators=(',', ':')).encode(), headers={ + 'Content-Type': 'application/json', + 'Nyt-App-Type': 'vhs', + 'Nyt-App-Version': 'v3.52.21', + 'Nyt-Token': self._TOKEN, + 'Origin': 'https://nytimes.com', + }, fatal=False), ('data', 'video', {dict})) or {} - def get_file_size(file_size): - if isinstance(file_size, int): - return file_size - elif isinstance(file_size, dict): - return int(file_size.get('value', 0)) - else: - return None + def _extract_thumbnails(self, thumbs): + return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), default=None) + def _extract_formats_and_subtitles(self, video_id, content_media_json): urls = [] formats = [] subtitles = {} - for video in video_data.get('renditions', []): + for video in traverse_obj(content_media_json, ('renditions', ..., {dict})): video_url = video.get('url') format_id = video.get('type') if not video_url or format_id == 'thumbs' or video_url in urls: @@ -56,11 +99,9 @@ def get_file_size(file_size): video_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id or 'hls', fatal=False) formats.extend(m3u8_fmts) - subtitles = self._merge_subtitles(subtitles, m3u8_subs) + self._merge_subtitles(m3u8_subs, target=subtitles) elif ext == 'mpd': - continue - # formats.extend(self._extract_mpd_formats( - # video_url, video_id, format_id or 'dash', fatal=False)) + continue # all mpd urls give 404 errors else: formats.append({ 'url': video_url, @@ -68,55 +109,50 @@ def get_file_size(file_size): 'vcodec': video.get('videoencoding') or video.get('video_codec'), 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), - 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), + 'filesize': traverse_obj(video, ( + ('file_size', 'fileSize'), (None, ('value')), {int_or_none}), get_all=False), 'tbr': int_or_none(video.get('bitrate'), 1000) or None, 'ext': ext, }) - thumbnails = [] - for image in video_data.get('images', []): - image_url = image.get('url') - if not image_url: - continue - thumbnails.append({ - 'url': 'http://www.nytimes.com/' + image_url, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - }) + 
return formats, subtitles - publication_date = video_data.get('publication_date') - timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None + def _extract_video(self, media_id): + data = self._call_api(media_id) + formats, subtitles = self._extract_formats_and_subtitles(media_id, data) return { - 'id': video_id, - 'title': title, - 'description': video_data.get('summary'), - 'timestamp': timestamp, - 'uploader': video_data.get('byline'), - 'duration': float_or_none(video_data.get('duration'), 1000), + 'id': media_id, + 'title': data.get('promotionalHeadline'), + 'description': data.get('summary'), + 'timestamp': parse_iso8601(data.get('firstPublished')), + 'duration': float_or_none(data.get('duration'), scale=1000), + 'creator': ', '.join(traverse_obj(data, ( # TODO: change to 'creators' + 'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))), 'formats': formats, 'subtitles': subtitles, - 'thumbnails': thumbnails, + 'thumbnails': self._extract_thumbnails( + traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))), } class NYTimesIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>'] - _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', - 'md5': 'd665342765db043f7e225cff19df0f2d', + 'md5': 'a553aa344014e3723d33893d89d4defc', 'info_dict': { 'id': '100000002847155', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Verbatim: What Is a Photocopier?', 'description': 'md5:93603dada88ddbda9395632fdc5da260', - 'timestamp': 1398631707, - 'upload_date': '20140427', - 'uploader': 'Brett Weiner', + 'timestamp': 1398646132, + 'upload_date': '20140428', + 'creator': 'Brett Weiner', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg', 'duration': 419, - } + }, }, { 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html', 'only_matching': True, @@ -125,138 +161,260 @@ class NYTimesIE(NYTimesBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_video_from_id(video_id) + return self._extract_video(video_id) class NYTimesArticleIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?' + _VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P<id>[^./?#]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', - 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', + 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { 'id': '100000003628438', - 'ext': 'mov', - 'title': 'New Minimum Wage: $70,000 a Year', - 'description': 'Dan Price, C.E.O. 
of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.', - 'timestamp': 1429033037, + 'ext': 'mp4', + 'title': 'One Company’s New Minimum Wage: $70,000 a Year', + 'description': 'md5:89ba9ab67ca767bb92bf823d1f138433', + 'timestamp': 1429047468, 'upload_date': '20150414', 'uploader': 'Matthew Williams', - } + 'creator': 'Patricia Cohen', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + 'duration': 119.0, + }, }, { - 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', - 'md5': 'e0d52040cafb07662acf3c9132db3575', + # article with audio and no video + 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', + 'md5': '2365b3555c8aa7f4dd34ca735ad02e6a', 'info_dict': { - 'id': '100000004709062', - 'title': 'The Run-Up: ‘He Was Like an Octopus’', + 'id': '100000009110381', 'ext': 'mp3', - 'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4', - 'series': 'The Run-Up', - 'episode': '‘He Was Like an Octopus’', - 'episode_number': 20, - 'duration': 2130, - } + 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', + 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', + 'timestamp': 1695960700, + 'upload_date': '20230929', + 'creator': 'Stephanie Nolen, Natalija Gormalova', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + 'duration': 1322, + }, }, { - 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', + 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', + 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { - 'id': '100000004709479', - 'title': 'The Rise of Hitler', - 'ext': 'mp3', - 'description': 'md5:bce877fd9e3444990cb141875fab0028', - 'creator': 'Pamela Paul', - 'duration': 3475, + 'id': '100000009202270', + 'ext': 'mp4', + 'title': 'Kamala Harris Defends Biden Policies, but Says ‘More Work’ Needed to Reach Voters', + 'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f', + 'timestamp': 1701290997, + 'upload_date': '20231129', + 'uploader': 'By The New York Times', + 'creator': 'Katie Rogers', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + 'duration': 97.631, }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { - 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', + # multiple videos in the same article + 'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html', + 'info_dict': { + 'id': 'air-traffic-controllers-safety', + 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', + 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', + 'upload_date': '20231202', + 'creator': 'Emily Steel, Sydney Ember', + 'timestamp': 1701511264, + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'only_matching': True, }] - def _extract_podcast_from_json(self, json, page_id, webpage): - podcast_audio = self._parse_json( - json, page_id, transform_source=js_to_json) + def _extract_content_from_block(self, block): + details = traverse_obj(block, { + 'id': ('sourceId', {str}), + 'uploader': ('bylines', ..., 'renderedRepresentation', {str}), + 'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), 
('length', {int_or_none}))), + 'timestamp': ('firstPublished', {parse_iso8601}), + 'series': ('podcastSeries', {str}), + }, get_all=False) - audio_data = podcast_audio['data'] - track = audio_data['track'] - - episode_title = track['title'] - video_url = track['source'] - - description = track.get('description') or self._html_search_meta( - ['og:description', 'twitter:description'], webpage) - - podcast_title = audio_data.get('podcast', {}).get('title') - title = ('%s: %s' % (podcast_title, episode_title) - if podcast_title else episode_title) - - episode = audio_data.get('podcast', {}).get('episode') or '' - episode_number = int_or_none(self._search_regex( - r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None)) + formats, subtitles = self._extract_formats_and_subtitles(details.get('id'), block) + # audio articles will have an url and no formats + url = traverse_obj(block, ('fileUrl', {url_or_none})) + if not formats and url: + formats.append({'url': url, 'vcodec': 'none'}) return { - 'id': remove_start(podcast_audio.get('target'), 'FT') or page_id, - 'url': video_url, - 'title': title, - 'description': description, - 'creator': track.get('credit'), - 'series': podcast_title, - 'episode': episode_title, - 'episode_number': episode_number, - 'duration': int_or_none(track.get('duration')), + **details, + 'thumbnails': self._extract_thumbnails(traverse_obj( + block, ('promotionalMedia', 'crops', ..., 'renditions', ...))), + 'formats': formats, + 'subtitles': subtitles, } def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + art_json = self._search_json( + r'window\.__preloadedData\s*=', webpage, 'media details', page_id, + transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article'] - video_id = self._search_regex( - r'data-videoid=["\'](\d+)', webpage, 'video id', - default=None, fatal=False) - if video_id is not None: - return self._extract_video_from_id(video_id) + blocks = traverse_obj(art_json, ( + 'sprinkledBody', 'content', ..., ('ledeMedia', None), + lambda _, v: v['__typename'] in ('Video', 'Audio'))) + if not blocks: + raise ExtractorError('Unable to extract any media blocks from webpage') - podcast_data = self._search_regex( - (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script', - r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), - webpage, 'podcast data') - return self._extract_podcast_from_json(podcast_data, page_id, webpage) + common_info = { + 'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'), + 'description': traverse_obj(art_json, ( + 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}), + get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})), + 'creator': ', '.join( + traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list) + 'thumbnails': self._extract_thumbnails(traverse_obj( + art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))), + } + + entries = [] + for block in blocks: + entries.append(merge_dicts(self._extract_content_from_block(block), common_info)) + + if len(entries) > 1: + return self.playlist_result(entries, page_id, **common_info) + + return { + 'id': page_id, + **entries[0], + } class NYTimesCookingIE(NYTimesBaseIE): - _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' + IE_NAME = 
'NYTimesCookingGuide' + _VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', - 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', 'info_dict': { - 'id': '100000004756089', - 'ext': 'mov', - 'timestamp': 1479383008, - 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', - 'title': 'Cranberry Tart', - 'upload_date': '20161117', - 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + 'id': '13-how-to-cook-a-turkey', + 'title': 'How to Cook a Turkey', + 'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0', + }, + 'playlist_count': 2, + }, { + # single video example + 'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese', + 'md5': '64415805fe0b8640fce6b0b9def5989a', + 'info_dict': { + 'id': '100000005835845', + 'ext': 'mp4', + 'title': 'How to Make Mac and Cheese', + 'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1', + 'timestamp': 1522950315, + 'upload_date': '20180405', + 'duration': 9.51, + 'creator': 'Alison Roman', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', }, }, { - 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', - 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake', + 'md5': '64415805fe0b8640fce6b0b9def5989a', 'info_dict': { - 'id': '100000003951728', - 'ext': 'mov', - 'timestamp': 1445509539, - 'description': 'Turkey guide', - 'upload_date': '20151022', - 'title': 'Turkey', - } + 'id': '20-how-to-frost-a-cake', + 'title': 'How to Frost a Cake', + 'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd', + }, + 'playlist_count': 8, }] def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + description = self._html_search_meta(['og:description', 'twitter:description'], webpage) - video_id = self._search_regex( - r'data-video-id=["\'](\d+)', webpage, 'video id') + lead_video_id = self._search_regex( + r'data-video-player-id="(\d+)"></div>', webpage, 'lead video') + media_ids = traverse_obj( + get_elements_html_by_class('video-item', webpage), (..., {extract_attributes}, 'data-video-id')) - return self._extract_video_from_id(video_id) + if media_ids: + media_ids.append(lead_video_id) + return self.playlist_result( + [self._extract_video(media_id) for media_id in media_ids], page_id, title, description) + + return { + **self._extract_video(lead_video_id), + 'title': title, + 'description': description, + 'creator': self._search_regex( # TODO: change to 'creators' + r'<span itemprop="author">([^<]+)</span></p>', webpage, 'author', default=None), + } + + +class NYTimesCookingRecipeIE(InfoExtractor): + _VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': '579e83bbe8e61e9de67f80edba8a78a8', + 'info_dict': { + 'id': '1017817', + 'ext': 'mp4', + 'title': 'Cranberry Curd Tart', + 'description': 'md5:ad77a3fc321db636256d4343c5742152', + 'timestamp': 1447804800, + 'upload_date': '20151118', + 'creator': 'David Tanis', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + }, + }, { + 'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies', + 'md5': 
'58df35998241dcf0620e99e646331b42', + 'info_dict': { + 'id': '1024781', + 'ext': 'mp4', + 'title': 'Neapolitan Checkerboard Cookies', + 'description': 'md5:ba12394c585ababea951cb6d2fcc6631', + 'timestamp': 1701302400, + 'upload_date': '20231130', + 'creator': 'Sue Li', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + }, + }, { + 'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats', + 'md5': '2fe7965a3adc899913b8e25ada360823', + 'info_dict': { + 'id': '1019516', + 'ext': 'mp4', + 'timestamp': 1546387200, + 'description': 'md5:8856ce10239161bd2596ac335b9f9bfb', + 'upload_date': '20190102', + 'title': 'Overnight Oats', + 'creator': 'Genevieve Ko', + 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', + }, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe'] + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls') + + return { + **traverse_obj(recipe_data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'description': ('topnote', {clean_html}), + 'timestamp': ('publishedAt', {int_or_none}), + 'creator': ('contentAttribution', 'cardByline', {str}), + }), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj( + recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))], + } diff --git a/yt_dlp/extractor/nzherald.py b/yt_dlp/extractor/nzherald.py index 062f9a875b..7c09e9ee81 100644 --- a/yt_dlp/extractor/nzherald.py +++ b/yt_dlp/extractor/nzherald.py @@ -2,11 +2,7 @@ from .brightcove import BrightcoveNewIE from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - traverse_obj -) +from ..utils import ExtractorError, traverse_obj class NZHeraldIE(InfoExtractor): @@ -27,7 +23,7 @@ class NZHeraldIE(InfoExtractor): 'tags': [], 'thumbnail': r're:https?://.*\.jpg$', 'description': 'md5:2f17713fcbfcfbe38bb9e7dfccbb0f2e', - } + }, }, { # Webpage has brightcove embed player url 'url': 'https://www.nzherald.co.nz/travel/pencarrow-coastal-trail/HDVTPJEPP46HJ2UEMK4EGD2DFI/', @@ -42,7 +38,7 @@ class NZHeraldIE(InfoExtractor): 'thumbnail': r're:https?://.*\.jpg$', 'tags': ['travel', 'video'], 'duration': 43.627, - } + }, }, { # two video embeds of the same video 'url': 'https://www.nzherald.co.nz/nz/truck-driver-captured-cutting-off-motorist-on-state-highway-1-in-canterbury/FIHNJB7PLLPHWQPK4S7ZBDUC4I/', @@ -53,7 +49,7 @@ class NZHeraldIE(InfoExtractor): 'timestamp': 1619730509, 'upload_date': '20210429', 'uploader_id': '1308227299001', - 'description': 'md5:4cae7dfb7613ac4c73b9e73a75c6b5d7' + 'description': 'md5:4cae7dfb7613ac4c73b9e73a75c6b5d7', }, 'skip': 'video removed', }, { @@ -70,17 +66,17 @@ class NZHeraldIE(InfoExtractor): 'tags': ['video', 'nz herald focus', 'politics', 'politics videos'], 'thumbnail': r're:https?://.*\.jpg$', 'duration': 99.584, - } + }, }, { 'url': 'https://www.nzherald.co.nz/kahu/kaupapa-companies-my-taiao-supporting-maori-in-study-and-business/PQBO2J25WCG77VGRX7W7BVYEAI/', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://nzherald.co.nz/the-country/video/focus-nzs-first-mass-covid-19-vaccination-event/N5I7IL3BRFLZSD33TLDLYJDGK4/', - 'only_matching': True + 'only_matching': True, }, { 'url': 
'https://www.nzherald.co.nz/the-vision-is-clear/news/tvic-damian-roper-planting-trees-an-addiction/AN2AAEPNRK5VLISDWQAJZB6ATQ', - 'only_matching': True - } + 'only_matching': True, + }, ] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1308227299001/S1BXZn8t_default/index.html?videoId=%s' @@ -89,7 +85,7 @@ def _extract_bc_embed_url(self, webpage): """The initial webpage may include the brightcove player embed url""" bc_url = BrightcoveNewIE._extract_url(self, webpage) return bc_url or self._search_regex( - r'(?:embedUrl)\"\s*:\s*\"(?P<embed_url>%s)' % BrightcoveNewIE._VALID_URL, + rf'(?:embedUrl)\"\s*:\s*\"(?P<embed_url>{BrightcoveNewIE._VALID_URL})', webpage, 'embed url', default=None, group='embed_url') def _real_extract(self, url): @@ -111,7 +107,7 @@ def _real_extract(self, url): bc_video_id = traverse_obj( video_metadata or fusion_metadata, # fusion metadata is the video metadata for video-only pages 'brightcoveId', ('content_elements', ..., 'referent', 'id'), - get_all=False, expected_type=compat_str) + get_all=False, expected_type=str) if not bc_video_id: if isinstance(video_metadata, dict) and len(video_metadata) == 0: diff --git a/yt_dlp/extractor/nzonscreen.py b/yt_dlp/extractor/nzonscreen.py index 6926bc5b2f..5fc516daf4 100644 --- a/yt_dlp/extractor/nzonscreen.py +++ b/yt_dlp/extractor/nzonscreen.py @@ -10,7 +10,7 @@ class NZOnScreenIE(InfoExtractor): - _VALID_URL = r'^https://www\.nzonscreen\.com/title/(?P<id>[^/?#]+)' + _VALID_URL = r'^https?://www\.nzonscreen\.com/title/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.nzonscreen.com/title/shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982', 'info_dict': { @@ -89,5 +89,5 @@ def _real_extract(self, url): 'http_headers': { 'Referer': 'https://www.nzonscreen.com/', 'Origin': 'https://www.nzonscreen.com/', - } + }, } diff --git a/yt_dlp/extractor/odatv.py b/yt_dlp/extractor/odatv.py deleted file mode 100644 index 24ab939421..0000000000 --- a/yt_dlp/extractor/odatv.py +++ /dev/null @@ -1,47 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - NO_DEFAULT, - remove_start -) - - -class OdaTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?odatv\.com/(?:mob|vid)_video\.php\?.*\bid=(?P<id>[^&]+)' - _TESTS = [{ - 'url': 'http://odatv.com/vid_video.php?id=8E388', - 'md5': 'dc61d052f205c9bf2da3545691485154', - 'info_dict': { - 'id': '8E388', - 'ext': 'mp4', - 'title': 'Artık Davutoğlu ile devam edemeyiz' - } - }, { - # mobile URL - 'url': 'http://odatv.com/mob_video.php?id=8E388', - 'only_matching': True, - }, { - # no video - 'url': 'http://odatv.com/mob_video.php?id=8E900', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - no_video = 'NO VIDEO!' 
in webpage - - video_url = self._search_regex( - r'mp4\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, 'video url', - default=None if no_video else NO_DEFAULT, group='url') - - if no_video: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - return { - 'id': video_id, - 'url': video_url, - 'title': remove_start(self._og_search_title(webpage), 'Video: '), - 'thumbnail': self._og_search_thumbnail(webpage), - } diff --git a/yt_dlp/extractor/odkmedia.py b/yt_dlp/extractor/odkmedia.py index 2960860d6c..766cb941bb 100644 --- a/yt_dlp/extractor/odkmedia.py +++ b/yt_dlp/extractor/odkmedia.py @@ -1,13 +1,13 @@ import json -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, float_or_none, traverse_obj, - try_call + try_call, ) @@ -24,7 +24,7 @@ class OnDemandChinaEpisodeIE(InfoExtractor): 'thumbnail': 'https://d2y2efdi5wgkcl.cloudfront.net/fit-in/256x256/media-io/2020/9/11/image.d9816e81.jpg', 'description': '疫情严峻,党政军民学、东西南北中协同应考', 'tags': ['Social Humanities', 'Documentary', 'Medical', 'Social'], - } + }, }] _QUERY = ''' @@ -74,8 +74,8 @@ def _real_extract(self, url): f'https://odkmedia.io/odc/api/v2/playback/{video_info["id"]}/', display_id, headers={'Authorization': '', 'service-name': 'odc'}) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError): - error_data = self._parse_json(e.cause.read(), display_id)['detail'] + if isinstance(e.cause, HTTPError): + error_data = self._parse_json(e.cause.response.read(), display_id)['detail'] raise GeoRestrictedError(error_data) formats, subtitles = [], {} @@ -101,5 +101,5 @@ def _real_extract(self, url): or self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)), 'formats': formats, 'subtitles': subtitles, - 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')) + 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')), } diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 4b73eed37e..d27d1c3f02 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -1,10 +1,8 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) +from ..compat import compat_etree_fromstring +from ..networking import HEADRequest from ..utils import ( ExtractorError, float_or_none, @@ -15,6 +13,7 @@ unescapeHTML, unified_strdate, unsmuggle_url, + url_or_none, urlencode_postdata, ) @@ -41,7 +40,7 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1545580896, 'view_count': int, - 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'title': 'Народная забава', 'uploader': 'Nevata', 'upload_date': '20181223', @@ -65,13 +64,14 @@ class OdnoklassnikiIE(InfoExtractor): 'title': str, 'uploader': str, }, + 'skip': 'vk extractor error', }, { - # metadata in JSON + # metadata in JSON, webm_dash with Firefox UA 'url': 'http://ok.ru/video/20079905452', - 'md5': '5d2b64756e2af296e3b383a0bc02a6aa', + 'md5': '8f477d8931c531374a3e36daec617b2c', 'info_dict': { 'id': '20079905452', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Культура меняет нас (прекрасный ролик!))', 'thumbnail': str, 'duration': 100, @@ -81,10 +81,14 
@@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + 'params': { + 'format': 'bv[ext=webm]', + 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'}, + }, }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': 'f8c951122516af72e6e6ffdd3c41103b', + 'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -98,10 +102,11 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': 0, 'start_time': 5, }, + 'params': {'skip_download': 'm3u8'}, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'https://ok.ru/video/3952212382174', - 'md5': '91749d0bd20763a28d083fa335bbd37a', + 'md5': '5fb5f83ce16cb212d6bf887282b5da53', 'info_dict': { 'id': '5axVgHHDBvU', 'ext': 'mp4', @@ -116,7 +121,7 @@ class OdnoklassnikiIE(InfoExtractor): 'live_status': 'not_live', 'view_count': int, 'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8', - 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94', + 'uploader_url': 'https://www.youtube.com/@MrKewlkid94', 'channel_follower_count': int, 'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'], 'channel_id': 'UCVGtvURtEURYHtJFUegdSug', @@ -145,7 +150,6 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { - # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading 'note': 'Only available in mobile webpage', 'url': 'https://m.ok.ru/video/2361249957145', 'info_dict': { @@ -153,8 +157,8 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Быковское крещение', 'duration': 3038.181, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', }, - 'skip': 'HTTP Error 400', }, { 'note': 'subtitles', 'url': 'https://ok.ru/video/4249587550747', @@ -226,6 +230,12 @@ class OdnoklassnikiIE(InfoExtractor): 'skip': 'Site no longer embeds', }] + def _clear_cookies(self, cdn_url): + # Direct http downloads will fail if CDN cookies are set + # so we need to reset them after each format extraction + self.cookiejar.clear(domain='.mycdn.me') + self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) + @classmethod def _extract_embed_urls(cls, url, webpage): for x in super()._extract_embed_urls(url, webpage): @@ -242,8 +252,8 @@ def _real_extract(self, url): raise e def _extract_desktop(self, url): - start_time = int_or_none(compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) + start_time = int_or_none(urllib.parse.parse_qs( + urllib.parse.urlparse(url).query).get('fromTime', [None])[0]) url, smuggled = unsmuggle_url(url, {}) video_id, is_embed = self._match_valid_url(url).group('id', 'embed') @@ -266,7 +276,7 @@ def _extract_desktop(self, url): player = self._parse_json( unescapeHTML(self._search_regex( - r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id, + rf'data-options=(?P<quote>["\'])(?P<player>{{.+?{video_id}.+?}})(?P=quote)', webpage, 'player', group='player')), video_id) @@ -285,7 +295,7 @@ def _extract_desktop(self, url): if st_location: data['st.location'] = st_location metadata = self._download_json( - compat_urllib_parse_unquote(flashvars['metadataUrl']), + urllib.parse.unquote(flashvars['metadataUrl']), video_id, 'Downloading metadata JSON', data=urlencode_postdata(data)) @@ -364,14 +374,22 @@ def _extract_desktop(self, url): formats = [{ 'url': 
f['url'], 'ext': 'mp4', - 'format_id': f['name'], - } for f in metadata['videos']] + 'format_id': f.get('name'), + } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))] - m3u8_url = metadata.get('hlsManifestUrl') + m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) + + for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]: + mpd_url = metadata.get(mpd_key) + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, video_id, mpd_id=mpd_id, fatal=False)) + self._clear_cookies(mpd_url) dash_manifest = metadata.get('metadataEmbedded') if dash_manifest: @@ -390,6 +408,7 @@ def _extract_desktop(self, url): if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) rtmp_url = metadata.get('rtmpUrl') if rtmp_url: formats.append({ @@ -410,7 +429,7 @@ def _extract_mobile(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://m.ok.ru/video/%s' % video_id, video_id, + f'http://m.ok.ru/video/{video_id}', video_id, note='Downloading mobile webpage') error = self._search_regex( @@ -423,6 +442,10 @@ def _extract_mobile(self, url): r'data-video="(.+?)"', webpage, 'json data') json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + redirect_url = self._request_webpage(HEADRequest( + json_data['videoSrc']), video_id, 'Requesting download URL').url + self._clear_cookies(redirect_url) + return { 'id': video_id, 'title': json_data.get('videoName'), @@ -430,7 +453,7 @@ def _extract_mobile(self, url): 'thumbnail': json_data.get('videoPosterSrc'), 'formats': [{ 'format_id': 'mobile', - 'url': json_data.get('videoSrc'), + 'url': redirect_url, 'ext': 'mp4', - }] + }], } diff --git a/yt_dlp/extractor/oftv.py b/yt_dlp/extractor/oftv.py index 3ae7278fb9..415694ceeb 100644 --- a/yt_dlp/extractor/oftv.py +++ b/yt_dlp/extractor/oftv.py @@ -4,7 +4,7 @@ class OfTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/video/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?of\.tv/video/(?P<id>\w+)' _TESTS = [{ 'url': 'https://of.tv/video/627d7d95b353db0001dadd1a', 'md5': 'cb9cd5db3bb9ee0d32bfd7e373d6ef0a', @@ -20,8 +20,8 @@ class OfTVIE(InfoExtractor): 'timestamp': 1652391300, 'upload_date': '20220512', 'view_count': 0, - 'creator': 'This is Fire' - } + 'creator': 'This is Fire', + }, }] def _real_extract(self, url): @@ -34,13 +34,13 @@ def _real_extract(self, url): class OfTVPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?of.tv/creators/(?P<id>[a-zA-Z0-9-]+)/.?' 
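+    # dot escaped and tail anchored: match only the creator's root page
+    # (optional trailing slash, then end of string, '?' or '#')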
+ _VALID_URL = r'https?://(?:www\.)?of\.tv/creators/(?P<id>[a-zA-Z0-9-]+)/?(?:$|[?#])' _TESTS = [{ 'url': 'https://of.tv/creators/this-is-fire/', 'playlist_count': 8, 'info_dict': { - 'id': 'this-is-fire' - } + 'id': 'this-is-fire', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/oktoberfesttv.py b/yt_dlp/extractor/oktoberfesttv.py index e0ac8563a7..b4bcdc7417 100644 --- a/yt_dlp/extractor/oktoberfesttv.py +++ b/yt_dlp/extractor/oktoberfesttv.py @@ -15,7 +15,7 @@ class OktoberfestTVIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py index 61d1f40486..bbf83e531a 100644 --- a/yt_dlp/extractor/olympics.py +++ b/yt_dlp/extractor/olympics.py @@ -1,12 +1,19 @@ from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, int_or_none, - try_get + parse_iso8601, + parse_qs, + try_get, + update_url, + url_or_none, ) +from ..utils.traversal import traverse_obj class OlympicsReplayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)' + _VALID_URL = r'https?://(?:www\.)?olympics\.com/[a-z]{2}/(?:paris-2024/)?(?:replay|videos?|original-series/episode)/(?P<id>[\w-]+)' _TESTS = [{ 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', 'info_dict': { @@ -14,26 +21,105 @@ class OlympicsReplayIE(InfoExtractor): 'ext': 'mp4', 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', 'upload_date': '20210801', - 'timestamp': 1627783200, + 'timestamp': 1627797600, 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', - 'uploader': 'International Olympic Committee', - }, - 'params': { - 'skip_download': True, + 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/nua4o7zwyaznoaejpbk2', + 'duration': 7017.0, }, }, { - 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', - 'only_matching': True, + 'url': 'https://olympics.com/en/original-series/episode/b-boys-and-b-girls-take-the-spotlight-breaking-life-road-to-paris-2024', + 'info_dict': { + 'id': '32633650-c5ee-4280-8b94-fb6defb6a9b5', + 'ext': 'mp4', + 'title': 'B-girl Nicka - Breaking Life, Road to Paris 2024 | Episode 1', + 'upload_date': '20240517', + 'timestamp': 1715948200, + 'description': 'md5:f63d728a41270ec628f6ac33ce471bb1', + 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/a3j96l7j6so3vyfijby1', + 'duration': 1321.0, + }, + }, { + 'url': 'https://olympics.com/en/paris-2024/videos/men-s-preliminaries-gbr-esp-ned-rsa-hockey-olympic-games-paris-2024', + 'info_dict': { + 'id': '3d96db23-8eee-4b7c-8ef5-488a0361026c', + 'ext': 'mp4', + 'title': 'Men\'s Preliminaries GBR-ESP & NED-RSA | Hockey | Olympic Games Paris 2024', + 'upload_date': '20240727', + 'timestamp': 1722066600, + }, + 'skip': 'Geo-restricted to RU, BR, BT, NP, TM, BD, TL', + }, { + 'url': 'https://olympics.com/en/paris-2024/videos/dnp-suni-lee-i-have-goals-and-i-have-expectations-for-myself-but-i-also-am-trying-to-give-myself-grace', + 'info_dict': { + 'id': 'a42f37ab-8a74-41d0-a7d9-af27b7b02a90', + 'ext': 'mp4', + 'title': 'md5:c7cfbc9918636a98e66400a812e4d407', + 'upload_date': '20240729', + 'timestamp': 1722288600, + }, }] + _GEO_BYPASS = False + + def _extract_from_nextjs_data(self, webpage, video_id): + data = 
traverse_obj(self._search_nextjs_data(webpage, video_id, default={}), ( + 'props', 'pageProps', 'page', 'items', + lambda _, v: v['name'] == 'videoPlaylist', 'data', 'currentVideo', {dict}, any)) + if not data: + return None + + geo_countries = traverse_obj(data, ('countries', ..., {str})) + if traverse_obj(data, ('geoRestrictedVideo', {bool})): + self.raise_geo_restricted(countries=geo_countries) + + is_live = traverse_obj(data, ('streamingStatus', {str})) == 'LIVE' + m3u8_url = traverse_obj(data, ('videoUrl', {url_or_none})) or data['streamUrl'] + tokenized_url = self._tokenize_url(m3u8_url, data['jwtToken'], is_live, video_id) + + try: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + tokenized_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and 'georestricted' in e.cause.msg: + self.raise_geo_restricted(countries=geo_countries) + raise + + return { + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + **traverse_obj(data, { + 'id': ('videoID', {str}), + 'title': ('title', {str}), + 'timestamp': ('contentDate', {parse_iso8601}), + }), + } + + def _tokenize_url(self, url, token, is_live, video_id): + return self._download_json( + 'https://metering.olympics.com/tokengenerator', video_id, + 'Downloading tokenized m3u8 url', query={ + **parse_qs(url), + 'url': update_url(url, query=None), + 'service-id': 'live' if is_live else 'vod', + 'user-auth': token, + })['data']['url'] + + def _legacy_tokenize_url(self, url, video_id): + return self._download_json( + 'https://olympics.com/tokenGenerator', video_id, + 'Downloading legacy tokenized m3u8 url', query={'url': url}) def _real_extract(self, url): - id = self._match_id(url) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if info := self._extract_from_nextjs_data(webpage, video_id): + return info - webpage = self._download_webpage(url, id) title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) - uuid = self._html_search_meta('episode_uid', webpage) + video_uuid = self._html_search_meta('episode_uid', webpage) m3u8_url = self._html_search_meta('video_url', webpage) - json_ld = self._search_json_ld(webpage, uuid) + json_ld = self._search_json_ld(webpage, video_uuid) thumbnails_list = json_ld.get('image') if not thumbnails_list: thumbnails_list = self._html_search_regex( @@ -49,17 +135,17 @@ def _real_extract(self, url): thumbnails.append({ 'url': thumbnail, 'width': width, - 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)) + 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)), }) - m3u8_url = self._download_json( - f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, 'mp4', m3u8_id='hls') + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + self._legacy_tokenize_url(m3u8_url, video_uuid), video_uuid, 'mp4', m3u8_id='hls') return { - 'id': uuid, + 'id': video_uuid, 'title': title, 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, - **json_ld + **json_ld, } diff --git a/yt_dlp/extractor/on24.py b/yt_dlp/extractor/on24.py index 9a4abc98d9..05218e9de1 100644 --- a/yt_dlp/extractor/on24.py +++ b/yt_dlp/extractor/on24.py @@ -27,7 +27,7 @@ class On24IE(InfoExtractor): 'upload_date': '20200219', 'timestamp': 1582149600.0, 'view_count': int, - } + }, }, { 'url': 
'https://event.on24.com/wcc/r/2639291/82829018E813065A122363877975752E?mode=login&email=johnsmith@gmail.com', 'only_matching': True, @@ -47,7 +47,7 @@ def _real_extract(self, url): 'eventId': event_id, 'displayProfile': 'player', 'key': event_key, - 'contentType': 'A' + 'contentType': 'A', }) event_id = str(try_get(event_data, lambda x: x['presentationLogInfo']['eventid'])) or event_id language = event_data.get('localelanguagecode') @@ -74,7 +74,7 @@ def _real_extract(self, url): 'language': language, 'ext': 'wav', 'vcodec': 'none', - 'acodec': 'wav' + 'acodec': 'wav', }) return { diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index dd7d1d7dea..591b4147eb 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -1,87 +1,169 @@ +import functools import re +import uuid from .common import InfoExtractor from ..utils import ( ExtractorError, - js_to_json, + OnDemandPagedList, + float_or_none, + int_or_none, + join_nonempty, + parse_age_limit, + parse_qs, + str_or_none, + unified_strdate, + url_or_none, ) +from ..utils.traversal import traverse_obj class OnDemandKoreaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?:en/)?player/vod/[a-z0-9-]+\?(?:[^#]+&)?contentId=(?P<id>\d+)' _GEO_COUNTRIES = ['US', 'CA'] + _TESTS = [{ - 'url': 'https://www.ondemandkorea.com/ask-us-anything-e351.html', + 'url': 'https://www.ondemandkorea.com/player/vod/ask-us-anything?contentId=686471', + 'md5': 'e2ff77255d989e3135bde0c5889fbce8', 'info_dict': { - 'id': 'ask-us-anything-e351', + 'id': '686471', 'ext': 'mp4', - 'title': 'Ask Us Anything : Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won - 09/24/2022', - 'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Ask Us Anything: Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'duration': 5486.955, + 'release_date': '20220924', + 'series': 'Ask Us Anything', + 'series_id': '11790', + 'episode_number': 351, + 'episode': 'Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won', }, - 'params': { - 'skip_download': 'm3u8 download' - } }, { - 'url': 'https://www.ondemandkorea.com/work-later-drink-now-e1.html', + 'url': 'https://www.ondemandkorea.com/player/vod/breakup-probation-a-week?contentId=1595796', + 'md5': '57266c720006962be7ff415b24775caa', 'info_dict': { - 'id': 'work-later-drink-now-e1', + 'id': '1595796', 'ext': 'mp4', - 'title': 'Work Later, Drink Now : E01', - 'description': 'Work Later, Drink First follows three women who find solace in a glass of liquor at the end of the day. 
So-hee, who gets comfort from a cup of soju af', - 'thumbnail': r're:^https?://.*\.png$', - 'subtitles': { - 'English': 'mincount:1', - }, + 'title': 'Breakup Probation, A Week: E08', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'duration': 1586.0, + 'release_date': '20231001', + 'series': 'Breakup Probation, A Week', + 'series_id': '22912', + 'episode_number': 8, + 'episode': 'E08', }, - 'params': { - 'skip_download': 'm3u8 download' - } + }, { + 'url': 'https://www.ondemandkorea.com/player/vod/the-outlaws?contentId=369531', + 'md5': 'fa5523b87aa1f6d74fc622a97f2b47cd', + 'info_dict': { + 'id': '369531', + 'ext': 'mp4', + 'release_date': '20220519', + 'duration': 7267.0, + 'title': 'The Outlaws: Main Movie', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'age_limit': 18, + }, + }, { + 'url': 'https://www.ondemandkorea.com/en/player/vod/capture-the-moment-how-is-that-possible?contentId=1605006', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, fatal=False) - if not webpage: - # Page sometimes returns captcha page with HTTP 403 - raise ExtractorError( - 'Unable to access page. You may have been blocked.', - expected=True) + data = self._download_json( + f'https://odkmedia.io/odx/api/v3/playback/{video_id}/', video_id, fatal=False, + headers={'service-name': 'odk'}, query={'did': str(uuid.uuid4())}, expected_status=(403, 404)) + if not traverse_obj(data, ('result', {dict})): + msg = traverse_obj(data, ('messages', '__default'), 'title', expected_type=str) + raise ExtractorError(msg or 'Got empty response from playback API', expected=True) - if 'msg_block_01.png' in webpage: - self.raise_geo_restricted( - msg='This content is not available in your region', - countries=self._GEO_COUNTRIES) + data = data['result'] - if 'This video is only available to ODK PLUS members.' 
in webpage: - raise ExtractorError( - 'This video is only available to ODK PLUS members.', - expected=True) + def try_geo_bypass(url): + return traverse_obj(url, ({parse_qs}, 'stream_url', 0, {url_or_none})) or url - if 'ODK PREMIUM Members Only' in webpage: - raise ExtractorError( - 'This video is only available to ODK PREMIUM members.', - expected=True) + formats = [] + for m3u8_url in traverse_obj(data, (('sources', 'manifest'), ..., 'url', {url_or_none}, {try_geo_bypass})): + mod_url = re.sub(r'_720(p?)\.m3u8', r'_1080\1.m3u8', m3u8_url) + if mod_url != m3u8_url: + mod_format = self._extract_m3u8_formats( + mod_url, video_id, note='Checking for higher quality format', + errnote='No higher quality format found', fatal=False) + if mod_format: + formats.extend(mod_format) + continue + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, fatal=False)) - title = self._search_regex( - r'class=["\']episode_title["\'][^>]*>([^<]+)', - webpage, 'episode_title', fatal=False) or self._og_search_title(webpage) + subtitles = {} + for track in traverse_obj(data, ('text_tracks', lambda _, v: url_or_none(v['url']))): + subtitles.setdefault(track.get('language', 'und'), []).append({ + 'url': track['url'], + 'ext': track.get('codec'), + 'name': track.get('label'), + }) - jw_config = self._parse_json( - self._search_regex(( - r'(?P<options>{\s*[\'"]tracks[\'"].*?})[)\];]+$', - r'playlist\s*=\s*\[(?P<options>.+)];?$', - r'odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', - ), webpage, 'jw config', flags=re.MULTILINE | re.DOTALL, group='options'), - video_id, transform_source=js_to_json) - info = self._parse_jwplayer_data( - jw_config, video_id, require_title=False, m3u8_id='hls', - base_url=url) + def if_series(key=None): + return lambda obj: obj[key] if key and obj['kind'] == 'series' else None - info.update({ - 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage) - }) - return info + return { + 'id': video_id, + 'title': join_nonempty( + ('episode', 'program', 'title'), + ('episode', 'title'), from_dict=data, delim=': '), + **traverse_obj(data, { + 'thumbnail': ('episode', 'images', 'thumbnail', {url_or_none}), + 'release_date': ('episode', 'release_date', {lambda x: x.replace('-', '')}, {unified_strdate}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'age_limit': ('age_rating', 'name', {lambda x: x.replace('R', '')}, {parse_age_limit}), + 'series': ('episode', {if_series(key='program')}, 'title'), + 'series_id': ('episode', {if_series(key='program')}, 'id', {str_or_none}), + 'episode': ('episode', {if_series(key='title')}), + 'episode_number': ('episode', {if_series(key='number')}, {int_or_none}), + }, get_all=False), + 'formats': formats, + 'subtitles': subtitles, + } + + +class OnDemandKoreaProgramIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?:en/)?player/vod/(?P<id>[a-z0-9-]+)(?:$|#)' + _GEO_COUNTRIES = ['US', 'CA'] + + _TESTS = [{ + 'url': 'https://www.ondemandkorea.com/player/vod/uskn-news', + 'info_dict': { + 'id': 'uskn-news', + }, + 'playlist_mincount': 755, + }, { + 'url': 'https://www.ondemandkorea.com/en/player/vod/the-land', + 'info_dict': { + 'id': 'the-land', + }, + 'playlist_count': 52, + }] + + _PAGE_SIZE = 100 + + def _fetch_page(self, display_id, page): + page += 1 + page_data = self._download_json( + f'https://odkmedia.io/odx/api/v3/program/{display_id}/episodes/', display_id, + headers={'service-name': 'odk'}, query={ + 'page': page, + 'page_size': 
self._PAGE_SIZE, + }, note=f'Downloading page {page}', expected_status=404) + for episode in traverse_obj(page_data, ('result', 'results', ...)): + yield self.url_result( + f'https://www.ondemandkorea.com/player/vod/{display_id}?contentId={episode["id"]}', + ie=OnDemandKoreaIE, video_title=episode.get('title')) + + def _real_extract(self, url): + display_id = self._match_id(url) + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, display_id), self._PAGE_SIZE) + + return self.playlist_result(entries, display_id) diff --git a/yt_dlp/extractor/onefootball.py b/yt_dlp/extractor/onefootball.py index 591d15732d..ee432e8eda 100644 --- a/yt_dlp/extractor/onefootball.py +++ b/yt_dlp/extractor/onefootball.py @@ -1,4 +1,6 @@ from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from ..utils import make_archive_id class OneFootballIE(InfoExtractor): @@ -7,41 +9,43 @@ class OneFootballIE(InfoExtractor): _TESTS = [{ 'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334', 'info_dict': { - 'id': '34012334', + 'id': 'Y2VtcWAT', 'ext': 'mp4', 'title': 'Highlights: FC Zürich 3-3 FC Basel', 'description': 'md5:33d9855cb790702c4fe42a513700aba8', - 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334', - 'timestamp': 1635874604, - 'upload_date': '20211102' + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/Y2VtcWAT/poster.jpg?width=720', + 'timestamp': 1635874895, + 'upload_date': '20211102', + 'duration': 375.0, + 'tags': ['Football', 'Soccer', 'OneFootball'], + '_old_archive_ids': ['onefootball 34012334'], }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020', 'info_dict': { - 'id': '34041020', + 'id': 'leVJrMho', 'ext': 'mp4', 'title': 'Klopp fumes at VAR decisions in West Ham defeat', 'description': 'md5:9c50371095a01ad3f63311c73d8f51a5', - 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020', - 'timestamp': 1636314103, - 'upload_date': '20211107' + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/leVJrMho/poster.jpg?width=720', + 'timestamp': 1636315232, + 'upload_date': '20211107', + 'duration': 93.0, + 'tags': ['Football', 'Soccer', 'OneFootball'], + '_old_archive_ids': ['onefootball 34041020'], }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._search_json_ld(webpage, id) - m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) - return { - 'id': id, - 'title': data_json.get('title'), - 'description': data_json.get('description'), - 'thumbnail': data_json.get('thumbnail'), - 'timestamp': data_json.get('timestamp'), - 'formats': formats, - 'subtitles': subtitles, - } + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data_json = self._search_json_ld(webpage, video_id, fatal=False) + data_json.pop('url', None) + m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/\w+\.m3u8)', webpage, 'm3u8_url') + + return self.url_result( + 
m3u8_url, JWPlatformIE, video_id, _old_archive_ids=[make_archive_id(self, video_id)], + **data_json, url_transparent=True) diff --git a/yt_dlp/extractor/onenewsnz.py b/yt_dlp/extractor/onenewsnz.py index a46211e777..c849da0813 100644 --- a/yt_dlp/extractor/onenewsnz.py +++ b/yt_dlp/extractor/onenewsnz.py @@ -1,10 +1,6 @@ from .brightcove import BrightcoveNewIE from .common import InfoExtractor - -from ..utils import ( - ExtractorError, - traverse_obj -) +from ..utils import ExtractorError, traverse_obj class OneNewsNZIE(InfoExtractor): @@ -30,8 +26,8 @@ class OneNewsNZIE(InfoExtractor): 'duration': 38.272, 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'Greenpeace accused the Government of "greenwashing" instead of taking climate action.', - } - }] + }, + }], }, { # YouTube video 'url': 'https://www.1news.co.nz/2022/09/30/now-is-the-time-to-care-about-womens-rugby/', @@ -63,8 +59,8 @@ class OneNewsNZIE(InfoExtractor): 'availability': 'public', 'playable_in_embed': True, 'live_status': 'not_live', - } - }] + }, + }], }, { # 2 Brightcove videos 'url': 'https://www.1news.co.nz/2022/09/29/raw-videos-capture-hurricane-ians-fury-as-it-slams-florida/', @@ -93,7 +89,7 @@ def _real_extract(self, url): brightcove_config = traverse_obj(item, ('embed', 'config')) brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % ( traverse_obj(brightcove_config, 'brightcoveAccount') or '963482464001', - traverse_obj(brightcove_config, 'brightcoveVideoId') + traverse_obj(brightcove_config, 'brightcoveVideoId'), ) entries.append(self.url_result(brightcove_url, BrightcoveNewIE)) elif item_type == 'youtube': diff --git a/yt_dlp/extractor/oneplace.py b/yt_dlp/extractor/oneplace.py index 86337ad0ad..12e6ef64c1 100644 --- a/yt_dlp/extractor/oneplace.py +++ b/yt_dlp/extractor/oneplace.py @@ -10,7 +10,7 @@ class OnePlacePodcastIE(InfoExtractor): 'ext': 'mp3', 'title': 'Living in the Last Days Part 2 | A Daily Walk with John Randall', 'description': 'md5:fbb8f1cf21447ac54ecaa2887fc20c6e', - } + }, }, { 'url': 'https://www.oneplace.com/ministries/ankerberg-show/listen/ep-3-relying-on-the-constant-companionship-of-the-holy-spirit-part-2-922513.html', 'info_dict': { @@ -18,7 +18,7 @@ class OnePlacePodcastIE(InfoExtractor): 'ext': 'mp3', 'description': 'md5:8b810b4349aa40a5d033b4536fe428e1', 'title': 'md5:ce10f7d8d5ddcf485ed8905ef109659d', - } + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/onet.py b/yt_dlp/extractor/onet.py index 0d59e8cb44..05e4d692ad 100644 --- a/yt_dlp/extractor/onet.py +++ b/yt_dlp/extractor/onet.py @@ -2,13 +2,13 @@ from .common import InfoExtractor from ..utils import ( - determine_ext, + NO_DEFAULT, ExtractorError, + determine_ext, float_or_none, get_element_by_class, int_or_none, js_to_json, - NO_DEFAULT, parse_iso8601, remove_start, strip_or_none, @@ -39,7 +39,7 @@ def _extract_from_id(self, video_id, webpage=None): error = response.get('error') if error: raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + '{} said: {}'.format(self.IE_NAME, error['message']), expected=True) video = response['result'].get('0') @@ -182,7 +182,7 @@ def _real_extract(self, url): return self._extract_from_id(video_id, webpage) matches = re.findall( - r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE, + rf'<a[^>]+href=[\'"]({self._URL_BASE_RE}[a-z]+/[0-9a-z-]+/[0-9a-z]+)', webpage) entries = [ self.url_result(video_link, OnetIE.ie_key()) @@ -256,4 +256,4 @@ def _real_extract(self, url): mvp_id = self._search_mvp_id(webpage) return 
self.url_result( - 'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) + f'onetmvp:{mvp_id}', OnetMVPIE.ie_key(), video_id=mvp_id) diff --git a/yt_dlp/extractor/onionstudios.py b/yt_dlp/extractor/onionstudios.py index 5fa49e1423..7e30b2d33d 100644 --- a/yt_dlp/extractor/onionstudios.py +++ b/yt_dlp/extractor/onionstudios.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import js_to_json @@ -34,7 +33,7 @@ def _real_extract(self, url): webpage = self._download_webpage( 'http://onionstudios.com/embed/dc94dc2899fe644c0e7241fa04c1b732.js', video_id) - mcp_id = compat_str(self._parse_json(self._search_regex( + mcp_id = str(self._parse_json(self._search_regex( r'window\.mcpMapping\s*=\s*({.+?});', webpage, 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id']) return self.url_result( diff --git a/yt_dlp/extractor/ooyala.py b/yt_dlp/extractor/ooyala.py deleted file mode 100644 index 65afccdb1c..0000000000 --- a/yt_dlp/extractor/ooyala.py +++ /dev/null @@ -1,230 +0,0 @@ -import base64 -import re - -from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_str, -) -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - smuggle_url, - try_get, - unsmuggle_url, -) - - -class OoyalaBaseIE(InfoExtractor): - _PLAYER_BASE = 'http://player.ooyala.com/' - _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' - _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s' - - def _extract(self, content_tree_url, video_id, domain=None, supportedformats=None, embed_token=None): - content_tree = self._download_json(content_tree_url, video_id)['content_tree'] - metadata = content_tree[list(content_tree)[0]] - embed_code = metadata['embed_code'] - pcode = metadata.get('asset_pcode') or embed_code - title = metadata['title'] - - auth_data = self._download_json( - self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code), - video_id, headers=self.geo_verification_headers(), query={ - 'domain': domain or 'player.ooyala.com', - 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth', - 'embedToken': embed_token, - })['authorization_data'][embed_code] - - urls = [] - formats = [] - streams = auth_data.get('streams') or [{ - 'delivery_type': 'hls', - 'url': { - 'data': base64.b64encode(('http://player.ooyala.com/hls/player/all/%s.m3u8' % embed_code).encode()).decode(), - } - }] - for stream in streams: - url_data = try_get(stream, lambda x: x['url']['data'], compat_str) - if not url_data: - continue - s_url = compat_b64decode(url_data).decode('utf-8') - if not s_url or s_url in urls: - continue - urls.append(s_url) - ext = determine_ext(s_url, None) - delivery_type = stream.get('delivery_type') - if delivery_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif delivery_type == 'hds' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) - elif delivery_type == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - s_url, embed_code, mpd_id='dash', fatal=False)) - elif delivery_type == 'smooth': - self._extract_ism_formats( - s_url, embed_code, ism_id='mss', fatal=False) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - s_url, embed_code, fatal=False)) - else: - formats.append({ - 'url': s_url, - 'ext': ext or 
delivery_type, - 'vcodec': stream.get('video_codec'), - 'format_id': delivery_type, - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - 'fps': float_or_none(stream.get('framerate')), - }) - if not formats and not auth_data.get('authorized'): - self.raise_no_formats('%s said: %s' % ( - self.IE_NAME, auth_data['message']), expected=True) - - subtitles = {} - for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items(): - sub_url = sub.get('url') - if not sub_url: - continue - subtitles[lang] = [{ - 'url': sub_url, - }] - - return { - 'id': embed_code, - 'title': title, - 'description': metadata.get('description'), - 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), - 'duration': float_or_none(metadata.get('duration'), 1000), - 'subtitles': subtitles, - 'formats': formats, - } - - -class OoyalaIE(OoyalaBaseIE): - _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)' - - _TESTS = [ - { - # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video - 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'info_dict': { - 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8', - 'ext': 'mp4', - 'title': 'Explaining Data Recovery from Hard Drives and SSDs', - 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', - 'duration': 853.386, - }, - # The video in the original webpage now uses PlayWire - 'skip': 'Ooyala said: movie expired', - }, { - # Only available for ipad - 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', - 'info_dict': { - 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', - 'ext': 'mp4', - 'title': 'Simulation Overview - Levels of Simulation', - 'duration': 194.948, - }, - }, - { - # Information available only through SAS api - # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 - 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', - 'md5': 'a84001441b35ea492bc03736e59e7935', - 'info_dict': { - 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', - 'ext': 'mp4', - 'title': 'Divide Tool Path.mp4', - 'duration': 204.405, - } - }, - { - # empty stream['url']['data'] - 'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', - 'only_matching': True, - } - ] - - def _extract_from_webpage(self, url, webpage): - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - yield self._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - return - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - 
if mobj is not None: - for v in self._parse_json(mobj.group(1), self._generic_id(url), fatal=False) or []: - yield self._build_url_result(smuggle_url(v['provider_video_id'], {'domain': url})) - - @staticmethod - def _url_for_embed_code(embed_code): - return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - - @classmethod - def _build_url_result(cls, embed_code): - return cls.url_result(cls._url_for_embed_code(embed_code), - ie=cls.ie_key()) - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - embed_code = self._match_id(url) - domain = smuggled_data.get('domain') - supportedformats = smuggled_data.get('supportedformats') - embed_token = smuggled_data.get('embed_token') - content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code) - return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token) - - -class OoyalaExternalIE(OoyalaBaseIE): - _VALID_URL = r'''(?x) - (?: - ooyalaexternal:| - https?://.+?\.ooyala\.com/.*?\bexternalId= - ) - (?P<partner_id>[^:]+) - : - (?P<id>.+) - (?: - :| - .*?&pcode= - ) - (?P<pcode>.+?) - (?:&|$) - ''' - - _TEST = { - 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always', - 'info_dict': { - 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', - 'ext': 'mp4', - 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'duration': 1302.0, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - partner_id, video_id, pcode = self._match_valid_url(url).groups() - content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id) - return self._extract(content_tree_url, video_id) diff --git a/yt_dlp/extractor/opencast.py b/yt_dlp/extractor/opencast.py index fa46757f7b..a4b0a1989d 100644 --- a/yt_dlp/extractor/opencast.py +++ b/yt_dlp/extractor/opencast.py @@ -2,8 +2,8 @@ from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, parse_iso8601, traverse_obj, @@ -55,9 +55,9 @@ def _parse_mediapackage(self, video): transport = track.get('transport') if transport == 'DASH' or ext == 'mpd': - formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_mpd_formats(href, video_id, mpd_id='dash', fatal=False)) elif transport == 'HLS' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats_and_subtitles( + formats.extend(self._extract_m3u8_formats( href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False)) elif transport == 'HDS' or ext == 'f4m': formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False)) @@ -105,10 +105,9 @@ def _parse_mediapackage(self, video): class OpencastIE(OpencastBaseIE): - _VALID_URL = r'''(?x) - https?://(?P<host>%s)/paella/ui/watch.html\?.*? - id=(?P<id>%s) - ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + _VALID_URL = rf'''(?x) + https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella/ui/watch\.html\? 
+ (?:[^#]+&)?id=(?P<id>{OpencastBaseIE._UUID_RE})''' _API_BASE = 'https://%s/search/episode.json?id=%s' @@ -123,8 +122,11 @@ class OpencastIE(OpencastBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1606208400, 'upload_date': '20201124', + 'season_id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'series': 'Kryptographie - WiSe 15/16', + 'creator': 'Alexander May', }, - } + }, ] def _real_extract(self, url): @@ -134,10 +136,11 @@ def _real_extract(self, url): class OpencastPlaylistIE(OpencastBaseIE): - _VALID_URL = r'''(?x) - https?://(?P<host>%s)/engage/ui/index.html\?.*? - epFrom=(?P<id>%s) - ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + _VALID_URL = rf'''(?x) + https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})(?: + /engage/ui/index\.html\?(?:[^#]+&)?epFrom=| + /ltitools/index\.html\?(?:[^#]+&)?series= + )(?P<id>{OpencastBaseIE._UUID_RE})''' _API_BASE = 'https://%s/search/episode.json?sid=%s' @@ -148,15 +151,23 @@ class OpencastPlaylistIE(OpencastBaseIE): 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', 'title': 'Kryptographie - WiSe 15/16', }, - 'playlist_mincount': 28, + 'playlist_mincount': 29, }, { - 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a', + 'url': 'https://oc-video1.ruhr-uni-bochum.de/ltitools/index.html?subtool=series&series=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0&lng=de', 'info_dict': { - 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a', - 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective', + 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'title': 'Kryptographie - WiSe 15/16', }, - 'playlist_mincount': 6, + 'playlist_mincount': 29, + }, + { + 'url': 'https://electures.uni-muenster.de/engage/ui/index.html?e=1&p=1&epFrom=39391d10-a711-4d23-b21d-afd2ed7d758c', + 'info_dict': { + 'id': '39391d10-a711-4d23-b21d-afd2ed7d758c', + 'title': '021670 Theologische Themen bei Hans Blumenberg WiSe 2017/18', + }, + 'playlist_mincount': 13, }, ] diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index 56b8330ff8..2d56252b16 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -4,8 +4,8 @@ import os import subprocess import tempfile +import urllib.parse -from ..compat import compat_urlparse from ..utils import ( ExtractorError, Popen, @@ -121,7 +121,7 @@ def __init__(self, extractor, required_version=None, timeout=10000): if is_outdated_version(version, required_version): self.extractor._downloader.report_warning( 'Your copy of PhantomJS is outdated, update it to version ' - '%s or newer if you encounter any errors.' 
% required_version) + f'{required_version} or newer if you encounter any errors.') for name in self._TMP_FILE_NAMES: tmp = tempfile.NamedTemporaryFile(delete=False) @@ -146,9 +146,9 @@ def _save_cookies(self, url): if 'path' not in cookie: cookie['path'] = '/' if 'domain' not in cookie: - cookie['domain'] = compat_urlparse.urlparse(url).netloc + cookie['domain'] = urllib.parse.urlparse(url).netloc with open(self._TMP_FILES['cookies'].name, 'wb') as f: - f.write(json.dumps(cookies).encode('utf-8')) + f.write(json.dumps(cookies).encode()) def _load_cookies(self): with open(self._TMP_FILES['cookies'].name, 'rb') as f: @@ -201,7 +201,7 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w if not html: html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) with open(self._TMP_FILES['html'].name, 'wb') as f: - f.write(html.encode('utf-8')) + f.write(html.encode()) self._save_cookies(url) diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index 86dc9bb898..b4f1c7d858 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -8,10 +8,11 @@ unified_strdate, unified_timestamp, ) -from ..compat import compat_str class OpenRecBaseIE(InfoExtractor): + _M3U8_HEADERS = {'Referer': 'https://www.openrec.tv/'} + def _extract_pagestore(self, webpage, video_id): return self._parse_json( self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) @@ -21,7 +22,7 @@ def _expand_media(self, video_id, media): if not m3u8_url: continue yield from self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id=name) + m3u8_url, video_id, ext='mp4', m3u8_id=name, headers=self._M3U8_HEADERS) def _extract_movie(self, webpage, video_id, name, is_live): window_stores = self._extract_pagestore(webpage, video_id) @@ -60,6 +61,7 @@ def _extract_movie(self, webpage, video_id, name, is_live): 'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')), 'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')), 'is_live': is_live, + 'http_headers': self._M3U8_HEADERS, } @@ -110,17 +112,18 @@ def _real_extract(self, url): raise ExtractorError('Cannot extract title') formats = self._extract_m3u8_formats( - capture_data.get('source'), video_id, ext='mp4') + capture_data.get('source'), video_id, ext='mp4', headers=self._M3U8_HEADERS) return { 'id': video_id, 'title': capture_data.get('title'), 'thumbnail': capture_data.get('thumbnailUrl'), 'formats': formats, - 'timestamp': unified_timestamp(traverse_obj(movie_store, 'createdAt', expected_type=compat_str)), - 'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str), - 'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str), + 'timestamp': unified_timestamp(traverse_obj(movie_store, 'createdAt', expected_type=str)), + 'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=str), + 'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=str), 'upload_date': unified_strdate(capture_data.get('createdAt')), + 'http_headers': self._M3U8_HEADERS, } diff --git a/yt_dlp/extractor/ora.py b/yt_dlp/extractor/ora.py index d49909d528..c6ba4b0d3e 100644 --- a/yt_dlp/extractor/ora.py +++ b/yt_dlp/extractor/ora.py @@ -1,6 +1,7 @@ import re +import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( 
get_element_by_attribute, qualities, @@ -18,7 +19,7 @@ class OraTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!', 'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1', - } + }, }, { 'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d', 'only_matching': True, @@ -37,14 +38,14 @@ def _real_extract(self, url): m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) # similar to GameSpotIE - m3u8_path = compat_urlparse.urlparse(m3u8_url).path + m3u8_path = urllib.parse.urlparse(m3u8_url).path QUALITIES_RE = r'((,[a-z]+\d+)+,?)' available_qualities = self._search_regex( QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',') http_path = m3u8_path[1:].split('/', 1)[1] http_template = re.sub(QUALITIES_RE, r'%s', http_path) http_template = http_template.replace('.csmil/master.m3u8', '') - http_template = compat_urlparse.urljoin( + http_template = urllib.parse.urljoin( 'http://videocdn-pmd.ora.tv/', http_template) preference = qualities( ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080']) diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index e9d23a4d12..9c37a54d62 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -1,3 +1,4 @@ +import base64 import functools import re @@ -6,196 +7,20 @@ clean_html, determine_ext, float_or_none, - HEADRequest, - InAdvancePagedList, int_or_none, - join_nonempty, - orderedSet, - remove_end, make_archive_id, - smuggle_url, + mimetype2ext, + orderedSet, + parse_age_limit, + parse_iso8601, + remove_end, + str_or_none, strip_jsonp, try_call, - unescapeHTML, unified_strdate, - unsmuggle_url, url_or_none, ) - - -class ORFTVthekIE(InfoExtractor): - IE_NAME = 'orf:tvthek' - IE_DESC = 'ORF TVthek' - _VALID_URL = r'(?P<url>https?://tvthek\.orf\.at/(?:(?:[^/]+/){2}){1,2}(?P<id>\d+))(/[^/]+/(?P<vid>\d+))?(?:$|[?#])' - - _TESTS = [{ - 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079', - 'info_dict': { - 'id': '14121079', - }, - 'playlist_count': 11, - 'params': {'noplaylist': True} - }, { - 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150', - 'info_dict': { - 'id': '14121079', - }, - 'playlist_count': 1, - 'params': {'playlist_items': '5'} - }, { - 'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150', - 'info_dict': { - 'id': '14121079', - 'playlist_count': 1 - }, - 'playlist': [{ - 'info_dict': { - 'id': '15083150', - 'ext': 'mp4', - 'description': 'md5:7be1c485425f5f255a5e4e4815e77d04', - 'thumbnail': 'https://api-tvthek.orf.at/uploads/media/segments/0130/59/824271ea35cd8931a0fb08ab316a5b0a1562342c.jpeg', - 'title': 'Umfrage: Welches Tier ist Sebastian Kurz?', - } - }], - 'playlist_count': 1, - 'params': {'noplaylist': True, 'skip_download': 'm3u8'} - }, { - 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389', - 'playlist': [{ - 'md5': '2942210346ed779588f428a92db88712', - 'info_dict': { - 'id': '8896777', - 'ext': 'mp4', - 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde', - 'description': 'md5:c1272f0245537812d4e36419c207b67d', - 'duration': 2668, - 'upload_date': '20141208', - }, - }], - 'skip': 'Blocked outside of Austria / Germany', - }, { - 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256', - 
'info_dict': { - 'id': '7982259', - 'ext': 'mp4', - 'title': 'Best of Ingrid Thurnher', - 'upload_date': '20140527', - 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".', - }, - 'params': { - 'skip_download': True, # rtsp downloads - }, - 'skip': 'Blocked outside of Austria / Germany', - }, { - 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141', - 'only_matching': True, - }, { - 'url': 'http://tvthek.orf.at/profile/Universum/35429', - 'only_matching': True, - }] - - def _pagefunc(self, url, data_jsb, n, *, image=None): - sd = data_jsb[n] - video_id, title = str(sd['id']), sd['title'] - formats = [] - for fd in sd['sources']: - src = url_or_none(fd.get('src')) - if not src: - continue - format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd) - ext = determine_ext(src) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id=format_id, fatal=False, note=f'Downloading {format_id} m3u8 manifest') - if any('/geoprotection' in f['url'] for f in m3u8_formats): - self.raise_geo_restricted() - formats.extend(m3u8_formats) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id=format_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id=format_id, fatal=False, note=f'Downloading {format_id} mpd manifest')) - else: - formats.append({ - 'format_id': format_id, - 'url': src, - 'protocol': fd.get('protocol'), - }) - - # Check for geoblocking. - # There is a property is_geoprotection, but that's always false - geo_str = sd.get('geoprotection_string') - http_url = next( - (f['url'] for f in formats if re.match(r'^https?://.*\.mp4$', f['url'])), - None) if geo_str else None - if http_url: - self._request_webpage( - HEADRequest(http_url), video_id, fatal=False, note='Testing for geoblocking', - errnote=f'This video seems to be blocked outside of {geo_str}. 
You may want to try the streaming-* formats') - - subtitles = {} - for sub in sd.get('subtitles', []): - sub_src = sub.get('src') - if not sub_src: - continue - subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({ - 'url': sub_src, - }) - - upload_date = unified_strdate(sd.get('created_date')) - - thumbnails = [] - preview = sd.get('preview_image_url') - if preview: - thumbnails.append({ - 'id': 'preview', - 'url': preview, - 'preference': 0, - }) - image = sd.get('image_full_url') or image - if image: - thumbnails.append({ - 'id': 'full', - 'url': image, - 'preference': 1, - }) - - yield { - 'id': video_id, - 'title': title, - 'webpage_url': smuggle_url(f'{url}/part/{video_id}', {'force_noplaylist': True}), - 'formats': formats, - 'subtitles': subtitles, - 'description': sd.get('description'), - 'duration': int_or_none(sd.get('duration_in_seconds')), - 'upload_date': upload_date, - 'thumbnails': thumbnails, - } - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url) - playlist_id, video_id, base_url = self._match_valid_url(url).group('id', 'vid', 'url') - webpage = self._download_webpage(url, playlist_id) - - data_jsb = self._parse_json( - self._search_regex( - r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2', - webpage, 'playlist', group='json'), - playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - - if not self._yes_playlist(playlist_id, video_id, smuggled_data): - data_jsb = [sd for sd in data_jsb if str(sd.get('id')) == video_id] - - playlist_count = len(data_jsb) - image = self._og_search_thumbnail(webpage) if playlist_count == 1 else None - - page_func = functools.partial(self._pagefunc, base_url, data_jsb, image=image) - return { - '_type': 'playlist', - 'entries': InAdvancePagedList(page_func, playlist_count, 1), - 'id': playlist_id, - } +from ..utils.traversal import traverse_obj class ORFRadioIE(InfoExtractor): @@ -240,8 +65,8 @@ class ORFRadioIE(InfoExtractor): 'duration': 18000, 'timestamp': 1659322789, 'description': 'md5:a3f6083399ef92b8cbe2d421b180835a', - } - }] + }, + }], }, { 'url': 'https://ooe.orf.at/player/20220801/OGMO', 'info_dict': { @@ -259,8 +84,8 @@ class ORFRadioIE(InfoExtractor): 'duration': 18000, 'timestamp': 1659322789, 'description': 'md5:a3f6083399ef92b8cbe2d421b180835a', - } - }] + }, + }], }, { 'url': 'http://fm4.orf.at/player/20170107/4CC', 'only_matching': True, @@ -302,7 +127,7 @@ class ORFRadioIE(InfoExtractor): 'timestamp': 1483858796, 'upload_date': '20170108', }, - 'skip': 'Shows from ORF radios are only available for 7 days.' 
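# --- Editor's note, not part of the patch: the rewritten ORF extractors below
# lean heavily on yt-dlp's traverse_obj with `{callable}` transform steps. A
# minimal sketch of that idiom against a made-up payload (the field names and
# values here are illustrative, not the real API response): ---
from yt_dlp.utils import float_or_none, mimetype2ext
from yt_dlp.utils.traversal import traverse_obj

payload = {
    'enclosures': [{'url': 'https://example.com/a.mp3', 'type': 'audio/mpeg'}],
    'duration': 3396000,  # milliseconds
}
info = traverse_obj(payload, {
    'url': ('enclosures', 0, 'url'),
    'ext': ('enclosures', 0, 'type', {mimetype2ext}),  # 'audio/mpeg' -> 'mp3'
    'duration': ('duration', {lambda x: float_or_none(x, scale=1000)}),  # ms -> s
})
assert info == {'url': 'https://example.com/a.mp3', 'ext': 'mp3', 'duration': 3396.0}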
+ 'skip': 'Shows from ORF radios are only available for 7 days.', }] def _entries(self, data, station): @@ -334,6 +159,45 @@ def _real_extract(self, url): self._entries(data, station or station2), show_id, data.get('title'), clean_html(data.get('subtitle'))) +class ORFPodcastIE(InfoExtractor): + IE_NAME = 'orf:podcast' + _STATION_RE = '|'.join(map(re.escape, ( + 'bgl', 'fm4', 'ktn', 'noe', 'oe1', 'oe3', + 'ooe', 'sbg', 'stm', 'tir', 'tv', 'vbg', 'wie'))) + _VALID_URL = rf'https?://sound\.orf\.at/podcast/(?P<station>{_STATION_RE})/(?P<show>[\w-]+)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://sound.orf.at/podcast/oe3/fruehstueck-bei-mir/nicolas-stockhammer-15102023', + 'md5': '526a5700e03d271a1505386a8721ab9b', + 'info_dict': { + 'id': 'nicolas-stockhammer-15102023', + 'ext': 'mp3', + 'title': 'Nicolas Stockhammer (15.10.2023)', + 'duration': 3396.0, + 'series': 'Frühstück bei mir', + }, + 'skip': 'ORF podcasts are only available for a limited time', + }] + + def _real_extract(self, url): + station, show, show_id = self._match_valid_url(url).group('station', 'show', 'id') + data = self._download_json( + f'https://audioapi.orf.at/radiothek/api/2.0/podcast/{station}/{show}/{show_id}', show_id) + + return { + 'id': show_id, + 'ext': 'mp3', + 'vcodec': 'none', + **traverse_obj(data, ('payload', { + 'url': ('enclosures', 0, 'url'), + 'ext': ('enclosures', 0, 'type', {mimetype2ext}), + 'title': 'title', + 'description': ('description', {clean_html}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'series': ('podcast', 'title'), + })), + } + + class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' @@ -357,13 +221,13 @@ def _real_extract(self, url): story_id = self._match_id(url) webpage = self._download_webpage( - 'http://iptv.orf.at/stories/%s' % story_id, story_id) + f'http://iptv.orf.at/stories/{story_id}', story_id) video_id = self._search_regex( r'data-video(?:id)?="(\d+)"', webpage, 'video id') data = self._download_json( - 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + f'http://bits.orf.at/filehandler/static-api/json/current/data.json?file={video_id}', video_id)[0] duration = float_or_none(data['duration'], 1000) @@ -462,7 +326,7 @@ def _real_extract(self, url): all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) for idx, video_id in enumerate(all_ids): data = self._download_json( - 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + f'http://bits.orf.at/filehandler/static-api/json/current/data.json?file={video_id}', video_id)[0] duration = float_or_none(data['duration'], 1000) @@ -524,3 +388,172 @@ def _real_extract(self, url): }) return self.playlist_result(entries) + + +class ORFONIE(InfoExtractor): + IE_NAME = 'orf:on' + _VALID_URL = r'https?://on\.orf\.at/video/(?P<id>\d+)(?:/(?P<segment>\d+))?' 
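# --- Editor's note, not part of the patch: the _VALID_URL above captures a
# numeric episode ID plus an optional segment ID. Checked against URLs taken
# from the tests that follow: ---
import re

_ON_URL = r'https?://on\.orf\.at/video/(?P<id>\d+)(?:/(?P<segment>\d+))?'

m = re.match(_ON_URL, 'https://on.orf.at/video/14226549/15639808/jugendbande-einbrueche-aus-langeweile')
assert m.group('id') == '14226549' and m.group('segment') == '15639808'

m = re.match(_ON_URL, 'https://on.orf.at/video/3220355')
assert m.group('id') == '3220355' and m.group('segment') is None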
+ _TESTS = [{ + 'url': 'https://on.orf.at/video/14210000/school-of-champions-48', + 'info_dict': { + 'id': '14210000', + 'ext': 'mp4', + 'duration': 2651.08, + 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0167/98/thumb_16697671_segments_highlight_teaser.jpeg', + 'title': 'School of Champions (4/8)', + 'description': 'md5:d09ad279fc2e8502611e7648484b6afd', + 'media_type': 'episode', + 'timestamp': 1706558922, + 'upload_date': '20240129', + 'release_timestamp': 1706472362, + 'release_date': '20240128', + 'modified_timestamp': 1712756663, + 'modified_date': '20240410', + '_old_archive_ids': ['orftvthek 14210000'], + }, + }, { + 'url': 'https://on.orf.at/video/3220355', + 'md5': 'f94d98e667cf9a3851317efb4e136662', + 'info_dict': { + 'id': '3220355', + 'ext': 'mp4', + 'duration': 445.04, + 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0002/60/thumb_159573_segments_highlight_teaser.png', + 'title': '50 Jahre Burgenland: Der Festumzug', + 'description': 'md5:1560bf855119544ee8c4fa5376a2a6b0', + 'media_type': 'episode', + 'timestamp': 52916400, + 'upload_date': '19710905', + 'release_timestamp': 52916400, + 'release_date': '19710905', + 'modified_timestamp': 1498536049, + 'modified_date': '20170627', + '_old_archive_ids': ['orftvthek 3220355'], + }, + }, { + # Video with multiple segments selecting the second segment + 'url': 'https://on.orf.at/video/14226549/15639808/jugendbande-einbrueche-aus-langeweile', + 'md5': '90f4ebff86b4580837b8a361d0232a9e', + 'info_dict': { + 'id': '15639808', + 'ext': 'mp4', + 'duration': 97.707, + 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0175/43/thumb_17442704_segments_highlight_teaser.jpg', + 'title': 'Jugendbande: Einbrüche aus Langeweile', + 'description': 'md5:193df0bf0d91cf16830c211078097120', + 'media_type': 'segment', + 'timestamp': 1715792400, + 'upload_date': '20240515', + 'modified_timestamp': 1715794394, + 'modified_date': '20240515', + '_old_archive_ids': ['orftvthek 15639808'], + }, + 'params': {'noplaylist': True}, + }, { + # Video with multiple segments and no combined version + 'url': 'https://on.orf.at/video/14227864/formel-1-grosser-preis-von-monaco-2024', + 'info_dict': { + '_type': 'multi_video', + 'id': '14227864', + 'duration': 18410.52, + 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0176/04/thumb_17503881_segments_highlight_teaser.jpg', + 'title': 'Formel 1: Großer Preis von Monaco 2024', + 'description': 'md5:aeeb010710ccf70ce28ccb4482243d4f', + 'media_type': 'episode', + 'timestamp': 1716721200, + 'upload_date': '20240526', + 'release_timestamp': 1716721802, + 'release_date': '20240526', + 'modified_timestamp': 1716967501, + 'modified_date': '20240529', + }, + 'playlist_count': 42, + }, { + # Video with multiple segments, but with combined version + 'url': 'https://on.orf.at/video/14228172', + 'info_dict': { + 'id': '14228172', + 'ext': 'mp4', + 'duration': 3294.878, + 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0176/17/thumb_17516455_segments_highlight_teaser.jpg', + 'title': 'Willkommen Österreich mit Stermann & Grissemann', + 'description': 'md5:5de034d033a9c27f989343be3bbd4839', + 'media_type': 'episode', + 'timestamp': 1716926584, + 'upload_date': '20240528', + 'release_timestamp': 1716919202, + 'release_date': '20240528', + 'modified_timestamp': 1716968045, + 'modified_date': '20240529', + '_old_archive_ids': ['orftvthek 14228172'], + }, + }] + + @staticmethod + def _parse_metadata(api_json): + return traverse_obj(api_json, { + 'id': ('id', {int}, {str_or_none}), + 'age_limit': 
('age_classification', {parse_age_limit}), + 'duration': ('exact_duration', {functools.partial(float_or_none, scale=1000)}), + 'title': (('title', 'headline'), {str}), + 'description': (('description', 'teaser_text'), {str}), + 'media_type': ('video_type', {str}), + 'thumbnail': ('_embedded', 'image', 'public_urls', 'highlight_teaser', 'url', {url_or_none}), + 'timestamp': (('date', 'episode_date'), {parse_iso8601}), + 'release_timestamp': ('release_date', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + }, get_all=False) + + def _extract_video_info(self, video_id, api_json): + formats, subtitles = [], {} + for manifest_type in traverse_obj(api_json, ('sources', {dict.keys}, ...)): + for manifest_url in traverse_obj(api_json, ('sources', manifest_type, ..., 'src', {url_or_none})): + if manifest_type == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + manifest_url, video_id, fatal=False, m3u8_id='hls') + elif manifest_type == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles( + manifest_url, video_id, fatal=False, mpd_id='dash') + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + for sub_url in traverse_obj(api_json, ( + '_embedded', 'subtitle', + ('xml_url', 'sami_url', 'stl_url', 'ttml_url', 'srt_url', 'vtt_url'), {url_or_none})): + self._merge_subtitles({'de': [{'url': sub_url}]}, target=subtitles) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id('ORFTVthek', video_id)], + **self._parse_metadata(api_json), + } + + def _real_extract(self, url): + video_id, segment_id = self._match_valid_url(url).group('id', 'segment') + + encrypted_id = base64.b64encode(f'3dSlfek03nsLKdj4Jsd{video_id}'.encode()).decode() + api_json = self._download_json( + f'https://api-tvthek.orf.at/api/v4.3/public/episode/encrypted/{encrypted_id}', video_id) + + if traverse_obj(api_json, 'is_drm_protected'): + self.report_drm(video_id) + + segments = traverse_obj(api_json, ('_embedded', 'segments', lambda _, v: v['id'])) + selected_segment = traverse_obj(segments, (lambda _, v: str(v['id']) == segment_id, any)) + + # selected_segment will be falsy if input URL did not include a valid segment_id + if selected_segment and not self._yes_playlist(video_id, segment_id, playlist_label='episode', video_label='segment'): + return self._extract_video_info(segment_id, selected_segment) + + # Even some segmented videos have an unsegmented version available in API response root + if (self._configuration_arg('prefer_segments_playlist') + or not traverse_obj(api_json, ('sources', ..., ..., 'src', {url_or_none}))): + return self.playlist_result( + (self._extract_video_info(str(segment['id']), segment) for segment in segments), + video_id, **self._parse_metadata(api_json), multi_video=True) + + return self._extract_video_info(video_id, api_json) diff --git a/yt_dlp/extractor/outsidetv.py b/yt_dlp/extractor/outsidetv.py index b1fcbd6a77..b9191c9cce 100644 --- a/yt_dlp/extractor/outsidetv.py +++ b/yt_dlp/extractor/outsidetv.py @@ -13,7 +13,7 @@ class OutsideTVIE(InfoExtractor): 'description': 'md5:41a12e94f3db3ca253b04bb1e8d8f4cd', 'upload_date': '20181225', 'timestamp': 1545742800, - } + }, }, { 'url': 'http://www.outsidetv.com/home/play/ZjQYboH6/1/10/Hdg0jukV/4', 'only_matching': True, diff --git a/yt_dlp/extractor/owncloud.py b/yt_dlp/extractor/owncloud.py new file mode 100644 index 0000000000..79fd830bb3 --- /dev/null +++ b/yt_dlp/extractor/owncloud.py @@ -0,0 +1,80 
@@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + url_or_none, + urlencode_postdata, +) + + +class OwnCloudIE(InfoExtractor): + _INSTANCES_RE = '|'.join(( + r'(?:[^\.]+\.)?sciebo\.de', + r'cloud\.uni-koblenz-landau\.de', + )) + _VALID_URL = rf'https?://(?:{_INSTANCES_RE})/s/(?P<id>[\w.-]+)' + + _TESTS = [ + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/wWhqZzh9jTumVFN', + 'info_dict': { + 'id': 'wWhqZzh9jTumVFN', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + }, + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/WNDuFu0XuFtmm3f', + 'info_dict': { + 'id': 'WNDuFu0XuFtmm3f', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + 'params': { + 'videopassword': '12345', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, video_id) + + if re.search(r'<label[^>]+for="password"', webpage): + webpage = self._verify_video_password(webpage, urlh.url, video_id) + + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs.get('filename') + parsed_url = urllib.parse.urlparse(url) + + return { + 'id': video_id, + 'title': title, + 'url': url_or_none(hidden_inputs.get('downloadURL')) or parsed_url._replace( + path=urllib.parse.urljoin(parsed_url.path, 'download')).geturl(), + 'ext': determine_ext(title), + } + + def _verify_video_password(self, webpage, url, video_id): + password = self.get_param('videopassword') + if password is None: + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + + validation_response = self._download_webpage( + url, video_id, 'Validating Password', 'Wrong password?', + data=urlencode_postdata({ + 'requesttoken': self._hidden_inputs(webpage)['requesttoken'], + 'password': password, + })) + + if re.search(r'<label[^>]+for="password"', validation_response): + warning = self._search_regex( + r'<div[^>]+class="warning">([^<]*)</div>', validation_response, + 'warning', default='The password is wrong') + raise ExtractorError(f'Opening the video failed, {self.IE_NAME} said: {warning!r}', expected=True) + return validation_response diff --git a/yt_dlp/extractor/packtpub.py b/yt_dlp/extractor/packtpub.py index 51778d8a20..38ffd3451f 100644 --- a/yt_dlp/extractor/packtpub.py +++ b/yt_dlp/extractor/packtpub.py @@ -1,18 +1,14 @@ import json from .common import InfoExtractor -from ..compat import ( - # compat_str, - compat_HTTPError, -) +from ..networking.exceptions import HTTPError from ..utils import ( - clean_html, ExtractorError, + clean_html, # remove_end, str_or_none, strip_or_none, unified_timestamp, - # urljoin, ) @@ -54,8 +50,8 @@ def _perform_login(self, username, password): 'password': password, }).encode())['data']['access'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404): - message = self._parse_json(e.cause.read().decode(), None)['message'] + if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 404): + message = self._parse_json(e.cause.response.read().decode(), None)['message'] raise ExtractorError(message, expected=True) raise @@ -67,10 +63,10 @@ def _real_extract(self, url): headers['Authorization'] = 'Bearer ' + self._TOKEN try: video_url = self._download_json( - 'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id, + f'https://services.packtpub.com/products-v1/products/{course_id}/{chapter_id}/{video_id}', 
video_id, 'Downloading JSON video', headers=headers)['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: self.raise_login_required('This video is locked') raise @@ -113,17 +109,16 @@ class PacktPubCourseIE(PacktPubBaseIE): @classmethod def suitable(cls, url): - return False if PacktPubIE.suitable(url) else super( - PacktPubCourseIE, cls).suitable(url) + return False if PacktPubIE.suitable(url) else super().suitable(url) def _real_extract(self, url): mobj = self._match_valid_url(url) url, course_id = mobj.group('url', 'id') course = self._download_json( - self._STATIC_PRODUCTS_BASE + '%s/toc' % course_id, course_id) + self._STATIC_PRODUCTS_BASE + f'{course_id}/toc', course_id) metadata = self._download_json( - self._STATIC_PRODUCTS_BASE + '%s/summary' % course_id, + self._STATIC_PRODUCTS_BASE + f'{course_id}/summary', course_id, fatal=False) or {} entries = [] diff --git a/yt_dlp/extractor/palcomp3.py b/yt_dlp/extractor/palcomp3.py index 4b0801c1a0..138a7853ac 100644 --- a/yt_dlp/extractor/palcomp3.py +++ b/yt_dlp/extractor/palcomp3.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, str_or_none, @@ -30,7 +29,7 @@ def _call_api(self, artist_slug, artist_fields): })['data'] def _parse_music(self, music): - music_id = compat_str(music['musicID']) + music_id = str(music['musicID']) title = music['title'] formats = [] @@ -77,12 +76,12 @@ class PalcoMP3IE(PalcoMP3BaseIE): 'title': 'Nossas Composições - CUIDA BEM DELA', 'duration': 210, 'view_count': int, - } + }, }] @classmethod def suitable(cls, url): - return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + return False if PalcoMP3VideoIE.suitable(url) else super().suitable(url) class PalcoMP3ArtistIE(PalcoMP3BaseIE): @@ -106,7 +105,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE): @classmethod def suitable(cls, url): - return False if PalcoMP3IE._match_valid_url(url) else super(PalcoMP3ArtistIE, cls).suitable(url) + return False if PalcoMP3IE._match_valid_url(url) else super().suitable(url) def _real_extract(self, url): artist_slug = self._match_id(url) @@ -134,7 +133,7 @@ class PalcoMP3VideoIE(PalcoMP3BaseIE): 'upload_date': '20161107', 'uploader_id': 'maiaramaraisaoficial', 'uploader': 'Maiara e Maraisa', - } + }, }] _MUSIC_FIELDS = 'youtubeID' diff --git a/yt_dlp/extractor/pandoratv.py b/yt_dlp/extractor/pandoratv.py deleted file mode 100644 index ccc78da57f..0000000000 --- a/yt_dlp/extractor/pandoratv.py +++ /dev/null @@ -1,128 +0,0 @@ -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - float_or_none, - parse_duration, - parse_qs, - str_to_int, - urlencode_postdata, -) - - -class PandoraTVIE(InfoExtractor): - IE_NAME = 'pandora.tv' - IE_DESC = '판도라TV' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)| # new format - (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?| # old format - m\.pandora\.tv/?\? 
# mobile - ) - ''' - _TESTS = [{ - 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', - 'info_dict': { - 'id': '53294230', - 'ext': 'flv', - 'title': '頭を撫でてくれる?', - 'description': '頭を撫でてくれる?', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 39, - 'upload_date': '20151218', - 'uploader': 'カワイイ動物まとめ', - 'uploader_id': 'mikakim', - 'view_count': int, - 'like_count': int, - } - }, { - 'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744', - 'info_dict': { - 'id': '54721744', - 'ext': 'flv', - 'title': '[HD] JAPAN COUNTDOWN 170423', - 'description': '[HD] JAPAN COUNTDOWN 170423', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1704.9, - 'upload_date': '20170423', - 'uploader': 'GOGO_UCC', - 'uploader_id': 'gogoucc', - 'view_count': int, - 'like_count': int, - }, - 'params': { - # Test metadata only - 'skip_download': True, - }, - }, { - 'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new', - 'only_matching': True, - }, { - 'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - user_id = mobj.group('user_id') - video_id = mobj.group('id') - - if not user_id or not video_id: - qs = parse_qs(url) - video_id = qs.get('prgid', [None])[0] - user_id = qs.get('ch_userid', [None])[0] - if any(not f for f in (video_id, user_id,)): - raise ExtractorError('Invalid URL', expected=True) - - data = self._download_json( - 'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s' - % (user_id, video_id), video_id) - - info = data['data']['rows']['vod_play_info']['result'] - - formats = [] - for format_id, format_url in info.items(): - if not format_url: - continue - height = self._search_regex( - r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) - if not height: - continue - - play_url = self._download_json( - 'http://m.pandora.tv/?c=api&m=play_url', video_id, - data=urlencode_postdata({ - 'prgid': video_id, - 'runtime': info.get('runtime'), - 'vod_url': format_url, - }), - headers={ - 'Origin': url, - 'Content-Type': 'application/x-www-form-urlencoded', - }) - format_url = play_url.get('url') - if not format_url: - continue - - formats.append({ - 'format_id': '%sp' % height, - 'url': format_url, - 'height': int(height), - }) - - return { - 'id': video_id, - 'title': info['subject'], - 'description': info.get('body'), - 'thumbnail': info.get('thumbnail') or info.get('poster'), - 'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), - 'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None, - 'uploader': info.get('nickname'), - 'uploader_id': info.get('upload_userid'), - 'view_count': str_to_int(info.get('hit')), - 'like_count': str_to_int(info.get('likecnt')), - 'formats': formats, - } diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 6e3c9f442d..91f1055193 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -1,21 +1,17 @@ import calendar -import json +import datetime as dt import functools -from datetime import datetime -from random import random +import json +import random +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urlparse -) - from ..utils import ( - bug_reports_message, ExtractorError, + OnDemandPagedList, + bug_reports_message, get_first, 
int_or_none, - OnDemandPagedList, parse_qs, srt_subtitles_timecode, traverse_obj, @@ -48,7 +44,7 @@ class PanoptoBaseIE(InfoExtractor): 18: 'hu-HU', 19: 'nb-NO', 20: 'sv-SE', - 21: 'it-IT' + 21: 'it-IT', } def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs): @@ -70,7 +66,7 @@ def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs): @staticmethod def _parse_fragment(url): - return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} + return {k: json.loads(v[0]) for k, v in urllib.parse.parse_qs(urllib.parse.urlparse(url).fragment).items()} class PanoptoIE(PanoptoBaseIE): @@ -92,7 +88,7 @@ class PanoptoIE(PanoptoBaseIE): 'average_rating': int, 'uploader_id': '2db6b718-47a0-4b0b-9e17-ab0b00f42b1e', 'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', - 'channel': 'Showcase Videos' + 'channel': 'Showcase Videos', }, }, { @@ -135,7 +131,7 @@ class PanoptoIE(PanoptoBaseIE): 'uploader': 'Kathryn Kelly', 'channel_id': 'fb93bc3c-6750-4b80-a05b-a921013735d3', 'channel': 'Getting Started', - } + }, }, { # Does not allow normal Viewer.aspx. AUDIO livestream has no url, so should be skipped and only give one stream. @@ -178,7 +174,7 @@ class PanoptoIE(PanoptoBaseIE): 'chapters': 'count:28', 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+', }, - 'params': {'format': 'mhtml', 'skip_download': True} + 'params': {'format': 'mhtml', 'skip_download': True}, }, { 'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=8285224a-9a2b-4957-84f2-acb0000c4ea9', @@ -200,7 +196,7 @@ class PanoptoIE(PanoptoBaseIE): 'subtitles': {'en-US': [{'ext': 'srt', 'data': 'md5:a3f4d25963fdeace838f327097c13265'}], 'es-ES': [{'ext': 'srt', 'data': 'md5:57e9dad365fd0fbaf0468eac4949f189'}]}, }, - 'params': {'writesubtitles': True, 'skip_download': True} + 'params': {'writesubtitles': True, 'skip_download': True}, }, { # On Panopto there are two subs: "Default" and en-US. en-US is blank and should be skipped. 
'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=940cbd41-f616-4a45-b13e-aaf1000c915b', @@ -222,15 +218,15 @@ class PanoptoIE(PanoptoBaseIE): 'upload_date': '20191129', }, - 'params': {'writesubtitles': True, 'skip_download': True} + 'params': {'writesubtitles': True, 'skip_download': True}, }, { 'url': 'https://ucc.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=0e8484a4-4ceb-4d98-a63f-ac0200b455cb', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://brown.hosted.panopto.com/Panopto/Pages/Embed.aspx?id=0b3ff73b-36a0-46c5-8455-aadf010a3638', - 'only_matching': True + 'only_matching': True, }, ] @@ -243,7 +239,7 @@ def _mark_watched(self, base_url, video_id, delivery_info): invocation_id = delivery_info.get('InvocationId') stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) if invocation_id and stream_id and duration: - timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' + timestamp_str = f'/Date({calendar.timegm(dt.datetime.now(dt.timezone.utc).timetuple())}000)/' data = { 'streamRequests': [ { @@ -258,7 +254,7 @@ def _mark_watched(self, base_url, video_id, delivery_info): 'StopReason': None, 'StreamID': stream_id, 'TimeStamp': timestamp_str, - 'UpdatesRejected': 0 + 'UpdatesRejected': 0, }, ]} @@ -278,7 +274,7 @@ def _extract_chapters(timestamps): chapters.append({ 'start_time': start, 'end_time': start + duration, - 'title': caption + 'title': caption, }) return chapters @@ -287,11 +283,11 @@ def _extract_mhtml_formats(base_url, timestamps): image_frags = {} for timestamp in timestamps or []: duration = timestamp.get('Duration') - obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber'), + obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber') if timestamp.get('EventTargetType') == 'PowerPoint' and obj_id is not None and obj_sn is not None: image_frags.setdefault('slides', []).append({ 'url': base_url + f'/Pages/Viewer/Image.aspx?id={obj_id}&number={obj_sn}', - 'duration': duration + 'duration': duration, }) obj_pid, session_id, abs_time = timestamp.get('ObjectPublicIdentifier'), timestamp.get('SessionID'), timestamp.get('AbsoluteTime') @@ -308,7 +304,7 @@ def _extract_mhtml_formats(base_url, timestamps): 'acodec': 'none', 'vcodec': 'none', 'url': 'about:invalid', - 'fragments': fragments + 'fragments': fragments, } @staticmethod @@ -333,8 +329,8 @@ def _get_subtitles(self, base_url, video_id, delivery): 'deliveryId': video_id, 'getCaptions': True, 'language': str(lang), - 'responseType': 'json' - } + 'responseType': 'json', + }, ) if not isinstance(response, list): continue @@ -363,12 +359,12 @@ def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs subtitles = self._merge_subtitles(subtitles, stream_subtitles) else: stream_formats.append({ - 'url': stream_url + 'url': stream_url, }) for fmt in stream_formats: fmt.update({ 'format_note': stream.get('Tag'), - **fmt_kwargs + **fmt_kwargs, }) formats.extend(stream_formats) @@ -388,7 +384,7 @@ def _real_extract(self, url): 'isKollectiveAgentInstalled': 'false', 'isEmbed': 'false', 'responseType': 'json', - } + }, ) delivery = delivery_info['Delivery'] @@ -415,7 +411,7 @@ def _real_extract(self, url): 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), expected_type=lambda x: x or None), 'timestamp': session_start_time - 11640000000 if session_start_time else None, 'duration': 
delivery.get('Duration'), - 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}', + 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random.random()}', 'average_rating': delivery.get('AverageRating'), 'chapters': self._extract_chapters(timestamps), 'uploader': delivery.get('OwnerDisplayName') or None, @@ -425,7 +421,7 @@ def _real_extract(self, url): 'channel_id': delivery.get('SessionGroupPublicID'), 'channel': traverse_obj(delivery, 'SessionGroupLongName', 'SessionGroupShortName', get_all=False), 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, } @@ -439,7 +435,7 @@ class PanoptoPlaylistIE(PanoptoBaseIE): 'id': 'f3b39fcf-882f-4849-93d6-a9f401236d36', 'description': '', }, - 'playlist_mincount': 36 + 'playlist_mincount': 36, }, { 'url': 'https://utsa.hosted.panopto.com/Panopto/Pages/Viewer.aspx?pid=e2900555-3ad4-4bdb-854d-ad2401686190', @@ -448,7 +444,7 @@ class PanoptoPlaylistIE(PanoptoBaseIE): 'id': 'e2900555-3ad4-4bdb-854d-ad2401686190', 'description': 'md5:f958bca50a1cbda15fdc1e20d32b3ecb', }, - 'playlist_mincount': 4 + 'playlist_mincount': 4, }, ] @@ -470,7 +466,7 @@ def _entries(self, base_url, playlist_id, session_list_id): 'description': item.get('Description'), 'duration': item.get('Duration'), 'channel': traverse_obj(item, ('Parent', 'Name')), - 'channel_id': traverse_obj(item, ('Parent', 'Id')) + 'channel_id': traverse_obj(item, ('Parent', 'Id')), } def _real_extract(self, url): @@ -479,7 +475,7 @@ def _real_extract(self, url): video_id = get_first(parse_qs(url), 'id') if video_id: if self.get_param('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + self.to_screen(f'Downloading just video {video_id} because of --no-playlist') return self.url_result(base_url + f'/Pages/Viewer.aspx?id={video_id}', ie_key=PanoptoIE.ie_key(), video_id=video_id) else: self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') @@ -499,28 +495,28 @@ class PanoptoListIE(PanoptoBaseIE): 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#folderID=%22e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a%22', 'info_dict': { 'id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a', - 'title': 'Showcase Videos' + 'title': 'Showcase Videos', }, - 'playlist_mincount': 140 + 'playlist_mincount': 140, }, { 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx#view=2&maxResults=250', 'info_dict': { 'id': 'panopto_list', - 'title': 'panopto_list' + 'title': 'panopto_list', }, - 'playlist_mincount': 300 + 'playlist_mincount': 300, }, { # Folder that contains 8 folders and a playlist 'url': 'https://howtovideos.hosted.panopto.com/Panopto/Pages/Sessions/List.aspx?noredirect=true#folderID=%224b9de7ae-0080-4158-8496-a9ba01692c2e%22', 'info_dict': { 'id': '4b9de7ae-0080-4158-8496-a9ba01692c2e', - 'title': 'Video Tutorials' + 'title': 'Video Tutorials', }, - 'playlist_mincount': 9 - } + 'playlist_mincount': 9, + }, ] @@ -536,7 +532,7 @@ def _fetch_page(self, base_url, query_params, display_id, page): } response = self._call_api( - base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page+1}', + base_url, '/Services/Data.svc/GetSessions', f'{display_id} page {page + 1}', data={'queryParameters': params}, fatal=False) for result in get_first(response, 'Results', default=[]): @@ -563,7 +559,7 @@ def _extract_folder_metadata(self, base_url, folder_id): 
base_url, '/Services/Data.svc/GetFolderInfo', folder_id, data={'folderID': folder_id}, fatal=False) return { - 'title': get_first(response, 'Name') + 'title': get_first(response, 'Name'), } def _real_extract(self, url): diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py index 7e472a63e0..317f53b2bc 100644 --- a/yt_dlp/extractor/paramountplus.py +++ b/yt_dlp/extractor/paramountplus.py @@ -1,7 +1,7 @@ import itertools -from .common import InfoExtractor from .cbs import CBSBaseIE +from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, @@ -170,19 +170,19 @@ class ParamountPlusSeriesIE(InfoExtractor): 'playlist_mincount': 50, 'info_dict': { 'id': 'drake-josh', - } + }, }, { 'url': 'https://www.paramountplus.com/shows/hawaii_five_0/', 'playlist_mincount': 240, 'info_dict': { 'id': 'hawaii_five_0', - } + }, }, { 'url': 'https://www.paramountplus.com/shows/spongebob-squarepants/', 'playlist_mincount': 248, 'info_dict': { 'id': 'spongebob-squarepants', - } + }, }] def _entries(self, show_name): @@ -193,7 +193,7 @@ def _entries(self, show_name): return for episode in show_json['result']['data']: yield self.url_result( - 'https://www.paramountplus.com%s' % episode['url'], + 'https://www.paramountplus.com{}'.format(episode['url']), ie=ParamountPlusIE.ie_key(), video_id=episode['content_id']) def _real_extract(self, url): diff --git a/yt_dlp/extractor/parler.py b/yt_dlp/extractor/parler.py index 68a60bc84f..9be288a7d0 100644 --- a/yt_dlp/extractor/parler.py +++ b/yt_dlp/extractor/parler.py @@ -1,19 +1,20 @@ +import functools + from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( clean_html, - format_field, int_or_none, strip_or_none, traverse_obj, unified_timestamp, - urlencode_postdata, + urljoin, ) class ParlerIE(InfoExtractor): IE_DESC = 'Posts on parler.com' - _VALID_URL = r'https://parler\.com/feed/(?P<id>[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' + _VALID_URL = r'https?://parler\.com/feed/(?P<id>[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [ { 'url': 'https://parler.com/feed/df79fdba-07cc-48fe-b085-3293897520d7', @@ -24,7 +25,7 @@ class ParlerIE(InfoExtractor): 'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg', 'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7', 'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197', - 'timestamp': 1659744000, + 'timestamp': 1659785481, 'upload_date': '20220806', 'uploader': 'Tulsi Gabbard', 'uploader_id': 'TulsiGabbard', @@ -34,78 +35,57 @@ class ParlerIE(InfoExtractor): 'repost_count': int, }, }, - { - 'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'md5': '11687e2f5bb353682cee338d181422ed', - 'info_dict': { - 'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'ext': 'mp4', - 'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg', - 'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'description': 'This man should run for office', - 'timestamp': 1659657600, - 'upload_date': '20220805', - 'uploader': 'Benny Johnson', - 'uploader_id': 'BennyJohnson', - 'uploader_url': 'https://parler.com/BennyJohnson', - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { 'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5', 'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4', 'info_dict': { 'id': 'r5vkSaz8PxQ', 'ext': 'mp4', - 'thumbnail': 
'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp', - 'title': 'Tom MacDonald Names Reaction', - 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', - 'upload_date': '20220716', - 'duration': 1267, - 'uploader': 'Mahesh Chookolingo', - 'uploader_id': 'maheshchookolingo', - 'uploader_url': 'http://www.youtube.com/user/maheshchookolingo', - 'channel': 'Mahesh Chookolingo', - 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', - 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', - 'categories': ['Entertainment'], - 'tags': list, - 'availability': 'public', 'live_status': 'not_live', - 'view_count': int, 'comment_count': int, + 'duration': 1267, 'like_count': int, 'channel_follower_count': int, - 'age_limit': 0, + 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', + 'upload_date': '20220716', + 'thumbnail': 'https://i.ytimg.com/vi/r5vkSaz8PxQ/maxresdefault.jpg', + 'tags': 'count:17', + 'availability': 'public', + 'categories': ['Entertainment'], 'playable_in_embed': True, + 'channel': 'Who Knows What! With Mahesh & Friends', + 'title': 'Tom MacDonald Names Reaction', + 'uploader': 'Who Knows What! With Mahesh & Friends', + 'uploader_id': '@maheshchookolingo', + 'age_limit': 0, + 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', + 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/@maheshchookolingo', }, }, ] def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id, - data=urlencode_postdata({'uuid': video_id}))['data'][0] - primary = data['primary'] - - embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False) - if embed: - return self.url_result(embed[0], YoutubeIE) + data = self._download_json(f'https://api.parler.com/v0/public/parleys/{video_id}', + video_id)['data'] + if data.get('link'): + return self.url_result(data['link'], YoutubeIE) return { 'id': video_id, - 'url': traverse_obj(primary, ('video_data', 'videoSrc')), - 'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')), - 'title': '', - 'description': strip_or_none(clean_html(primary.get('full_body'))) or None, - 'timestamp': unified_timestamp(primary.get('date_created')), - 'uploader': strip_or_none(primary.get('name')), - 'uploader_id': strip_or_none(primary.get('username')), - 'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'), - 'view_count': int_or_none(primary.get('view_count')), - 'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))), - 'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))), + 'title': strip_or_none(data.get('title')) or '', + **traverse_obj(data, { + 'url': ('video', 'videoSrc'), + 'thumbnail': ('video', 'thumbnailUrl'), + 'description': ('body', {clean_html}), + 'timestamp': ('date_created', {unified_timestamp}), + 'uploader': ('user', 'name', {strip_or_none}), + 'uploader_id': ('user', 'username', {str}), + 'uploader_url': ('user', 'username', {functools.partial(urljoin, 'https://parler.com/')}), + 'view_count': ('views', {int_or_none}), + 'comment_count': ('total_comments', {int_or_none}), + 'repost_count': ('echos', {int_or_none}), + }), } diff --git a/yt_dlp/extractor/parlview.py b/yt_dlp/extractor/parlview.py index 0b547917c0..b93b5edacd 100644 --- a/yt_dlp/extractor/parlview.py +++ b/yt_dlp/extractor/parlview.py @@ -1,5 +1,4 @@ from .common import InfoExtractor 
-from ..compat import compat_str from ..utils import ( int_or_none, try_get, @@ -8,7 +7,7 @@ class ParlviewIE(InfoExtractor): - + _WORKING = False _VALID_URL = r'https?://(?:www\.)?parlview\.aph\.gov\.au/(?:[^/]+)?\bvideoID=(?P<id>\d{6})' _TESTS = [{ 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=542661', @@ -24,7 +23,7 @@ class ParlviewIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=539936', 'only_matching': True, @@ -36,13 +35,13 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) media = self._download_json(self._API_URL % video_id, video_id).get('media') - timestamp = try_get(media, lambda x: x['timeMap']['source']['timecode_offsets'][0], compat_str) or '/' + timestamp = try_get(media, lambda x: x['timeMap']['source']['timecode_offsets'][0], str) or '/' stream = try_get(media, lambda x: x['renditions'][0], dict) if not stream: self.raise_no_formats('No streams were detected') elif stream.get('streamType') != 'VOD': - self.raise_no_formats('Unknown type of stream was detected: "%s"' % str(stream.get('streamType'))) + self.raise_no_formats('Unknown type of stream was detected: "{}"'.format(str(stream.get('streamType')))) formats = self._extract_m3u8_formats(stream['url'], video_id, 'mp4', 'm3u8_native') media_info = self._download_webpage( diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index e93e37eb93..4489d533a6 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -1,22 +1,23 @@ import itertools -from urllib.error import HTTPError +import urllib.parse from .common import InfoExtractor +from .sproutvideo import VidsIoIE from .vimeo import VimeoIE - -from ..compat import compat_urllib_parse_unquote +from ..networking.exceptions import HTTPError from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, clean_html, determine_ext, - ExtractorError, int_or_none, - KNOWN_EXTENSIONS, mimetype2ext, parse_iso8601, + smuggle_url, str_or_none, traverse_obj, - try_get, url_or_none, + urljoin, ) @@ -34,12 +35,12 @@ def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None try: return self._download_json( f'https://www.patreon.com/api/{ep}', - item_id, note='Downloading API JSON' if not note else note, + item_id, note=note if note else 'Downloading API JSON', query=query, fatal=fatal, headers=headers) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.headers.get('Content-Type')) != 'json': + if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.response.headers.get('Content-Type')) != 'json': raise - err_json = self._parse_json(self._webpage_read_content(e.cause, None, item_id), item_id, fatal=False) + err_json = self._parse_json(self._webpage_read_content(e.cause.response, None, item_id), item_id, fatal=False) err_message = traverse_obj(err_json, ('errors', ..., 'detail'), get_all=False) if err_message: raise ExtractorError(f'Patreon said: {err_message}', expected=True) @@ -92,7 +93,7 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': 're:^https?://.*$', 'upload_date': '20150211', 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', - 'uploader_id': 'TraciJHines', + 'uploader_id': '@TraciHinesMusic', 'categories': ['Entertainment'], 'duration': 282, 'view_count': int, @@ -106,13 +107,15 @@ class PatreonIE(PatreonBaseIE): 'availability': 'public', 'channel_follower_count': int, 'playable_in_embed': True, - 'uploader_url': 
'http://www.youtube.com/user/TraciJHines', + 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic', 'comment_count': int, + 'channel_is_verified': True, + 'chapters': 'count:4', }, 'params': { 'noplaylist': True, 'skip_download': True, - } + }, }, { 'url': 'https://www.patreon.com/posts/episode-166-of-743933', 'only_matching': True, @@ -132,7 +135,7 @@ class PatreonIE(PatreonBaseIE): 'description': 'md5:557a409bd79d3898689419094934ba79', 'uploader_id': '14936315', }, - 'skip': 'Patron-only content' + 'skip': 'Patron-only content', }, { # m3u8 video (https://github.com/yt-dlp/yt-dlp/issues/2277) 'url': 'https://www.patreon.com/posts/video-sketchbook-32452882', @@ -153,7 +156,7 @@ class PatreonIE(PatreonBaseIE): 'channel_id': '1641751', 'channel_url': 'https://www.patreon.com/loish', 'channel_follower_count': int, - } + }, }, { # bad videos under media (if media is included). Real one is under post_file 'url': 'https://www.patreon.com/posts/premium-access-70282931', @@ -176,7 +179,71 @@ class PatreonIE(PatreonBaseIE): 'uploader_url': 'https://www.patreon.com/thenormies', }, 'skip': 'Patron-only content', + }, { + # dead vimeo and embed URLs, need to extract post_file + 'url': 'https://www.patreon.com/posts/hunter-x-hunter-34007913', + 'info_dict': { + 'id': '34007913', + 'ext': 'mp4', + 'title': 'Hunter x Hunter | Kurapika DESTROYS Uvogin!!!', + 'like_count': int, + 'uploader': 'YaBoyRoshi', + 'timestamp': 1581636833, + 'channel_url': 'https://www.patreon.com/yaboyroshi', + 'thumbnail': r're:^https?://.*$', + 'tags': ['Hunter x Hunter'], + 'uploader_id': '14264111', + 'comment_count': int, + 'channel_follower_count': int, + 'description': 'Kurapika is a walking cheat code!', + 'upload_date': '20200213', + 'channel_id': '2147162', + 'uploader_url': 'https://www.patreon.com/yaboyroshi', + }, + }, { + # NSFW vimeo embed URL + 'url': 'https://www.patreon.com/posts/4k-spiderman-4k-96414599', + 'info_dict': { + 'id': '902250943', + 'ext': 'mp4', + 'title': '❤️(4K) Spiderman Girl Yeonhwa’s Gift ❤️(4K) 스파이더맨걸 연화의 선물', + 'description': '❤️(4K) Spiderman Girl Yeonhwa’s Gift \n❤️(4K) 스파이더맨걸 연화의 선물', + 'uploader': 'Npickyeonhwa', + 'uploader_id': '90574422', + 'uploader_url': 'https://www.patreon.com/Yeonhwa726', + 'channel_id': '10237902', + 'channel_url': 'https://www.patreon.com/Yeonhwa726', + 'duration': 70, + 'timestamp': 1705150153, + 'upload_date': '20240113', + 'comment_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.+', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # multiple attachments/embeds + 'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977', + 'playlist_count': 3, + 'info_dict': { + 'id': '100601977', + 'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis', + 'description': 'md5:d099ab976edfce6de2a65c2b169a88d3', + 'uploader': 'Bradley Hall', + 'uploader_id': '24401883', + 'uploader_url': 'https://www.patreon.com/bradleyhallguitar', + 'channel_id': '3193932', + 'channel_url': 'https://www.patreon.com/bradleyhallguitar', + 'channel_follower_count': int, + 'timestamp': 1710777855, + 'upload_date': '20240318', + 'like_count': int, + 'comment_count': int, + 'thumbnail': r're:^https?://.+', + }, + 'skip': 'Patron-only content', }] + _RETURN_TYPE = 'video' def _real_extract(self, url): video_id = self._match_id(url) @@ -191,102 +258,114 @@ def _real_extract(self, url): 'include': 'audio,user,user_defined_tags,campaign,attachments_media', }) attributes = post['data']['attributes'] - title = attributes['title'].strip() - image = 
attributes.get('image') or {} - info = { - 'id': video_id, - 'title': title, - 'description': clean_html(attributes.get('content')), - 'thumbnail': image.get('large_url') or image.get('url'), - 'timestamp': parse_iso8601(attributes.get('published_at')), - 'like_count': int_or_none(attributes.get('like_count')), - 'comment_count': int_or_none(attributes.get('comment_count')), - } - can_view_post = traverse_obj(attributes, 'current_user_can_view') - if can_view_post and info['comment_count']: - info['__post_extractor'] = self.extract_comments(video_id) + info = traverse_obj(attributes, { + 'title': ('title', {str.strip}), + 'description': ('content', {clean_html}), + 'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any), + 'timestamp': ('published_at', {parse_iso8601}), + 'like_count': ('like_count', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + }) - for i in post.get('included', []): - i_type = i.get('type') - if i_type == 'media': - media_attributes = i.get('attributes') or {} - download_url = media_attributes.get('download_url') + entries = [] + idx = 0 + for include in traverse_obj(post, ('included', lambda _, v: v['type'])): + include_type = include['type'] + if include_type == 'media': + media_attributes = traverse_obj(include, ('attributes', {dict})) or {} + download_url = url_or_none(media_attributes.get('download_url')) ext = mimetype2ext(media_attributes.get('mimetype')) # if size_bytes is None, this media file is likely unavailable # See: https://github.com/yt-dlp/yt-dlp/issues/4608 size_bytes = int_or_none(media_attributes.get('size_bytes')) if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: - # XXX: what happens if there are multiple attachments? - return { - **info, + idx += 1 + entries.append({ + 'id': f'{video_id}-{idx}', 'ext': ext, 'filesize': size_bytes, 'url': download_url, - } - elif i_type == 'user': - user_attributes = i.get('attributes') - if user_attributes: - info.update({ - 'uploader': user_attributes.get('full_name'), - 'uploader_id': str_or_none(i.get('id')), - 'uploader_url': user_attributes.get('url'), }) - elif i_type == 'post_tag': - info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value'))) + elif include_type == 'user': + info.update(traverse_obj(include, { + 'uploader': ('attributes', 'full_name', {str}), + 'uploader_id': ('id', {str_or_none}), + 'uploader_url': ('attributes', 'url', {url_or_none}), + })) - elif i_type == 'campaign': - info.update({ - 'channel': traverse_obj(i, ('attributes', 'title')), - 'channel_id': str_or_none(i.get('id')), - 'channel_url': traverse_obj(i, ('attributes', 'url')), - 'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))), - }) + elif include_type == 'post_tag': + if post_tag := traverse_obj(include, ('attributes', 'value', {str})): + info.setdefault('tags', []).append(post_tag) + + elif include_type == 'campaign': + info.update(traverse_obj(include, { + 'channel': ('attributes', 'title', {str}), + 'channel_id': ('id', {str_or_none}), + 'channel_url': ('attributes', 'url', {url_or_none}), + 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), + })) + + # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo + headers = {'referer': 'https://patreon.com/'} # handle Vimeo embeds - if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': - embed_html = try_get(attributes, lambda x: x['embed']['html']) - v_url = url_or_none(compat_urllib_parse_unquote( - 
self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) - if v_url: - return { - **info, - '_type': 'url_transparent', - 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'), - 'ie_key': 'Vimeo', - } + if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': + v_url = urllib.parse.unquote(self._html_search_regex( + r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', + traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '') + if url_or_none(v_url) and self._request_webpage( + v_url, video_id, 'Checking Vimeo embed URL', headers=headers, + fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection + entries.append(self.url_result( + VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), + VimeoIE, url_transparent=True)) - embed_url = try_get(attributes, lambda x: x['embed']['url']) - if embed_url: - return { - **info, - '_type': 'url', - 'url': embed_url, - } + embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) + if embed_url and (urlh := self._request_webpage( + embed_url, video_id, 'Checking embed URL', headers=headers, + fatal=False, errnote=False, expected_status=403)): + # Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie + if urlh.status != 403 or VidsIoIE.suitable(embed_url): + entries.append(self.url_result(smuggle_url(embed_url, headers))) - post_file = traverse_obj(attributes, 'post_file') + post_file = traverse_obj(attributes, ('post_file', {dict})) if post_file: name = post_file.get('name') ext = determine_ext(name) if ext in KNOWN_EXTENSIONS: - return { - **info, + entries.append({ + 'id': video_id, 'ext': ext, 'url': post_file['url'], - } - elif name == 'video': + }) + elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) - return { - **info, + entries.append({ + 'id': video_id, 'formats': formats, 'subtitles': subtitles, - } + }) - if can_view_post is False: + can_view_post = traverse_obj(attributes, 'current_user_can_view') + comments = None + if can_view_post and info.get('comment_count'): + comments = self.extract_comments(video_id) + + if not entries and can_view_post is False: self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True) - else: + elif not entries: self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True) + elif len(entries) == 1: + info.update(entries[0]) + else: + for entry in entries: + entry.update(info) + return self.playlist_result(entries, video_id, **info, __post_extractor=comments) + + info['id'] = video_id + info['__post_extractor'] = comments return info def _get_comments(self, post_id): @@ -307,7 +386,7 @@ def _get_comments(self, post_id): params.update({'page[cursor]': cursor} if cursor else {}) response = self._call_api( - f'posts/{post_id}/comments', post_id, query=params, note='Downloading comments page %d' % page) + f'posts/{post_id}/comments', post_id, query=params, note=f'Downloading comments page {page}') cursor = None for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)): @@ -341,7 +420,7 @@ def _get_comments(self, post_id): class PatreonCampaignIE(PatreonBaseIE): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P<campaign_id>\d+))|(?P<vanity>[-\w]+))' + _VALID_URL = 
r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m|api/campaigns)/(?P<campaign_id>\d+)|(?P<vanity>[-\w]+))' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { @@ -363,30 +442,49 @@ class PatreonCampaignIE(PatreonBaseIE): 'url': 'https://www.patreon.com/m/4767637/posts', 'info_dict': { 'title': 'Not Just Bikes', - 'channel_follower_count': int, 'id': '4767637', 'channel_id': '4767637', 'channel_url': 'https://www.patreon.com/notjustbikes', - 'description': 'md5:595c6e7dca76ae615b1d38c298a287a1', + 'description': 'md5:9f4b70051216c4d5c58afe580ffc8d0f', 'age_limit': 0, 'channel': 'Not Just Bikes', 'uploader_url': 'https://www.patreon.com/notjustbikes', - 'uploader': 'Not Just Bikes', + 'uploader': 'Jason', 'uploader_id': '37306634', 'thumbnail': r're:^https?://.*$', }, - 'playlist_mincount': 71 + 'playlist_mincount': 71, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769/posts', + 'info_dict': { + 'title': 'Second Thought', + 'channel_follower_count': int, + 'id': '4243769', + 'channel_id': '4243769', + 'channel_url': 'https://www.patreon.com/secondthought', + 'description': 'md5:69c89a3aba43efdb76e85eb023e8de8b', + 'age_limit': 0, + 'channel': 'Second Thought', + 'uploader_url': 'https://www.patreon.com/secondthought', + 'uploader': 'JT Chapman', + 'uploader_id': '32718287', + 'thumbnail': r're:^https?://.*$', + }, + 'playlist_mincount': 201, }, { 'url': 'https://www.patreon.com/dissonancepod/posts', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://www.patreon.com/m/5932659', - 'only_matching': True + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return False if PatreonIE.suitable(url) else super(PatreonCampaignIE, cls).suitable(url) + return False if PatreonIE.suitable(url) else super().suitable(url) def _entries(self, campaign_id): cursor = None @@ -401,11 +499,11 @@ def _entries(self, campaign_id): for page in itertools.count(1): params.update({'page[cursor]': cursor} if cursor else {}) - posts_json = self._call_api('posts', campaign_id, query=params, note='Downloading posts page %d' % page) + posts_json = self._call_api('posts', campaign_id, query=params, note=f'Downloading posts page {page}') cursor = traverse_obj(posts_json, ('meta', 'pagination', 'cursors', 'next')) - for post in posts_json.get('data') or []: - yield self.url_result(url_or_none(traverse_obj(post, ('attributes', 'patreon_url'))), 'Patreon') + for post_url in traverse_obj(posts_json, ('data', ..., 'attributes', 'patreon_url')): + yield self.url_result(urljoin('https://www.patreon.com/', post_url), PatreonIE) if cursor is None: break @@ -415,13 +513,14 @@ def _real_extract(self, url): campaign_id, vanity = self._match_valid_url(url).group('campaign_id', 'vanity') if campaign_id is None: webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.USER_AGENT}) - campaign_id = self._search_regex(r'https://www.patreon.com/api/campaigns/(\d+)/?', webpage, 'Campaign ID') + campaign_id = self._search_nextjs_data( + webpage, vanity)['props']['pageProps']['bootstrapEnvelope']['pageBootstrap']['campaign']['data']['id'] params = { 'json-api-use-default-includes': 'false', 'fields[user]': 'full_name,url', 'fields[campaign]': 'name,summary,url,patron_count,creation_count,is_nsfw,avatar_photo_url', - 'include': 'creator' + 'include': 'creator', } campaign_response = self._call_api( diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 
5bdf561db9..686796491d 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -1,19 +1,19 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( + US_RATINGS, ExtractorError, determine_ext, - int_or_none, float_or_none, + int_or_none, js_to_json, orderedSet, strip_jsonp, strip_or_none, + traverse_obj, unified_strdate, url_or_none, - US_RATINGS, ) @@ -181,18 +181,18 @@ class PBSIE(InfoExtractor): ) IE_NAME = 'pbs' - IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1]) + IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS))[1])) _VALID_URL = r'''(?x)https?:// (?: # Direct video URL - (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) | + (?:{})/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) | # Article with embedded player (or direct video) - (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | + (?:www\.)?pbs\.org/(?:[^/]+/){{1,5}}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+) ) - ''' % '|'.join(list(zip(*_STATIONS))[0]) + '''.format('|'.join(next(zip(*_STATIONS)))) _GEO_COUNTRIES = ['US'] @@ -414,7 +414,7 @@ class PBSIE(InfoExtractor): { 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=', 'only_matching': True, - } + }, ] _ERRORS = { 101: 'We\'re sorry, but this video is not yet available.', @@ -517,7 +517,7 @@ def _extract_webpage(self, url): if not video_id: video_info = self._extract_video_data( player_page, 'video data', display_id) - video_id = compat_str( + video_id = str( video_info.get('id') or video_info['contentID']) else: video_id = mobj.group('id') @@ -538,7 +538,7 @@ def _real_extract(self, url): if isinstance(video_id, list): entries = [self.url_result( - 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) + f'http://video.pbs.org/video/{vid_id}', 'PBS', vid_id) for vid_id in video_id] return self.playlist_result(entries, display_id) @@ -567,11 +567,11 @@ def extract_redirect_urls(info): # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( - 'http://player.pbs.org/%s/%s' % (page, video_id), - display_id, 'Downloading %s page' % page, fatal=False) + f'http://player.pbs.org/{page}/{video_id}', + display_id, f'Downloading {page} page', fatal=False) if player: video_info = self._extract_video_data( - player, '%s video data' % page, display_id, fatal=False) + player, f'{page} video data', display_id, fatal=False) if video_info: extract_redirect_urls(video_info) if not info: @@ -602,7 +602,7 @@ def extract_redirect_urls(info): redirect_id = redirect.get('eeid') redirect_info = self._download_json( - '%s?format=json' % redirect['url'], display_id, + '{}?format=json'.format(redirect['url']), display_id, 'Downloading %s video url info' % (redirect_id or num), headers=self.geo_verification_headers()) @@ -613,7 +613,7 @@ def extract_redirect_urls(info): self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( - '%s said: %s' % (self.IE_NAME, message), expected=True) + f'{self.IE_NAME} said: {message}', expected=True) format_url = redirect_info.get('url') if not format_url: @@ -648,7 +648,7 @@ def extract_redirect_urls(info): f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) # This may produce invalid links sometimes (e.g. 
# http://www.pbs.org/wgbh/frontline/film/suicide-plan) - if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): + if not self._is_valid_url(f_url, display_id, f'http-{bitrate}k video'): continue f = m3u8_format.copy() f.update({ @@ -670,7 +670,7 @@ def extract_redirect_urls(info): captions = info.get('cc') or {} for caption_url in captions.values(): subtitles.setdefault('en', []).append({ - 'url': caption_url + 'url': caption_url, }) subtitles = self._merge_subtitles(subtitles, hls_subs) @@ -696,3 +696,61 @@ def extract_redirect_urls(info): 'subtitles': subtitles, 'chapters': chapters, } + + +class PBSKidsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://pbskids.org/video/molly-of-denali/3030407927', + 'md5': '1ded20a017cc6b53446238f1804ce4c7', + 'info_dict': { + 'id': '3030407927', + 'title': 'Bird in the Hand/Bye-Bye Birdie', + 'channel': 'molly-of-denali', + 'duration': 1540, + 'ext': 'mp4', + 'series': 'Molly of Denali', + 'description': 'md5:d006b2211633685d8ebc8d03b6d5611e', + 'categories': ['Episode'], + 'upload_date': '20190718', + }, + }, + { + 'url': 'https://pbskids.org/video/plum-landing/2365205059', + 'md5': '92e5d189851a64ae1d0237a965be71f5', + 'info_dict': { + 'id': '2365205059', + 'title': 'Cooper\'s Favorite Place in Nature', + 'channel': 'plum-landing', + 'duration': 67, + 'ext': 'mp4', + 'series': 'Plum Landing', + 'description': 'md5:657e5fc4356a84ead1c061eb280ff05d', + 'categories': ['Episode'], + 'upload_date': '20140302', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(meta, { + 'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}), + 'channel': ('show_slug', {str}), + 'description': ('video_obj', 'description', {str}), + 'duration': ('video_obj', 'duration', {int_or_none}), + 'series': ('video_obj', 'program_title', {str}), + 'title': ('video_obj', 'title', {str}), + 'upload_date': ('video_obj', 'air_date', {unified_strdate}), + }), + } diff --git a/yt_dlp/extractor/pearvideo.py b/yt_dlp/extractor/pearvideo.py index e27e5a7bac..4bea04bf2b 100644 --- a/yt_dlp/extractor/pearvideo.py +++ b/yt_dlp/extractor/pearvideo.py @@ -3,8 +3,8 @@ from .common import InfoExtractor from ..utils import ( qualities, - unified_timestamp, traverse_obj, + unified_timestamp, ) @@ -19,7 +19,7 @@ class PearVideoIE(InfoExtractor): 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', 'timestamp': 1494275280, 'upload_date': '20170508', - } + }, } def _real_extract(self, url): @@ -43,7 +43,7 @@ def _real_extract(self, url): query={'contId': video_id}, headers={'Referer': url}) formats = [{ 'format_id': k, - 'url': v.replace(info['systemTime'], f'cont-{video_id}') if k == 'srcUrl' else v + 'url': v.replace(info['systemTime'], f'cont-{video_id}') if k == 'srcUrl' else v, } for k, v in traverse_obj(info, ('videoInfo', 'videos'), default={}).items() if v] title = self._search_regex( diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index d1fc058b92..939c26dc7a 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ 
-146,7 +146,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'uploader': 'Brazzers', 'age_limit': 18, 'view_count': int, - 'age_limit': 18, 'categories': list, 'tags': list, }, @@ -158,7 +157,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'display_id': '47iUho33toY', 'ext': 'mp4', 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', - 'description': None, 'timestamp': 1507052209, 'upload_date': '20171003', 'thumbnail': r're:^https?://.*\.jpg$', @@ -177,7 +175,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'display_id': 'z3_7iwWCmqt', 'ext': 'mp4', 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', - 'description': None, 'timestamp': 1607470323, 'upload_date': '20201208', 'thumbnail': r're:^https?://.*\.jpg$', diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py index 68e15737b9..2b69c7e6cf 100644 --- a/yt_dlp/extractor/peertube.py +++ b/yt_dlp/extractor/peertube.py @@ -2,8 +2,8 @@ import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( + OnDemandPagedList, format_field, int_or_none, parse_resolution, @@ -12,643 +12,908 @@ unified_timestamp, url_or_none, urljoin, - OnDemandPagedList, ) class PeerTubeIE(InfoExtractor): _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances - 40two\.tube| - a\.metube\.ch| - advtv\.ml| - algorithmic\.tv| - alimulama\.com| - arcana\.fun| - archive\.vidicon\.org| - artefac-paris\.tv| - auf1\.eu| + 0ch\.tv| + 3dctube\.3dcandy\.social| + all\.electric\.kitchen| + alterscope\.fr| + anarchy\.tube| + apathy\.tv| + apertatube\.net| + archive\.nocopyrightintended\.tv| + archive\.reclaim\.tv| + area51\.media| + astrotube-ufe\.obspm\.fr| + astrotube\.obspm\.fr| + audio\.freediverse\.com| + azxtube\.youssefc\.tn| + bark\.video| battlepenguin\.video| - beertube\.epgn\.ch| - befree\.nohost\.me| + bava\.tv| + bee-tube\.fr| + beetoons\.tv| + biblion\.refchat\.net| + biblioteca\.theowlclub\.net| bideoak\.argia\.eus| - birkeundnymphe\.de| + bideoteka\.eus| + birdtu\.be| bitcointv\.com| - cattube\.org| - clap\.nerv-project\.eu| - climatejustice\.video| + bonn\.video| + breeze\.tube| + brioco\.live| + brocosoup\.fr| + canal\.facil\.services| + canard\.tube| + cdn01\.tilvids\.com| + celluloid-media\.huma-num\.fr| + chicago1\.peertube\.support| + cliptube\.org| + cloudtube\.ise\.fraunhofer\.de| comf\.tube| + comics\.peertube\.biz| + commons\.tube| + communitymedia\.video| conspiracydistillery\.com| + crank\.recoil\.org| + dalek\.zone| + dalliance\.network| + dangly\.parts| darkvapor\.nohost\.me| daschauher\.aksel\.rocks| digitalcourage\.video| - dreiecksnebel\.alex-detsch\.de| - eduvid\.org| + displayeurope\.video| + ds106\.tv| + dud-video\.inf\.tu-dresden\.de| + dud175\.inf\.tu-dresden\.de| + dytube\.com| + ebildungslabor\.video| evangelisch\.video| - exo\.tube| fair\.tube| + fedi\.video| + fedimovie\.com| fediverse\.tv| film\.k-prod\.fr| - flim\.txmn\.tk| + flipboard\.video| + foss\.video| + fossfarmers\.company| fotogramas\.politicaconciencia\.org| - ftsi\.ru| - gary\.vger\.cloud| - graeber\.video| + freediverse\.com| + freesoto-u2151\.vm\.elestio\.app| + freesoto\.tv| + garr\.tv| greatview\.video| grypstube\.uni-greifswald\.de| - highvoltage\.tv| - hpstube\.fr| - htp\.live| - hyperreal\.tube| + habratube\.site| + ilbjach\.ru| + infothema\.net| + itvplus\.iiens\.net| + johnydeep\.net| juggling\.digital| + jupiter\.tube| + kadras\.live| kino\.kompot\.si| kino\.schuerz\.at| kinowolnosc\.pl| kirche\.peertube-host\.de| + kiwi\.froggirl\.club| kodcast\.com| kolektiva\.media| - 
kraut\.zone| + kpop\.22x22\.ru| kumi\.tube| + la2\.peertube\.support| + la3\.peertube\.support| + la4\.peertube\.support| lastbreach\.tv| - lepetitmayennais\.fr\.nf| - lexx\.impa\.me| - libertynode\.tv| - libra\.syntazia\.org| - libremedia\.video| + lawsplaining\.peertube\.biz| + leopard\.tube| + live\.codinglab\.ch| live\.libratoi\.org| - live\.nanao\.moe| - live\.toobnix\.org| - livegram\.net| - lolitube\.freedomchan\.moe| + live\.oldskool\.fi| + live\.solari\.com| lucarne\.balsamine\.be| - maindreieck-tv\.de| - mani\.tube| - manicphase\.me| + luxtube\.lu| + makertube\.net| + media\.econoalchemist\.com| + media\.exo\.cat| media\.fsfe\.org| media\.gzevd\.de| - media\.inno3\.cricket| - media\.kaitaia\.life| + media\.interior\.edu\.uy| media\.krashboyz\.org| - media\.over-world\.org| - media\.skewed\.de| + media\.mzhd\.de| + media\.smz-ma\.de| + media\.theplattform\.net| media\.undeadnetwork\.de| + medias\.debrouillonet\.org| medias\.pingbase\.net| + mediatube\.fermalo\.fr| melsungen\.peertube-host\.de| - mirametube\.fr| - mojotube\.net| - monplaisirtube\.ddns\.net| + merci-la-police\.fr| + mindlyvideos\.com| + mirror\.peertube\.metalbanana\.net| + mirrored\.rocks| + mix\.video| mountaintown\.video| - my\.bunny\.cafe| - myfreetube\.de| + movies\.metricsmaster\.eu| + mtube\.mooo\.com| mytube\.kn-cloud\.de| + mytube\.le5emeaxe\.fr| mytube\.madzel\.de| - myworkoutarenapeertube\.cf| + nadajemy\.com| nanawel-peertube\.dyndns\.org| - nastub\.cz| - offenes\.tv| - orgdup\.media| - ovaltube\.codinglab\.ch| + neat\.tube| + nethack\.tv| + nicecrew\.tv| + nightshift\.minnix\.dev| + nolog\.media| + nyltube\.nylarea\.com| + ocfedtest\.hosted\.spacebear\.ee| + openmedia\.edunova\.it| p2ptv\.ru| p\.eertu\.be| p\.lu| + pastafriday\.club| + patriottube\.sonsofliberty\.red| + pcbu\.nl| peer\.azurs\.fr| - peertube1\.zeteo\.me| + peer\.d0g4\.me| + peer\.lukeog\.com| + peer\.madiator\.cloud| + peer\.raise-uav\.com| + peershare\.togart\.de| + peertube-blablalinux\.be| + peertube-demo\.learning-hub\.fr| + peertube-docker\.cpy\.re| + peertube-eu\.howlround\.com| + peertube-u5014\.vm\.elestio\.app| + peertube-us\.howlround\.com| peertube\.020\.pl| peertube\.0x5e\.eu| + peertube\.1984\.cz| + peertube\.2i2l\.net| + peertube\.adjutor\.xyz| + peertube\.adresse\.data\.gouv\.fr| peertube\.alpharius\.io| peertube\.am-networks\.fr| peertube\.anduin\.net| - peertube\.anzui\.dev| - peertube\.arbleizez\.bzh| + peertube\.anti-logic\.com| + peertube\.arch-linux\.cz| peertube\.art3mis\.de| - peertube\.atilla\.org| + peertube\.artsrn\.ualberta\.ca| + peertube\.askan\.info| + peertube\.astral0pitek\.synology\.me| peertube\.atsuchan\.page| - peertube\.aukfood\.net| - peertube\.aventer\.biz| + peertube\.automat\.click| peertube\.b38\.rural-it\.org| - peertube\.beeldengeluid\.nl| peertube\.be| + peertube\.beeldengeluid\.nl| peertube\.bgzashtita\.es| - peertube\.bitsandlinux\.com| + peertube\.bike| + peertube\.bildung-ekhn\.de| peertube\.biz| - peertube\.boba\.best| peertube\.br0\.fr| peertube\.bridaahost\.ynh\.fr| peertube\.bubbletea\.dev| peertube\.bubuit\.net| peertube\.cabaal\.net| - peertube\.cats-home\.net| - peertube\.chemnitz\.freifunk\.net| - peertube\.chevro\.fr| - peertube\.chrisspiegl\.com| + peertube\.chatinbit\.com| + peertube\.chaunchy\.com| + peertube\.chir\.rs| + peertube\.christianpacaud\.com| peertube\.chtisurel\.net| + peertube\.chuggybumba\.com| peertube\.cipherbliss\.com| + peertube\.cirkau\.art| + peertube\.cloud\.nerdraum\.de| peertube\.cloud\.sans\.pub| + peertube\.coko\.foundation| + 
peertube\.communecter\.org| + peertube\.concordia\.social| + peertube\.corrigan\.xyz| peertube\.cpge-brizeux\.fr| peertube\.ctseuro\.com| peertube\.cuatrolibertades\.org| - peertube\.cybercirujas\.club| - peertube\.cythin\.com| + peertube\.cube4fun\.net| + peertube\.dair-institute\.org| peertube\.davigge\.com| peertube\.dc\.pini\.fr| + peertube\.deadtom\.me| peertube\.debian\.social| + peertube\.delta0189\.xyz| peertube\.demonix\.fr| peertube\.designersethiques\.org| peertube\.desmu\.fr| - peertube\.devloprog\.org| peertube\.devol\.it| - peertube\.dtmf\.ca| - peertube\.ecologie\.bzh| + peertube\.dk| + peertube\.doesstuff\.social| + peertube\.eb8\.org| + peertube\.education-forum\.com| + peertube\.elforcer\.ru| + peertube\.em\.id\.lv| + peertube\.ethibox\.fr| peertube\.eu\.org| peertube\.european-pirates\.eu| + peertube\.eus| peertube\.euskarabildua\.eus| + peertube\.expi\.studio| + peertube\.familie-berner\.de| + peertube\.familleboisteau\.fr| + peertube\.fedihost\.website| peertube\.fenarinarsa\.com| - peertube\.fomin\.site| - peertube\.forsud\.be| - peertube\.francoispelletier\.org| - peertube\.freenet\.ru| - peertube\.freetalklive\.com| + peertube\.festnoz\.de| + peertube\.forteza\.fr| + peertube\.freestorm\.online| peertube\.functional\.cafe| - peertube\.gardeludwig\.fr| + peertube\.gaminglinux\.fr| peertube\.gargantia\.fr| - peertube\.gcfamily\.fr| + peertube\.geekgalaxy\.fr| + peertube\.gemlog\.ca| peertube\.genma\.fr| peertube\.get-racing\.de| + peertube\.ghis94\.ovh| peertube\.gidikroon\.eu| - peertube\.gruezishop\.ch| - peertube\.habets\.house| - peertube\.hackerfraternity\.org| + peertube\.giftedmc\.com| + peertube\.grosist\.fr| + peertube\.gruntwerk\.org| + peertube\.gsugambit\.com| + peertube\.hackerfoo\.com| + peertube\.hellsite\.net| + peertube\.helvetet\.eu| + peertube\.histoirescrepues\.fr| + peertube\.home\.x0r\.fr| + peertube\.hyperfreedom\.org| peertube\.ichigo\.everydayimshuflin\.com| - peertube\.ignifi\.me| + peertube\.ifwo\.eu| + peertube\.in\.ua| peertube\.inapurna\.org| peertube\.informaction\.info| peertube\.interhop\.org| - peertube\.iselfhost\.com| peertube\.it| + peertube\.it-arts\.net| peertube\.jensdiemer\.de| - peertube\.joffreyverd\.fr| + peertube\.johntheserg\.al| + peertube\.kaleidos\.net| peertube\.kalua\.im| - peertube\.kathryl\.fr| + peertube\.kcore\.org| peertube\.keazilla\.net| peertube\.klaewyss\.fr| - peertube\.kodcast\.com| + peertube\.kleph\.eu| + peertube\.kodein\.be| + peertube\.kooperatywa\.tech| + peertube\.kriom\.net| peertube\.kx\.studio| + peertube\.kyriog\.eu| + peertube\.la-famille-muller\.fr| + peertube\.labeuropereunion\.eu| peertube\.lagvoid\.com| - peertube\.lavallee\.tech| - peertube\.le5emeaxe\.fr| - peertube\.lestutosdeprocessus\.fr| - peertube\.librenet\.co\.za| + peertube\.lhc\.net\.br| + peertube\.libresolutions\.network| + peertube\.libretic\.fr| + peertube\.librosphere\.fr| peertube\.logilab\.fr| + peertube\.lon\.tv| peertube\.louisematic\.site| peertube\.luckow\.org| peertube\.luga\.at| peertube\.lyceeconnecte\.fr| - peertube\.manalejandro\.com| + peertube\.madixam\.xyz| + peertube\.magicstone\.dev| + peertube\.marienschule\.de| peertube\.marud\.fr| - peertube\.mattone\.net| peertube\.maxweiss\.io| + peertube\.miguelcr\.me| + peertube\.mikemestnik\.net| + peertube\.mobilsicher\.de| peertube\.monlycee\.net| peertube\.mxinfo\.fr| - peertube\.myrasp\.eu| - peertube\.nebelcloud\.de| + peertube\.naln1\.ca| peertube\.netzbegruenung\.de| - peertube\.newsocial\.tech| peertube\.nicolastissot\.fr| + peertube\.nogafam\.fr| + 
peertube\.normalgamingcommunity\.cz| peertube\.nz| peertube\.offerman\.com| + peertube\.ohioskates\.com| + peertube\.onionstorm\.net| peertube\.opencloud\.lu| - peertube\.orthus\.link| - peertube\.patapouf\.xyz| - peertube\.pi2\.dev| - peertube\.plataformess\.org| - peertube\.pl| - peertube\.portaesgnos\.org| + peertube\.otakufarms\.com| + peertube\.paladyn\.org| + peertube\.pix-n-chill\.fr| peertube\.r2\.enst\.fr| peertube\.r5c3\.fr| - peertube\.radres\.xyz| - peertube\.red| - peertube\.robonomics\.network| - peertube\.rtnkv\.cloud| - peertube\.runfox\.tk| + peertube\.redpill-insight\.com| + peertube\.researchinstitute\.at| + peertube\.revelin\.fr| + peertube\.rlp\.schule| + peertube\.rokugan\.fr| + peertube\.rougevertbleu\.tv| + peertube\.roundpond\.net| + peertube\.rural-it\.org| peertube\.satoshishop\.de| - peertube\.scic-tetris\.org| + peertube\.scyldings\.com| peertube\.securitymadein\.lu| + peertube\.semperpax\.com| peertube\.semweb\.pro| - peertube\.social\.my-wan\.de| - peertube\.soykaf\.org| - peertube\.stefofficiel\.me| + peertube\.sensin\.eu| + peertube\.sidh\.bzh| + peertube\.skorpil\.cz| + peertube\.smertrios\.com| + peertube\.sqweeb\.net| + peertube\.stattzeitung\.org| peertube\.stream| peertube\.su| peertube\.swrs\.net| peertube\.takeko\.cyou| - peertube\.tangentfox\.com| peertube\.taxinachtegel\.de| - peertube\.thenewoil\.xyz| + peertube\.teftera\.com| + peertube\.teutronic-services\.de| peertube\.ti-fr\.com| peertube\.tiennot\.net| - peertube\.troback\.com| + peertube\.tmp\.rcp\.tf| peertube\.tspu\.edu\.ru| - peertube\.tux\.ovh| peertube\.tv| peertube\.tweb\.tv| - peertube\.ucy\.de| peertube\.underworld\.fr| - peertube\.us\.to| - peertube\.ventresmous\.fr| + peertube\.vapronva\.pw| + peertube\.veen\.world| + peertube\.vesdia\.eu| + peertube\.virtual-assembly\.org| + peertube\.viviers-fibre\.net| peertube\.vlaki\.cz| - peertube\.w\.utnw\.de| - peertube\.westring\.digital| + peertube\.wiesbaden\.social| + peertube\.wivodaim\.net| + peertube\.wtf| + peertube\.wtfayla\.net| + peertube\.xrcb\.cat| peertube\.xwiki\.com| + peertube\.zd\.do| + peertube\.zetamc\.net| + peertube\.zmuuf\.org| peertube\.zoz-serv\.org| + peertube\.zwindler\.fr| peervideo\.ru| periscope\.numenaute\.org| - perron-tube\.de| + pete\.warpnine\.de| petitlutinartube\.fr| phijkchu\.com| - pierre\.tube| + phoenixproject\.group| piraten\.space| - play\.rosano\.ca| + pirtube\.calut\.fr| + pityu\.flaki\.hu| + play\.mittdata\.se| player\.ojamajo\.moe| - plextube\.nl| - pocketnetpeertube1\.nohost\.me| - pocketnetpeertube3\.nohost\.me| - pocketnetpeertube4\.nohost\.me| - pocketnetpeertube5\.nohost\.me| - pocketnetpeertube6\.nohost\.me| - pt\.24-7\.ro| - pt\.apathy\.top| + podlibre\.video| + portal\.digilab\.nfa\.cz| + private\.fedimovie\.com| + pt01\.lehrerfortbildung-bw\.de| pt\.diaspodon\.fr| - pt\.fedi\.tech| - pt\.maciej\.website| + pt\.freedomwolf\.cc| + pt\.gordons\.gen\.nz| + pt\.ilyamikcoder\.com| + pt\.irnok\.net| + pt\.mezzo\.moe| + pt\.na4\.eu| + pt\.netcraft\.ch| + pt\.rwx\.ch| + pt\.sfunk1x\.com| + pt\.thishorsie\.rocks| + pt\.vern\.cc| ptb\.lunarviews\.net| - ptmir1\.inter21\.net| - ptmir2\.inter21\.net| - ptmir3\.inter21\.net| - ptmir4\.inter21\.net| - ptmir5\.inter21\.net| - ptube\.horsentiers\.fr| - ptube\.xmanifesto\.club| - queermotion\.org| - re-wizja\.re-medium\.com| - regarder\.sans\.pub| - ruraletv\.ovh| - s1\.gegenstimme\.tv| - s2\.veezee\.tube| + ptube\.de| + ptube\.ranranhome\.info| + puffy\.tube| + puppet\.zone| + qtube\.qlyoung\.net| + quantube\.win| + rankett\.net| + replay\.jres\.org| + 
review\.peertube\.biz| sdmtube\.fr| - sender-fm\.veezee\.tube| - serv1\.wiki-tube\.de| + secure\.direct-live\.net| + secure\.scanovid\.com| + seka\.pona\.la| serv3\.wiki-tube\.de| - sickstream\.net| - sleepy\.tube| + skeptube\.fr| + social\.fedimovie\.com| + socpeertube\.ru| sovran\.video| + special\.videovortex\.tv| spectra\.video| + stl1988\.peertube-host\.de| + stream\.biovisata\.lt| + stream\.conesphere\.cloud| stream\.elven\.pw| + stream\.jurnalfm\.md| stream\.k-prod\.fr| - stream\.shahab\.nohost\.me| - streamsource\.video| + stream\.litera\.tools| + stream\.nuemedia\.se| + stream\.rlp-media\.de| + stream\.vrse\.be| studios\.racer159\.com| - testtube\.florimond\.eu| + styxhexenhammer666\.com| + syrteplay\.obspm\.fr| + t\.0x0\.st| + tbh\.co-shaoghal\.net| + test-fab\.ynh\.fr| + testube\.distrilab\.fr| tgi\.hosted\.spacebear\.ee| - thaitube\.in\.th| - the\.jokertv\.eu| theater\.ethernia\.net| thecool\.tube| + thevideoverse\.com| tilvids\.com| - toob\.bub\.org| - tpaw\.video| - truetube\.media| - tuba\.lhub\.pl| - tube-aix-marseille\.beta\.education\.fr| - tube-amiens\.beta\.education\.fr| - tube-besancon\.beta\.education\.fr| - tube-bordeaux\.beta\.education\.fr| - tube-clermont-ferrand\.beta\.education\.fr| - tube-corse\.beta\.education\.fr| - tube-creteil\.beta\.education\.fr| - tube-dijon\.beta\.education\.fr| - tube-education\.beta\.education\.fr| - tube-grenoble\.beta\.education\.fr| - tube-lille\.beta\.education\.fr| - tube-limoges\.beta\.education\.fr| - tube-montpellier\.beta\.education\.fr| - tube-nancy\.beta\.education\.fr| - tube-nantes\.beta\.education\.fr| - tube-nice\.beta\.education\.fr| - tube-normandie\.beta\.education\.fr| - tube-orleans-tours\.beta\.education\.fr| - tube-outremer\.beta\.education\.fr| - tube-paris\.beta\.education\.fr| - tube-poitiers\.beta\.education\.fr| - tube-reims\.beta\.education\.fr| - tube-rennes\.beta\.education\.fr| - tube-strasbourg\.beta\.education\.fr| - tube-toulouse\.beta\.education\.fr| - tube-versailles\.beta\.education\.fr| - tube1\.it\.tuwien\.ac\.at| + tinkerbetter\.tube| + tinsley\.video| + trailers\.ddigest\.com| + tube-action-educative\.apps\.education\.fr| + tube-arts-lettres-sciences-humaines\.apps\.education\.fr| + tube-cycle-2\.apps\.education\.fr| + tube-cycle-3\.apps\.education\.fr| + tube-education-physique-et-sportive\.apps\.education\.fr| + tube-enseignement-professionnel\.apps\.education\.fr| + tube-institutionnel\.apps\.education\.fr| + tube-langues-vivantes\.apps\.education\.fr| + tube-maternelle\.apps\.education\.fr| + tube-numerique-educatif\.apps\.education\.fr| + tube-sciences-technologies\.apps\.education\.fr| + tube-test\.apps\.education\.fr| + tube1\.perron-service\.de| + tube\.9minuti\.it| tube\.abolivier\.bzh| - tube\.ac-amiens\.fr| - tube\.aerztefueraufklaerung\.de| - tube\.alexx\.ml| + tube\.alado\.space| tube\.amic37\.fr| - tube\.anufrij\.de| - tube\.apolut\.net| - tube\.arkhalabs\.io| + tube\.area404\.cloud| tube\.arthack\.nz| - tube\.as211696\.net| - tube\.avensio\.de| + tube\.asulia\.fr| + tube\.awkward\.company| tube\.azbyka\.ru| tube\.azkware\.net| - tube\.bachaner\.fr| - tube\.bmesh\.org| - tube\.borked\.host| + tube\.bartrip\.me\.uk| + tube\.belowtoxic\.media| + tube\.bingle\.plus| + tube\.bit-friends\.de| tube\.bstly\.de| - tube\.chaoszone\.tv| - tube\.chatelet\.ovh| - tube\.cloud-libre\.eu| + tube\.chosto\.me| tube\.cms\.garden| - tube\.cowfee\.moe| - tube\.cryptography\.dog| - tube\.darknight-coffee\.org| - tube\.dev\.lhub\.pl| + tube\.communia\.org| + tube\.cyberia\.club| + 
tube\.cybershock\.life| + tube\.dembased\.xyz| + tube\.dev\.displ\.eu| + tube\.digitalesozialearbeit\.de| tube\.distrilab\.fr| + tube\.doortofreedom\.org| tube\.dsocialize\.net| + tube\.e-jeremy\.com| tube\.ebin\.club| + tube\.elemac\.fr| + tube\.erzbistum-hamburg\.de| + tube\.exozy\.me| tube\.fdn\.fr| - tube\.florimond\.eu| - tube\.foxarmy\.ml| - tube\.foxden\.party| - tube\.frischesicht\.de| + tube\.fedi\.quebec| + tube\.fediverse\.at| + tube\.felinn\.org| + tube\.flokinet\.is| + tube\.foad\.me\.uk| + tube\.freepeople\.fr| + tube\.friloux\.me| + tube\.froth\.zone| + tube\.fulda\.social| tube\.futuretic\.fr| - tube\.gnous\.eu| + tube\.g1zm0\.de| + tube\.g4rf\.net| + tube\.gaiac\.io| + tube\.geekyboo\.net| + tube\.genb\.de| + tube\.ghk-academy\.info| + tube\.gi-it\.de| tube\.grap\.coop| tube\.graz\.social| tube\.grin\.hu| - tube\.hackerscop\.org| - tube\.hordearii\.fr| + tube\.hokai\.lol| + tube\.int5\.net| + tube\.interhacker\.space| + tube\.invisible\.ch| + tube\.io18\.top| + tube\.itsg\.host| tube\.jeena\.net| - tube\.kai-stuht\.com| + tube\.kh-berlin\.de| tube\.kockatoo\.org| tube\.kotur\.org| + tube\.koweb\.fr| + tube\.la-dina\.net| + tube\.lab\.nrw| tube\.lacaveatonton\.ovh| + tube\.laurent-malys\.fr| + tube\.leetdreams\.ch| tube\.linkse\.media| tube\.lokad\.com| tube\.lucie-philou\.com| - tube\.melonbread\.xyz| - tube\.mfraters\.net| - tube\.motuhake\.xyz| - tube\.mrbesen\.de| - tube\.nah\.re| - tube\.nchoco\.net| + tube\.media-techport\.de| + tube\.morozoff\.pro| + tube\.neshweb\.net| + tube\.nestor\.coop| + tube\.network\.europa\.eu| + tube\.nicfab\.eu| + tube\.nieuwwestbrabant\.nl| + tube\.nogafa\.org| tube\.novg\.net| tube\.nox-rhea\.org| tube\.nuagelibre\.fr| + tube\.numerique\.gouv\.fr| + tube\.nuxnik\.com| tube\.nx12\.net| tube\.octaplex\.net| - tube\.odat\.xyz| tube\.oisux\.org| + tube\.okcinfo\.news| + tube\.onlinekirche\.net| tube\.opportunis\.me| + tube\.oraclefilms\.com| tube\.org\.il| - tube\.ortion\.xyz| - tube\.others\.social| + tube\.pacapime\.ovh| + tube\.parinux\.org| + tube\.pastwind\.top| tube\.picasoft\.net| - tube\.plomlompom\.com| + tube\.pilgerweg-21\.de| tube\.pmj\.rocks| + tube\.pol\.social| + tube\.ponsonaille\.fr| tube\.portes-imaginaire\.org| + tube\.public\.apolut\.net| + tube\.pustule\.org| tube\.pyngu\.com| + tube\.querdenken-711\.de| tube\.rebellion\.global| + tube\.reseau-canope\.fr| tube\.rhythms-of-resistance\.org| - tube\.rita\.moe| + tube\.risedsky\.ovh| + tube\.rooty\.fr| tube\.rsi\.cnr\.it| - tube\.s1gm4\.eu| - tube\.saumon\.io| + tube\.ryne\.moe| tube\.schleuss\.online| tube\.schule\.social| - tube\.seditio\.fr| + tube\.sekretaerbaer\.net| tube\.shanti\.cafe| tube\.shela\.nu| tube\.skrep\.in| + tube\.sleeping\.town| tube\.sp-codes\.de| - tube\.sp4ke\.com| - tube\.superseriousbusiness\.org| + tube\.spdns\.org| + tube\.systerserver\.net| tube\.systest\.eu| tube\.tappret\.fr| - tube\.tardis\.world| - tube\.toontoet\.nl| + tube\.techeasy\.org| + tube\.thierrytalbert\.fr| + tube\.tinfoil-hat\.net| + tube\.toldi\.eu| tube\.tpshd\.de| + tube\.trax\.im| tube\.troopers\.agency| + tube\.ttk\.is| + tube\.tuxfriend\.fr| tube\.tylerdavis\.xyz| + tube\.ullihome\.de| + tube\.ulne\.be| tube\.undernet\.uy| - tube\.vigilian-consulting\.nl| - tube\.vraphim\.com| - tube\.wehost\.lgbt| - tube\.wien\.rocks| + tube\.vrpnet\.org| tube\.wolfe\.casa| tube\.xd0\.de| + tube\.xn--baw-joa\.social| tube\.xy-space\.de| tube\.yapbreak\.fr| tubedu\.org| - tubes\.jodh\.us| - tuktube\.com| - turkum\.me| + tubulus\.openlatin\.org| + turtleisland\.video| tututu\.tube| - 
tuvideo\.encanarias\.info| - tv1\.cocu\.cc| - tv1\.gomntu\.space| - tv2\.cocu\.cc| + tv\.adast\.dk| tv\.adn\.life| + tv\.arns\.lt| tv\.atmx\.ca| - tv\.bitma\.st| - tv\.generallyrubbish\.net\.au| + tv\.based\.quest| + tv\.farewellutopia\.com| + tv\.filmfreedom\.net| + tv\.gravitons\.org| + tv\.io\.seg\.br| tv\.lumbung\.space| - tv\.mattchristiansenmedia\.com| - tv\.netwhood\.online| - tv\.neue\.city| - tv\.piejacker\.net| tv\.pirateradio\.social| + tv\.pirati\.cz| + tv\.santic-zombie\.ru| tv\.undersco\.re| + tv\.zonepl\.net| tvox\.ru| twctube\.twc-zone\.eu| - unfilter\.tube| + twobeek\.com| + urbanists\.video| + v\.9tail\.net| v\.basspistol\.org| + v\.j4\.lc| v\.kisombrella\.top| - v\.lastorder\.xyz| + v\.koa\.im| + v\.kyaru\.xyz| v\.lor\.sh| - v\.phreedom\.club| - v\.sil\.sh| - v\.szy\.io| - v\.xxxapex\.com| - veezee\.tube| - vid\.dascoyote\.xyz| - vid\.garwood\.io| - vid\.ncrypt\.at| - vid\.pravdastalina\.info| - vid\.qorg11\.net| - vid\.rajeshtaylor\.com| - vid\.samtripoli\.com| - vid\.werefox\.dev| + v\.mkp\.ca| + v\.posm\.gay| + v\.slaycer\.top| + veedeo\.org| + vhs\.absturztau\.be| + vid\.cthos\.dev| + vid\.kinuseka\.us| + vid\.mkp\.ca| + vid\.nocogabriel\.fr| + vid\.norbipeti\.eu| + vid\.northbound\.online| + vid\.ohboii\.de| + vid\.plantplotting\.co\.uk| + vid\.pretok\.tv| + vid\.prometheus\.systems| + vid\.soafen\.love| + vid\.twhtv\.club| vid\.wildeboer\.net| video-cave-v2\.de| + video-liberty\.com| video\.076\.ne\.jp| video\.1146\.nohost\.me| - video\.altertek\.org| + video\.9wd\.eu| + video\.abraum\.de| + video\.ados\.accoord\.fr| + video\.amiga-ng\.org| video\.anartist\.org| - video\.apps\.thedoodleproject\.net| - video\.artist\.cx| video\.asgardius\.company| - video\.balsillie\.net| + video\.audiovisuel-participatif\.org| video\.bards\.online| - video\.binarydad\.com| + video\.barkoczy\.social| + video\.benetou\.fr| + video\.beyondwatts\.social| + video\.bgeneric\.net| + video\.bilecik\.edu\.tr| video\.blast-info\.fr| + video\.bmu\.cloud| video\.catgirl\.biz| + video\.causa-arcana\.com| + video\.chasmcity\.net| + video\.chbmeyer\.de| video\.cigliola\.com| - video\.cm-en-transition\.fr| + video\.citizen4\.eu| + video\.clumsy\.computer| + video\.cnnumerique\.fr| + video\.cnr\.it| video\.cnt\.social| video\.coales\.co| - video\.codingfield\.com| - video\.comptoir\.net| video\.comune\.trento\.it| - video\.cpn\.so| + video\.coyp\.us| video\.csc49\.fr| - video\.cybre\.town| - video\.demokratischer-sommer\.de| - video\.discord-insoumis\.fr| - video\.dolphincastle\.com| + video\.davduf\.net| + video\.davejansen\.com| + video\.dlearning\.nl| + video\.dnfi\.no| video\.dresden\.network| - video\.ecole-89\.com| - video\.elgrillolibertario\.org| + video\.drgnz\.club| + video\.dudenas\.lt| + video\.eientei\.org| + video\.ellijaymakerspace\.org| video\.emergeheart\.info| video\.eradicatinglove\.xyz| - video\.ethantheenigma\.me| - video\.exodus-privacy\.eu\.org| - video\.fbxl\.net| + video\.everythingbagel\.me| + video\.extremelycorporate\.ca| + video\.fabiomanganiello\.com| + video\.fedi\.bzh| video\.fhtagn\.org| - video\.greenmycity\.eu| - video\.guerredeclasse\.fr| + video\.firehawk-systems\.com| + video\.fox-romka\.ru| + video\.fuss\.bz\.it| + video\.glassbeadcollective\.org| + video\.graine-pdl\.org| video\.gyt\.is| - video\.hackers\.town| + video\.hainry\.fr| video\.hardlimit\.com| - video\.hooli\.co| + video\.hostux\.net| video\.igem\.org| + video\.infojournal\.fr| video\.internet-czas-dzialac\.pl| + video\.interru\.io| + video\.ipng\.ch| + video\.ironsysadmin\.com| 
video\.islameye\.com| - video\.kicik\.fr| + video\.jacen\.moe| + video\.jadin\.me| + video\.jeffmcbride\.net| + video\.jigmedatse\.com| video\.kuba-orlik\.name| - video\.kyushojitsu\.ca| + video\.lacalligramme\.fr| + video\.lanceurs-alerte\.fr| + video\.laotra\.red| + video\.lapineige\.fr| + video\.laraffinerie\.re| video\.lavolte\.net| - video\.lespoesiesdheloise\.fr| video\.liberta\.vip| - video\.liege\.bike| + video\.libreti\.net| + video\.licentia\.net| video\.linc\.systems| video\.linux\.it| video\.linuxtrent\.it| - video\.lokal\.social| + video\.liveitlive\.show| video\.lono\.space| - video\.lunasqu\.ee| + video\.lrose\.de| + video\.lunago\.net| video\.lundi\.am| + video\.lycee-experimental\.org| + video\.maechler\.cloud| video\.marcorennmaus\.de| video\.mass-trespass\.uk| + video\.matomocamp\.org| + video\.medienzentrum-harburg\.de| + video\.mentality\.rip| + video\.metaversum\.wtf| + video\.midreality\.com| + video\.mttv\.it| video\.mugoreve\.fr| - video\.mundodesconocido\.com| + video\.mxtthxw\.art| video\.mycrowd\.ca| + video\.niboe\.info| video\.nogafam\.es| - video\.odayacres\.farm| + video\.nstr\.no| + video\.occm\.cc| + video\.off-investigation\.fr| + video\.olos311\.org| + video\.ordinobsolete\.fr| + video\.osvoj\.ru| + video\.ourcommon\.cloud| video\.ozgurkon\.org| - video\.p1ng0ut\.social| - video\.p3x\.de| video\.pcf\.fr| - video\.pony\.gallery| - video\.potate\.space| - video\.pourpenser\.pro| - video\.progressiv\.dev| + video\.pcgaldo\.com| + video\.phyrone\.de| + video\.poul\.org| + video\.publicspaces\.net| + video\.pullopen\.xyz| + video\.r3s\.nrw| + video\.rainevixen\.com| video\.resolutions\.it| - video\.rw501\.de| - video\.screamer\.wiki| - video\.sdm-tools\.net| + video\.retroedge\.tech| + video\.rhizome\.org| + video\.rlp-media\.de| + video\.rs-einrich\.de| + video\.rubdos\.be| + video\.sadmin\.io| video\.sftblw\.moe| video\.shitposter\.club| - video\.skyn3t\.in| + video\.simplex-software\.ru| + video\.slipfox\.xyz| + video\.snug\.moe| + video\.software-fuer-engagierte\.de| video\.soi\.ch| - video\.stuartbrand\.co\.uk| + video\.sonet\.ws| + video\.surazal\.net| + video\.taskcards\.eu| + video\.team-lcbs\.eu| + video\.techforgood\.social| + video\.telemillevaches\.net| + video\.thepolarbear\.co\.uk| video\.thinkof\.name| - video\.toot\.pt| + video\.tii\.space| + video\.tkz\.es| + video\.trankil\.info| video\.triplea\.fr| + video\.tum\.social| video\.turbo\.chat| + video\.uriopss-pdl\.fr| + video\.ustim\.ru| + video\.ut0pia\.org| video\.vaku\.org\.ua| + video\.vegafjord\.me| video\.veloma\.org| video\.violoncello\.ch| - video\.wilkie\.how| - video\.wsf2021\.info| - videorelay\.co| + video\.voidconspiracy\.band| + video\.wakkeren\.nl| + video\.windfluechter\.org| + video\.ziez\.eu| videos-passages\.huma-num\.fr| - videos\.3d-wolf\.com| + videos\.aadtp\.be| videos\.ahp-numerique\.fr| - videos\.alexandrebadalo\.pt| + videos\.alamaisondulibre\.org| videos\.archigny\.net| + videos\.aroaduntraveled\.com| + videos\.b4tech\.org| videos\.benjaminbrady\.ie| - videos\.buceoluegoexisto\.com| - videos\.capas\.se| - videos\.casually\.cat| + videos\.bik\.opencloud\.lu| videos\.cloudron\.io| + videos\.codingotaku\.com| videos\.coletivos\.org| + videos\.collate\.social| videos\.danksquad\.org| - videos\.denshi\.live| - videos\.fromouter\.space| + videos\.digitaldragons\.eu| + videos\.dromeadhere\.fr| + videos\.explain-it\.org| + videos\.factsonthegroundshow\.com| + videos\.foilen\.com| videos\.fsci\.in| + videos\.gamercast\.net| + videos\.gianmarco\.gg| videos\.globenet\.org| + 
videos\.grafo\.zone| videos\.hauspie\.fr| videos\.hush\.is| + videos\.hyphalfusion\.network| + videos\.icum\.to| + videos\.im\.allmendenetz\.de| + videos\.jacksonchen666\.com| videos\.john-livingston\.fr| - videos\.jordanwarne\.xyz| - videos\.lavoixdessansvoix\.org| + videos\.knazarov\.com| + videos\.kuoushi\.com| + videos\.laliguepaysdelaloire\.org| + videos\.lemouvementassociatif-pdl\.org| videos\.leslionsfloorball\.fr| - videos\.lucero\.top| - videos\.martyn\.berlin| + videos\.librescrum\.org| videos\.mastodont\.cat| - videos\.monstro1\.com| - videos\.npo\.city| - videos\.optoutpod\.com| - videos\.petch\.rocks| - videos\.pzelawski\.xyz| + videos\.metus\.ca| + videos\.miolo\.org| + videos\.offroad\.town| + videos\.openmandriva\.org| + videos\.parleur\.net| + videos\.pcorp\.us| + videos\.pop\.eu\.com| videos\.rampin\.org| + videos\.rauten\.co\.za| + videos\.ritimo\.org| + videos\.sarcasmstardust\.com| videos\.scanlines\.xyz| videos\.shmalls\.pw| - videos\.sibear\.fr| videos\.stadtfabrikanten\.org| - videos\.tankernn\.eu| + videos\.supertuxkart\.net| videos\.testimonia\.org| - videos\.thisishowidontdisappear\.com| - videos\.traumaheilung\.net| + videos\.thinkerview\.com| + videos\.torrenezzi10\.xyz| videos\.trom\.tf| - videos\.wakkerewereld\.nu| - videos\.weblib\.re| + videos\.utsukta\.org| + videos\.viorsan\.com| + videos\.wherelinux\.xyz| + videos\.wikilibriste\.fr| videos\.yesil\.club| + videos\.yeswiki\.net| + videotube\.duckdns\.org| + vids\.capypara\.de| vids\.roshless\.me| + vids\.stary\.pc\.pl| vids\.tekdmn\.me| - vidz\.dou\.bet| - vod\.lumikko\.dev| - vs\.uniter\.network| + vidz\.julien\.ovh| + views\.southfox\.me| + virtual-girls-are\.definitely-for\.me| + viste\.pt| + vnchich\.com| + vnop\.org| + vod\.newellijay\.tv| + voluntarytube\.com| + vtr\.chikichiki\.tube| vulgarisation-informatique\.fr| - watch\.breadtube\.tv| - watch\.deranalyst\.ch| + watch\.easya\.solutions| + watch\.goodluckgabe\.life| watch\.ignorance\.eu| - watch\.krazy\.party| + watch\.jimmydore\.com| watch\.libertaria\.space| - watch\.rt4mn\.org| - watch\.softinio\.com| + watch\.nuked\.social| + watch\.ocaml\.org| + watch\.thelema\.social| watch\.tubelab\.video| web-fellow\.de| webtv\.vandoeuvre\.net| - wechill\.space| + wetubevid\.online| wikileaks\.video| wiwi\.video| - worldofvids\.com| - wwtube\.net| - www4\.mir\.inter21\.net| - www\.birkeundnymphe\.de| - www\.captain-german\.com| - www\.wiki-tube\.de| + wow\.such\.disappointment\.fail| + www\.jvideos\.net| + www\.kotikoff\.net| + www\.makertube\.net| + www\.mypeer\.tube| + www\.nadajemy\.com| + www\.neptube\.io| + www\.rocaguinarda\.tv| + www\.vnshow\.net| xxivproduction\.video| - xxx\.noho\.st| + yt\.orokoro\.ru| + ytube\.retronerd\.at| + zumvideo\.de| # from youtube-dl peertube\.rainbowswingers\.net| @@ -1050,13 +1315,13 @@ class PeerTubeIE(InfoExtractor): )''' _UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _API_BASE = 'https://%s/api/v1/videos/%s/%s' - _VALID_URL = r'''(?x) + _VALID_URL = rf'''(?x) (?: peertube:(?P<host>[^:]+):| - https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos|w)/ + https?://(?P<host_2>{_INSTANCES_RE})/(?:videos/(?:watch|embed)|api/v\d/videos|w)/ ) - (?P<id>%s) - ''' % (_INSTANCES_RE, _UUID_RE) + (?P<id>{_UUID_RE}) + ''' _EMBED_REGEX = [r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//{_INSTANCES_RE}/videos/embed/{cls._UUID_RE})'''] _TESTS = [{ 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', @@ -1083,7 +1348,7 @@ class 
PeerTubeIE(InfoExtractor): 'dislike_count': int, 'tags': ['framasoft', 'peertube'], 'categories': ['Science & Technology'], - } + }, }, { 'url': 'https://peertube2.cpy.re/w/122d093a-1ede-43bd-bd34-59d2931ffc5e', 'info_dict': { @@ -1094,7 +1359,7 @@ class PeerTubeIE(InfoExtractor): 'timestamp': 1589276219, 'upload_date': '20200512', 'uploader': 'chocobozzz', - } + }, }, { 'url': 'https://peertube2.cpy.re/w/3fbif9S3WmtTP8gGsC5HBd', 'info_dict': { @@ -1128,7 +1393,7 @@ class PeerTubeIE(InfoExtractor): 'timestamp': 1587401293, 'upload_date': '20200420', 'uploader': 'Drew DeVault', - } + }, }, { 'url': 'https://peertube.debian.social/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'only_matching': True, @@ -1150,14 +1415,13 @@ class PeerTubeIE(InfoExtractor): @staticmethod def _extract_peertube_url(webpage, source_url): mobj = re.match( - r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|w)/(?P<id>%s)' - % PeerTubeIE._UUID_RE, source_url) + rf'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|w)/(?P<id>{PeerTubeIE._UUID_RE})', source_url) if mobj and any(p in webpage for p in ( 'meta property="og:platform" content="PeerTube"', '<title>PeerTube<', 'There will be other non JS-based clients to access PeerTube', '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): - return 'peertube:%s:%s' % mobj.group('host', 'id') + return 'peertube:{}:{}'.format(*mobj.group('host', 'id')) @classmethod def _extract_embed_urls(cls, url, webpage): @@ -1185,8 +1449,8 @@ def _get_subtitles(self, host, video_id): return subtitles = {} for e in data: - language_id = try_get(e, lambda x: x['language']['id'], compat_str) - caption_url = urljoin('https://%s' % host, e.get('captionPath')) + language_id = try_get(e, lambda x: x['language']['id'], str) + caption_url = urljoin(f'https://{host}', e.get('captionPath')) if not caption_url: continue subtitles.setdefault(language_id or 'en', []).append({ @@ -1204,11 +1468,15 @@ def _real_extract(self, url): title = video['name'] - formats = [] + formats, is_live = [], False files = video.get('files') or [] for playlist in (video.get('streamingPlaylists') or []): if not isinstance(playlist, dict): continue + if playlist_url := url_or_none(playlist.get('playlistUrl')): + is_live = True + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, fatal=False, live=True)) playlist_files = playlist.get('files') if not (playlist_files and isinstance(playlist_files, list)): continue @@ -1221,7 +1489,7 @@ def _real_extract(self, url): continue file_size = int_or_none(file_.get('size')) format_id = try_get( - file_, lambda x: x['resolution']['label'], compat_str) + file_, lambda x: x['resolution']['label'], str) f = parse_resolution(format_id) f.update({ 'url': file_url, @@ -1232,6 +1500,7 @@ def _real_extract(self, url): f['vcodec'] = 'none' else: f['fps'] = int_or_none(file_.get('fps')) + is_live = False formats.append(f) description = video.get('description') @@ -1255,7 +1524,7 @@ def account_data(field, type_): def channel_data(field, type_): return data('channel', field, type_) - category = data('category', 'label', compat_str) + category = data('category', 'label', str) categories = [category] if category else None nsfw = video.get('nsfw') @@ -1264,7 +1533,7 @@ def channel_data(field, type_): else: age_limit = None - webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + webpage_url = f'https://{host}/videos/watch/{video_id}' return { 'id': video_id, @@ -1272,14 +1541,14 @@ def channel_data(field, type_): 'description': 
description, 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName', compat_str), + 'uploader': account_data('displayName', str), 'uploader_id': str_or_none(account_data('id', int)), - 'uploader_url': url_or_none(account_data('url', compat_str)), - 'channel': channel_data('displayName', compat_str), + 'uploader_url': url_or_none(account_data('url', str)), + 'channel': channel_data('displayName', str), 'channel_id': str_or_none(channel_data('id', int)), - 'channel_url': url_or_none(channel_data('url', compat_str)), - 'language': data('language', 'id', compat_str), - 'license': data('licence', 'label', compat_str), + 'channel_url': url_or_none(channel_data('url', str)), + 'language': data('language', 'id', str), + 'license': data('licence', 'label', str), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), 'like_count': int_or_none(video.get('likes')), @@ -1289,6 +1558,7 @@ def channel_data(field, type_): 'categories': categories, 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, 'webpage_url': webpage_url, } @@ -1301,28 +1571,10 @@ class PeerTubePlaylistIE(InfoExtractor): 'w/p': 'video-playlists', } _VALID_URL = r'''(?x) - https?://(?P<host>%s)/(?P<type>(?:%s))/ + https?://(?P<host>{})/(?P<type>(?:{}))/ (?P<id>[^/]+) - ''' % (PeerTubeIE._INSTANCES_RE, '|'.join(_TYPES.keys())) + '''.format(PeerTubeIE._INSTANCES_RE, '|'.join(_TYPES.keys())) _TESTS = [{ - 'url': 'https://peertube.tux.ovh/w/p/3af94cba-95e8-4b74-b37a-807ab6d82526', - 'info_dict': { - 'id': '3af94cba-95e8-4b74-b37a-807ab6d82526', - 'description': 'playlist', - 'timestamp': 1611171863, - 'title': 'playlist', - }, - 'playlist_mincount': 6, - }, { - 'url': 'https://peertube.tux.ovh/w/p/wkyqcQBnsvFxtUB2pkYc1e', - 'info_dict': { - 'id': 'wkyqcQBnsvFxtUB2pkYc1e', - 'description': 'Cette liste de vidéos contient uniquement les jeux qui peuvent être terminés en une seule vidéo.', - 'title': 'Let\'s Play', - 'timestamp': 1604147331, - }, - 'playlist_mincount': 6, - }, { 'url': 'https://peertube.debian.social/w/p/hFdJoTuyhNJVa1cDWd1d12', 'info_dict': { 'id': 'hFdJoTuyhNJVa1cDWd1d12', @@ -1363,21 +1615,21 @@ def call_api(self, host, name, path, base, **kwargs): return self._download_json( self._API_BASE % (host, base, name, path), name, **kwargs) - def fetch_page(self, host, id, type, page): + def fetch_page(self, host, playlist_id, playlist_type, page): page += 1 video_data = self.call_api( - host, id, + host, playlist_id, f'/videos?sort=-createdAt&start={self._PAGE_SIZE * (page - 1)}&count={self._PAGE_SIZE}&nsfw=both', - type, note=f'Downloading page {page}').get('data', []) + playlist_type, note=f'Downloading page {page}').get('data', []) for video in video_data: - shortUUID = video.get('shortUUID') or try_get(video, lambda x: x['video']['shortUUID']) + short_uuid = video.get('shortUUID') or try_get(video, lambda x: x['video']['shortUUID']) video_title = video.get('name') or try_get(video, lambda x: x['video']['name']) yield self.url_result( - f'https://{host}/w/{shortUUID}', PeerTubeIE.ie_key(), - video_id=shortUUID, video_title=video_title) + f'https://{host}/w/{short_uuid}', PeerTubeIE.ie_key(), + video_id=short_uuid, video_title=video_title) - def _extract_playlist(self, host, type, id): - info = self.call_api(host, id, '', type, note='Downloading playlist information', fatal=False) + def _extract_playlist(self, host, playlist_type, playlist_id): + info = 
self.call_api(host, playlist_id, '', playlist_type, note='Downloading playlist information', fatal=False) playlist_title = info.get('displayName') playlist_description = info.get('description') @@ -1387,13 +1639,12 @@ def _extract_playlist(self, host, type, id): thumbnail = format_field(info, 'thumbnailPath', f'https://{host}%s') entries = OnDemandPagedList(functools.partial( - self.fetch_page, host, id, type), self._PAGE_SIZE) + self.fetch_page, host, playlist_id, playlist_type), self._PAGE_SIZE) return self.playlist_result( - entries, id, playlist_title, playlist_description, + entries, playlist_id, playlist_title, playlist_description, timestamp=playlist_timestamp, channel=channel, channel_id=channel_id, thumbnail=thumbnail) def _real_extract(self, url): - type, host, id = self._match_valid_url(url).group('type', 'host', 'id') - type = self._TYPES[type] - return self._extract_playlist(host, type, id) + playlist_type, host, playlist_id = self._match_valid_url(url).group('type', 'host', 'id') + return self._extract_playlist(host, self._TYPES[playlist_type], playlist_id) diff --git a/yt_dlp/extractor/peertv.py b/yt_dlp/extractor/peertv.py index a709e21b44..726d5e14c3 100644 --- a/yt_dlp/extractor/peertv.py +++ b/yt_dlp/extractor/peertv.py @@ -48,5 +48,5 @@ def _real_extract(self, url): 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title').replace('\xa0', ' '), 'formats': formats, 'description': self._html_search_meta(('og:description', 'description'), webpage), - 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage) + 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage), } diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py index 4835822cf5..5999d4a6a0 100644 --- a/yt_dlp/extractor/peloton.py +++ b/yt_dlp/extractor/peloton.py @@ -3,7 +3,7 @@ import urllib.parse from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -36,12 +36,12 @@ class PelotonIE(InfoExtractor): 'chapters': 'count:1', 'subtitles': {'en': [{ 'url': r're:^https?://.+', - 'ext': 'vtt' + 'ext': 'vtt', }]}, }, 'params': { 'skip_download': 'm3u8', }, - '_skip': 'Account needed' + 'skip': 'Account needed', }, { 'url': 'https://members.onepeloton.com/classes/player/26603d53d6bb4de1b340514864a6a6a8', 'info_dict': { @@ -57,11 +57,11 @@ class PelotonIE(InfoExtractor): 'duration': 1802, 'categories': ['Running'], 'is_live': False, - 'chapters': 'count:3' + 'chapters': 'count:3', }, 'params': { 'skip_download': 'm3u8', }, - '_skip': 'Account needed' + 'skip': 'Account needed', }] _MANIFEST_URL_TEMPLATE = '%s?hdnea=%s' @@ -79,12 +79,12 @@ def _login(self, video_id): data=json.dumps({ 'username_or_email': username, 'password': password, - 'with_pubsub': False + 'with_pubsub': False, }).encode(), headers={'Content-Type': 'application/json', 'User-Agent': 'web'}) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - json_string = self._webpage_read_content(e.cause, None, video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + json_string = self._webpage_read_content(e.cause.response, None, video_id) res = self._parse_json(json_string, video_id) raise ExtractorError(res['message'], expected=res['message'] == 'Login failed') else: @@ -96,8 +96,8 @@ def _get_token(self, video_id): 'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token', data=json.dumps({}).encode(), 
headers={'Content-Type': 'application/json'}) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_string = self._webpage_read_content(e.cause, None, video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + json_string = self._webpage_read_content(e.cause.response, None, video_id) res = self._parse_json(json_string, video_id) raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached') else: @@ -109,13 +109,13 @@ def _real_extract(self, url): try: self._start_session(video_id) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self._login(video_id) self._start_session(video_id) else: raise - metadata = self._download_json('https://api.onepeloton.com/api/ride/%s/details?stream_source=multichannel' % video_id, video_id) + metadata = self._download_json(f'https://api.onepeloton.com/api/ride/{video_id}/details?stream_source=multichannel', video_id) ride_data = metadata.get('ride') if not ride_data: raise ExtractorError('Missing stream metadata') @@ -133,7 +133,7 @@ def _real_extract(self, url): subtitles = {} else: if ride_data.get('vod_stream_url'): - url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % ( + url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles={}&url={}?hdnea={}'.format( ','.join([re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption) for caption in ride_data['captions']]), ride_data['vod_stream_url'], urllib.parse.quote(urllib.parse.quote(token))) @@ -147,14 +147,14 @@ def _real_extract(self, url): if metadata.get('instructor_cues'): subtitles['cues'] = [{ 'data': json.dumps(metadata.get('instructor_cues')), - 'ext': 'json' + 'ext': 'json', }] category = ride_data.get('fitness_discipline_display_name') chapters = [{ 'start_time': segment.get('start_time_offset'), 'end_time': segment.get('start_time_offset') + segment.get('length'), - 'title': segment.get('name') + 'title': segment.get('name'), } for segment in traverse_obj(metadata, ('segments', 'segment_list'))] return { @@ -171,7 +171,7 @@ def _real_extract(self, url): 'categories': [category] if category else None, 'tags': traverse_obj(ride_data, ('equipment_tags', ..., 'name')), 'is_live': is_live, - 'chapters': chapters + 'chapters': chapters, } @@ -194,12 +194,12 @@ class PelotonLiveIE(InfoExtractor): 'duration': 2014, 'categories': ['Cycling'], 'is_live': False, - 'chapters': 'count:3' + 'chapters': 'count:3', }, 'params': { 'skip_download': 'm3u8', }, - '_skip': 'Account needed' + 'skip': 'Account needed', } def _real_extract(self, url): @@ -208,7 +208,7 @@ def _real_extract(self, url): if peloton.get('ride_id'): if not peloton.get('is_live') or peloton.get('is_encore') or peloton.get('status') != 'PRE_START': - return self.url_result('https://members.onepeloton.com/classes/player/%s' % peloton['ride_id']) + return self.url_result('https://members.onepeloton.com/classes/player/{}'.format(peloton['ride_id'])) else: raise ExtractorError('Ride has not started', expected=True) else: diff --git a/yt_dlp/extractor/people.py b/yt_dlp/extractor/people.py deleted file mode 100644 index c5143c3edd..0000000000 --- a/yt_dlp/extractor/people.py +++ /dev/null @@ -1,29 +0,0 @@ -from .common import InfoExtractor - - -class PeopleIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html' - - _TEST = { - 'url': 'http://www.people.com/people/videos/0,,20995451,00.html', - 'info_dict': { - 'id': 'ref:20995451', - 'ext': 'mp4', - 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”', - 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 246.318, - 'timestamp': 1458720585, - 'upload_date': '20160323', - 'uploader_id': '416418724', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['BrightcoveNew'], - } - - def _real_extract(self, url): - return self.url_result( - 'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s' - % self._match_id(url), 'BrightcoveNew') diff --git a/yt_dlp/extractor/performgroup.py b/yt_dlp/extractor/performgroup.py index f4d7f22d0a..df726c975b 100644 --- a/yt_dlp/extractor/performgroup.py +++ b/yt_dlp/extractor/performgroup.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import int_or_none, join_nonempty class PerformGroupIE(InfoExtractor): @@ -15,12 +15,12 @@ class PerformGroupIE(InfoExtractor): 'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b', 'timestamp': 1511533477, 'upload_date': '20171124', - } + }, }] def _call_api(self, service, auth_token, content_id, referer_url): return self._download_json( - 'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id), + f'http://ep3.performfeeds.com/ep{service}/{auth_token}/{content_id}/', content_id, headers={ 'Referer': referer_url, 'Origin': 'http://player.performgroup.com', @@ -50,11 +50,8 @@ def _real_extract(self, url): if not c_url: continue tbr = int_or_none(c.get('bitrate'), 1000) - format_id = 'http' - if tbr: - format_id += '-%d' % tbr formats.append({ - 'format_id': format_id, + 'format_id': join_nonempty('http', tbr), 'url': c_url, 'tbr': tbr, 'width': int_or_none(c.get('width')), diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py index 84bcf1573a..e3b33c4d98 100644 --- a/yt_dlp/extractor/periscope.py +++ b/yt_dlp/extractor/periscope.py @@ -4,38 +4,43 @@ parse_iso8601, unescapeHTML, ) +from ..utils.traversal import traverse_obj class PeriscopeBaseIE(InfoExtractor): _M3U8_HEADERS = { - 'Referer': 'https://www.periscope.tv/' + 'Referer': 'https://www.periscope.tv/', } def _call_api(self, method, query, item_id): return self._download_json( - 'https://api.periscope.tv/api/v2/%s' % method, + f'https://api.periscope.tv/api/v2/{method}', item_id, query=query) def _parse_broadcast_data(self, broadcast, video_id): title = broadcast.get('status') or 'Periscope Broadcast' uploader = broadcast.get('user_display_name') or broadcast.get('username') - title = '%s - %s' % (uploader, title) if uploader else title - is_live = broadcast.get('state').lower() == 'running' - + title = f'{uploader} - {title}' if uploader else title thumbnails = [{ 'url': broadcast[image], - } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + } for image in ('image_url', 'image_url_medium', 'image_url_small') if broadcast.get(image)] return { 'id': broadcast.get('id') or video_id, 'title': title, - 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'timestamp': parse_iso8601(broadcast.get('created_at')) or int_or_none( + broadcast.get('created_at_ms'), scale=1000), + 'release_timestamp': int_or_none(broadcast.get('scheduled_start_ms'), scale=1000), 'uploader': 
uploader, 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), 'thumbnails': thumbnails, 'view_count': int_or_none(broadcast.get('total_watched')), + 'concurrent_view_count': int_or_none(broadcast.get('total_watching')), 'tags': broadcast.get('tags'), - 'is_live': is_live, + 'live_status': { + 'running': 'is_live', + 'not_started': 'is_upcoming', + }.get(traverse_obj(broadcast, ('state', {str.lower}))) or 'was_live', } @staticmethod @@ -160,7 +165,7 @@ def _real_extract(self, url): webpage, 'data store', default='{}', group='data')), user_name) - user = list(data_store['UserCache']['users'].values())[0]['user'] + user = next(iter(data_store['UserCache']['users'].values()))['user'] user_id = user['id'] session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id'] @@ -177,7 +182,7 @@ def _real_extract(self, url): entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) + f'https://www.periscope.tv/{user_name}/{broadcast_id}') for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) diff --git a/yt_dlp/extractor/pgatour.py b/yt_dlp/extractor/pgatour.py new file mode 100644 index 0000000000..36c2c6207d --- /dev/null +++ b/yt_dlp/extractor/pgatour.py @@ -0,0 +1,47 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + + +class PGATourIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?P<tc>T)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin-2023-the-players-round-4-18th-hole-shot-1', + 'info_dict': { + 'id': '6322447785112', + 'ext': 'mp4', + 'title': 'Adam Hadwin | 2023 THE PLAYERS | Round 4 | 18th hole | Shot 1', + 'uploader_id': '6116716431001', + 'upload_date': '20230312', + 'timestamp': 1678653136, + 'duration': 20.011, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:7', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.pgatour.com/video/features/6322506425112/follow-the-players-trophy-on-championship-sunday', + 'info_dict': { + 'id': '6322506425112', + 'ext': 'mp4', + 'title': 'Follow THE PLAYERS trophy on Championship Sunday', + 'description': 'md5:4d29e4bdfa03694a0ebfd08950398568', + 'uploader_id': '6082840763001', + 'upload_date': '20230313', + 'timestamp': 1678739835, + 'duration': 123.435, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:8', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, is_tourcast = self._match_valid_url(url).group('id', 'tc') + + # From https://www.pgatour.com/_next/static/chunks/pages/_app-8bcf849560daf38d.js + account_id = '6116716431001' if is_tourcast else '6082840763001' + player_id = 'Vsd5Umu8r' if is_tourcast else 'FWIBYMBPj' + + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE) diff --git a/yt_dlp/extractor/philharmoniedeparis.py b/yt_dlp/extractor/philharmoniedeparis.py index e8494a0840..310ea0f742 100644 --- a/yt_dlp/extractor/philharmoniedeparis.py +++ b/yt_dlp/extractor/philharmoniedeparis.py @@ -1,5 +1,4 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import try_get @@ -48,7 +47,7 @@ def _real_extract(self, url): video_id = self._match_id(url) config = self._download_json( - 'https://otoplayer.philharmoniedeparis.fr/fr/config/%s.json' % video_id, video_id, query={ + 
f'https://otoplayer.philharmoniedeparis.fr/fr/config/{video_id}.json', video_id, query={ 'id': video_id, 'lang': 'fr-FR', }) @@ -66,7 +65,7 @@ def extract_entry(source): formats = [] for format_id in ('mobile', 'desktop'): format_url = try_get( - files, lambda x: x[format_id]['file'], compat_str) + files, lambda x: x[format_id]['file'], str) if not format_url or format_url in format_urls: continue format_urls.add(format_url) @@ -91,7 +90,7 @@ def extract_entry(source): entry = extract_entry(chapter) if entry is None: continue - entry['id'] = '%s-%d' % (video_id, num) + entry['id'] = f'{video_id}-{num}' entries.append(entry) return self.playlist_result(entries, video_id, config.get('title')) diff --git a/yt_dlp/extractor/phoenix.py b/yt_dlp/extractor/phoenix.py index 5fa133afea..63c256019e 100644 --- a/yt_dlp/extractor/phoenix.py +++ b/yt_dlp/extractor/phoenix.py @@ -2,7 +2,6 @@ from .youtube import YoutubeIE from .zdf import ZDFBaseIE -from ..compat import compat_str from ..utils import ( int_or_none, merge_dicts, @@ -64,7 +63,7 @@ def _real_extract(self, url): article_id = self._match_id(url) article = self._download_json( - 'https://www.phoenix.de/response/id/%s' % article_id, article_id, + f'https://www.phoenix.de/response/id/{article_id}', article_id, 'Downloading article JSON') video = article['absaetze'][0] @@ -76,7 +75,7 @@ def _real_extract(self, url): video_id, ie=YoutubeIE.ie_key(), video_id=video_id, video_title=title) - video_id = compat_str(video.get('basename') or video.get('content')) + video_id = str(video.get('basename') or video.get('content')) details = self._download_json( 'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php', @@ -91,7 +90,7 @@ def _real_extract(self, url): content_id = details['tracking']['nielsen']['content']['assetid'] info = self._extract_ptmd( - 'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id, + f'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/{content_id}', content_id, None, url) duration = int_or_none(try_get( @@ -99,7 +98,7 @@ def _real_extract(self, url): timestamp = unified_timestamp(details.get('editorialDate')) series = try_get( details, lambda x: x['tracking']['nielsen']['content']['program'], - compat_str) + str) episode = title if details.get('contentType') == 'episode' else None thumbnails = [] diff --git a/yt_dlp/extractor/photobucket.py b/yt_dlp/extractor/photobucket.py index 71e9a4805b..a7e5bc007d 100644 --- a/yt_dlp/extractor/photobucket.py +++ b/yt_dlp/extractor/photobucket.py @@ -1,7 +1,7 @@ import json +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote class PhotobucketIE(InfoExtractor): @@ -16,7 +16,7 @@ class PhotobucketIE(InfoExtractor): 'upload_date': '20130504', 'uploader': 'rachaneronas', 'title': 'Tired of Link Building? 
Try BacklinkMyDomain.com!', - } + }, } def _real_extract(self, url): @@ -31,7 +31,7 @@ def _real_extract(self, url): info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', webpage, 'info json') info = json.loads(info_json) - url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) + url = urllib.parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) return { 'id': video_id, 'url': url, diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py index d8d9c78010..72e3748a20 100644 --- a/yt_dlp/extractor/piapro.py +++ b/yt_dlp/extractor/piapro.py @@ -1,7 +1,10 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( ExtractorError, + clean_html, + get_element_by_class, parse_duration, parse_filesize, str_to_int, @@ -12,18 +15,23 @@ class PiaproIE(InfoExtractor): _NETRC_MACHINE = 'piapro' - _VALID_URL = r'https?://piapro\.jp/t/(?P<id>\w+)/?' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>[\w-]+)/?' _TESTS = [{ 'url': 'https://piapro.jp/t/NXYR', - 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77', + 'md5': 'f7c0f760913fb1d44a1c45a4af793909', 'info_dict': { 'id': 'NXYR', 'ext': 'mp3', 'uploader': 'wowaka', 'uploader_id': 'wowaka', 'title': '裏表ラバーズ', - 'thumbnail': r're:^https?://.*\.jpg$', - } + 'description': 'http://www.nicovideo.jp/watch/sm8082467', + 'duration': 189.0, + 'timestamp': 1251785475, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', + 'upload_date': '20090901', + 'view_count': int, + }, }, { 'note': 'There are break lines in description, mandating (?s) flag', 'url': 'https://piapro.jp/t/9cSd', @@ -34,8 +42,19 @@ class PiaproIE(InfoExtractor): 'title': '青に溶けた風船 / 初音ミク', 'description': 'md5:d395a9bd151447631a5a1460bc7f9132', 'uploader': 'シアン・キノ', + 'duration': 229.0, + 'timestamp': 1644030039, + 'upload_date': '20220205', + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'uploader_id': 'cyankino', - } + }, + }, { + 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6', + 'only_matching': True, + }, { + 'url': 'https://piapro.jp/t/-SO-', + 'only_matching': True, }] _login_status = False @@ -46,7 +65,7 @@ def _perform_login(self, username, password): '_username': username, '_password': password, '_remember_me': 'on', - 'login': 'ログイン' + 'login': 'ログイン', } self._request_webpage('https://piapro.jp/login/', None) urlh = self._request_webpage( @@ -56,7 +75,7 @@ def _perform_login(self, username, password): if urlh is False: login_ok = False else: - parts = compat_urlparse.urlparse(urlh.geturl()) + parts = urllib.parse.urlparse(urlh.url) if parts.path != '/': login_ok = False if not login_ok: @@ -72,34 +91,22 @@ def _real_extract(self, url): if category_id not in ('1', '2', '21', '22', '23', '24', '25'): raise ExtractorError('The URL does not contain audio.', expected=True) - str_duration, str_filesize = self._search_regex( - r'サイズ:</span>(.+?)/\(([0-9,]+?[KMG]?B)\)', webpage, 'duration and size', - group=(1, 2), default=(None, None)) - str_viewcount = self._search_regex(r'閲覧数:</span>([0-9,]+)\s+', webpage, 'view count', fatal=False) - - uploader_id, uploader = self._search_regex( - r'<a\s+class="cd_user-name"\s+href="/(.*)">([^<]+)さん<', webpage, 'uploader', - group=(1, 2), default=(None, None)) - content_id = self._search_regex(r'contentId\:\'(.+)\'', webpage, 'content ID') - create_date = self._search_regex(r'createDate\:\'(.+)\'', webpage, 'timestamp') - -
player_webpage = self._download_webpage( - f'https://piapro.jp/html5_player_popup/?id={content_id}&cdate={create_date}', - video_id, note='Downloading player webpage') + def extract_info(name, description): + return self._search_regex(rf'{name}[::]\s*([\d\s,:/]+)\s*</p>', webpage, description, default=None) return { 'id': video_id, - 'title': self._html_search_regex(r'<h1\s+class="cd_works-title">(.+?)</h1>', webpage, 'title', fatal=False), - 'description': self._html_search_regex(r'(?s)<p\s+class="cd_dtl_cap">(.+?)</p>\s*<div', webpage, 'description', fatal=False), - 'uploader': uploader, - 'uploader_id': uploader_id, - 'timestamp': unified_timestamp(create_date, False), - 'duration': parse_duration(str_duration), - 'view_count': str_to_int(str_viewcount), + 'title': clean_html(get_element_by_class('contents_title', webpage)), + 'description': clean_html(get_element_by_class('contents_description', webpage)), + 'uploader': clean_html(get_element_by_class('contents_creator_txt', webpage)), + 'uploader_id': self._search_regex( + r'<a\s+href="/([^"]+)"', get_element_by_class('contents_creator', webpage), 'uploader id', default=None), + 'timestamp': unified_timestamp(extract_info('投稿日', 'timestamp'), False), + 'duration': parse_duration(extract_info('長さ', 'duration')), + 'view_count': str_to_int(extract_info('閲覧数', 'view count')), 'thumbnail': self._html_search_meta('twitter:image', webpage), - - 'filesize_approx': parse_filesize(str_filesize.replace(',', '')), - 'url': self._search_regex(r'mp3:\s*\'(.*?)\'\}', player_webpage, 'url'), + 'filesize_approx': parse_filesize((extract_info('サイズ', 'size') or '').replace(',', '')), + 'url': self._search_regex(r'\"url\":\s*\"(.*?)\"', webpage, 'url'), 'ext': 'mp3', 'vcodec': 'none', } diff --git a/yt_dlp/extractor/piaulizaportal.py b/yt_dlp/extractor/piaulizaportal.py new file mode 100644 index 0000000000..1eb6d92b72 --- /dev/null +++ b/yt_dlp/extractor/piaulizaportal.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + time_seconds, + traverse_obj, +) + + +class PIAULIZAPortalIE(InfoExtractor): + IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM' + _VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44', + 'info_dict': { + 'id': '005f18b7-e810-5618-cb82-0987c5755d44', + 'title': 'プレゼンテーションプレイヤーのサンプル', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }, { + 'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1', + 'info_dict': { + 'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d', + 'title': '【確認用】視聴サンプルページ(ULIZA)', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0))) + if expires and expires <= time_seconds(): + raise ExtractorError('The link is expired.', video_id=video_id, expected=True) + + webpage = self._download_webpage(url, video_id) + + player_data = self._download_webpage( + self._search_regex( + r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"', + webpage, 'player data url'), + video_id, headers={'Referer': 
'https://ulizaportal.jp/'}, + note='Fetching player data', errnote='Unable to fetch player data') + + formats = self._extract_m3u8_formats( + self._search_regex( + r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data, + 'm3u8 url', default=None), + video_id, fatal=False) + m3u8_type = self._search_regex( + r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None) + + return { + 'id': video_id, + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'live_status': { + 'video': 'is_live', + 'dvr': 'was_live', # short-term archives + }.get(m3u8_type, 'not_live'), # VOD or long-term archives + } diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index 36a062def3..72e89c31ed 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -1,7 +1,11 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( ExtractorError, - js_to_json, + str_or_none, + traverse_obj, + update_url, ) @@ -14,14 +18,14 @@ class PicartoIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'timestamp': int, - 'is_live': True + 'is_live': True, }, 'skip': 'Stream is offline', } @classmethod def suitable(cls, url): - return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + return False if PicartoVodIE.suitable(url) else super().suitable(url) def _real_extract(self, url): channel_id = self._match_id(url) @@ -39,16 +43,17 @@ def _real_extract(self, url): getLoadBalancerUrl(channel_name: "%s") { url } -}''' % (channel_id, channel_id), - })['data'] +}''' % (channel_id, channel_id), # noqa: UP031 + }, headers={'Accept': '*/*', 'Content-Type': 'application/json'})['data'] metadata = data['channel'] if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) title = metadata['title'] - cdn_data = self._download_json( - data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + cdn_data = self._download_json(''.join(( + update_url(data['getLoadBalancerUrl']['url'], scheme='https'), + '/stream/json_', metadata['stream_name'], '.js')), channel_id, 'Downloading load balancing info') formats = [] @@ -77,14 +82,14 @@ def _real_extract(self, url): 'is_live': True, 'channel': channel_id, 'channel_id': metadata.get('id'), - 'channel_url': 'https://picarto.tv/%s' % channel_id, + 'channel_url': f'https://picarto.tv/{channel_id}', 'age_limit': age_limit, 'formats': formats, } class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', @@ -92,7 +97,19 @@ class PicartoVodIE(InfoExtractor): 'id': 'ArtofZod_2017.12.12.00.13.23.flv', 'ext': 'mp4', 'title': 'ArtofZod_2017.12.12.00.13.23.flv', - 'thumbnail': r're:^https?://.*\.jpg' + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'skip': 'The VOD does not exist', + }, { + 'url': 'https://picarto.tv/ArtofZod/videos/771008', + 'md5': 'abef5322f2700d967720c4c6754b2a34', + 'info_dict': { + 'id': '771008', + 'ext': 'mp4', + 'title': 'Art of Zod - Drawing and Painting', + 'thumbnail': r're:^https?://.*\.jpg', + 'channel': 'ArtofZod', + 'age_limit': 18, }, }, { 'url': 'https://picarto.tv/videopopout/Plague', @@ -102,21 +119,36 @@ class PicartoVodIE(InfoExtractor): def 
_real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', video_id, query={ + 'query': f'''{{ + video(id: "{video_id}") {{ + id + title + adult + file_name + video_recording_image_url + channel {{ + name + }} + }} +}}''', + }, headers={'Accept': '*/*', 'Content-Type': 'application/json'})['data']['video'] - vod_info = self._parse_json( - self._search_regex( - r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, - 'vod player'), - video_id, transform_source=js_to_json) + file_name = data['file_name'] + netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc formats = self._extract_m3u8_formats( - vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + f'https://{netloc}/stream/hls/{file_name}/index.m3u8', video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, - 'title': video_id, - 'thumbnail': vod_info.get('vodThumb'), + **traverse_obj(data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'thumbnail': 'video_recording_image_url', + 'channel': ('channel', 'name', {str}), + 'age_limit': ('adult', {lambda x: 18 if x else 0}), + }), 'formats': formats, } diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py index cc60b304e5..ceb65627af 100644 --- a/yt_dlp/extractor/piksel.py +++ b/yt_dlp/extractor/piksel.py @@ -2,13 +2,15 @@ from .common import InfoExtractor from ..utils import ( - dict_get, ExtractorError, + dict_get, int_or_none, join_nonempty, parse_iso8601, + traverse_obj, try_get, unescapeHTML, + urljoin, ) @@ -23,29 +25,31 @@ class PikselIE(InfoExtractor): )| (?:api|player)\.multicastmedia| (?:api-ovp|player)\.piksel - )\.com| + )\.(?:com|tech)| (?: mz-edge\.stream\.co| movie-s\.nhk\.or )\.jp| vidego\.baltimorecity\.gov )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' - _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)'] + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.(?:com|tech)/v/[a-z0-9]+)'] _TESTS = [ { - 'url': 'http://player.piksel.com/v/ums2867l', + 'url': 'http://player.piksel.tech/v/ums2867l', 'md5': '34e34c8d89dc2559976a6079db531e85', 'info_dict': { 'id': 'ums2867l', 'ext': 'mp4', 'title': 'GX-005 with Caption', 'timestamp': 1481335659, - 'upload_date': '20161210' - } + 'upload_date': '20161210', + 'description': '', + 'thumbnail': 'https://thumbs.piksel.tech/thumbs/aid/t1488331553/3238987.jpg?w=640&h=480', + }, }, { # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al - 'url': 'https://player.piksel.com/v/v80kqp41', + 'url': 'https://player.piksel.tech/v/v80kqp41', 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', 'info_dict': { 'id': 'v80kqp41', @@ -53,21 +57,22 @@ class PikselIE(InfoExtractor): 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. 
Robart presiding.', 'timestamp': 1486171129, - 'upload_date': '20170204' - } + 'upload_date': '20170204', + 'thumbnail': 'https://thumbs.piksel.tech/thumbs/aid/t1495569155/3279887.jpg?w=640&h=360', + }, }, { # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/ 'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477', 'only_matching': True, - } + }, ] - def _call_api(self, app_token, resource, display_id, query, fatal=True): - response = (self._download_json( - 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), - display_id, query=query, fatal=fatal) or {}).get('response') - failure = try_get(response, lambda x: x['failure']['reason']) + def _call_api(self, app_token, resource, display_id, query, host='https://player.piksel.tech', fatal=True): + url = urljoin(host, f'/ws/ws_{resource}/api/{app_token}/mode/json/apiv/5') + response = traverse_obj( + self._download_json(url, display_id, query=query, fatal=fatal), ('response', {dict})) or {} + failure = traverse_obj(response, ('failure', 'reason')) if response else 'Empty response from API' if failure: if fatal: raise ExtractorError(failure, expected=True) @@ -79,11 +84,11 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', - r'data-de-api-key\s*=\s*"([^"]+)"' + r'data-de-api-key\s*=\s*"([^"]+)"', ], webpage, 'app token') query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} program = self._call_api( - app_token, 'program', display_id, query)['WsProgramResponse']['program'] + app_token, 'program', display_id, query, url)['WsProgramResponse']['program'] video_id = program['uuid'] video_data = program['asset'] title = video_data['title'] @@ -129,7 +134,7 @@ def process_asset_files(asset_files): process_asset_files(try_get(self._call_api( app_token, 'asset_file', display_id, { 'assetid': asset_id, - }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + }, url, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) m3u8_url = dict_get(video_data, [ 'm3u8iPadURL', @@ -144,7 +149,7 @@ def process_asset_files(asset_files): smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) if smil_url: - transform_source = None + transform_source = lambda x: x.replace('src="/', 'src="') if ref_id == 'nhkworld': # TODO: figure out if this is something to be fixed in urljoin, # _parse_smil_formats or keep it here diff --git a/yt_dlp/extractor/pinkbike.py b/yt_dlp/extractor/pinkbike.py index e4e1caaa25..0cd9632c2b 100644 --- a/yt_dlp/extractor/pinkbike.py +++ b/yt_dlp/extractor/pinkbike.py @@ -27,7 +27,7 @@ class PinkbikeIE(InfoExtractor): 'location': 'Victoria, British Columbia, Canada', 'view_count': int, 'comment_count': int, - } + }, }, { 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', 'only_matching': True, @@ -37,7 +37,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.pinkbike.com/video/%s' % video_id, video_id) + f'http://www.pinkbike.com/video/{video_id}', video_id) formats = [] for _, format_id, src in re.findall( @@ -72,7 +72,7 @@ def _real_extract(self, url): def extract_count(webpage, label): return str_to_int(self._search_regex( - r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label, + rf'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>{label}', webpage, 
label, fatal=False)) view_count = extract_count(webpage, 'Views') @@ -89,5 +89,5 @@ def extract_count(webpage, label): 'location': location, 'view_count': view_count, 'comment_count': comment_count, - 'formats': formats + 'formats': formats, } diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 8361fbbc5f..07f249498c 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -22,9 +22,9 @@ class PinterestBaseIE(InfoExtractor): def _call_api(self, resource, video_id, options): return self._download_json( - 'https://www.pinterest.com/resource/%sResource/get/' % resource, - video_id, 'Download %s JSON metadata' % resource, query={ - 'data': json.dumps({'options': options}) + f'https://www.pinterest.com/resource/{resource}Resource/get/', + video_id, f'Download {resource} JSON metadata', query={ + 'data': json.dumps({'options': options}), })['resource_response'] def _extract_video(self, data, extract_formats=True): @@ -32,7 +32,7 @@ def _extract_video(self, data, extract_formats=True): thumbnails = [] images = data.get('images') if isinstance(images, dict): - for thumbnail_id, thumbnail in images.items(): + for thumbnail in images.values(): if not isinstance(thumbnail, dict): continue thumbnail_url = url_or_none(thumbnail.get('url')) @@ -109,7 +109,7 @@ def _extract_video(self, data, extract_formats=True): class PinterestIE(PinterestBaseIE): - _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P<id>\d+)' _TESTS = [{ # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', @@ -187,7 +187,7 @@ def _real_extract(self, url): class PinterestCollectionIE(PinterestBaseIE): - _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', 'info_dict': { @@ -207,15 +207,14 @@ class PinterestCollectionIE(PinterestBaseIE): @classmethod def suitable(cls, url): - return False if PinterestIE.suitable(url) else super( - PinterestCollectionIE, cls).suitable(url) + return False if PinterestIE.suitable(url) else super().suitable(url) def _real_extract(self, url): username, slug = self._match_valid_url(url).groups() board = self._call_api( 'Board', slug, { 'slug': slug, - 'username': username + 'username': username, })['data'] board_id = board['id'] options = { diff --git a/yt_dlp/extractor/pixivsketch.py b/yt_dlp/extractor/pixivsketch.py index 850c6f23d9..344cdb3d05 100644 --- a/yt_dlp/extractor/pixivsketch.py +++ b/yt_dlp/extractor/pixivsketch.py @@ -81,7 +81,7 @@ def _real_extract(self, url): 'channel_id': str(traverse_obj(data, ('user', 'pixiv_user_id'), ('owner', 'user', 'pixiv_user_id'))), 'age_limit': 18 if data.get('is_r18') else 15 if data.get('is_r15') else 0, 'timestamp': unified_timestamp(data.get('created_at')), - 'is_live': True + 'is_live': True, } @@ -101,7 +101,7 @@ class PixivSketchUserIE(PixivSketchBaseIE): @classmethod def suitable(cls, url): - return super(PixivSketchUserIE, cls).suitable(url) and not PixivSketchIE.suitable(url) + return super().suitable(url) and not PixivSketchIE.suitable(url) def _real_extract(self, url): user_id = self._match_id(url) diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py index dcf18e1f3b..f4355d0cf5 100644 --- a/yt_dlp/extractor/pladform.py +++ b/yt_dlp/extractor/pladform.py @@ -1,11 +1,11 @@ 
from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, parse_qs, - xpath_text, qualities, + xpath_text, ) @@ -35,12 +35,11 @@ class PladformIE(InfoExtractor): 'thumbnail': str, 'view_count': int, 'description': str, - 'category': list, 'uploader_id': '12082', 'uploader': 'Comedy Club', 'duration': 367, }, - 'expected_warnings': ['HTTP Error 404: Not Found'] + 'expected_warnings': ['HTTP Error 404: Not Found'], }, { 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', 'md5': '53362fac3a27352da20fa2803cc5cd6f', @@ -74,14 +73,14 @@ def _real_extract(self, url): def fail(text): raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, text), + f'{self.IE_NAME} returned error: {text}', expected=True) if not video: - targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').geturl() - if targetUrl == url: + target_url = self._request_webpage(url, video_id, note='Resolving final URL').url + if target_url == url: raise ExtractorError('Can\'t parse page') - return self.url_result(targetUrl) + return self.url_result(target_url) if video.tag == 'error': fail(video.text) @@ -112,7 +111,7 @@ def fail(text): fail(error) webpage = self._download_webpage( - 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, + f'http://video.pladform.ru/catalog/video/videoid/{video_id}', video_id) title = self._og_search_title(webpage, fatal=False) or xpath_text( diff --git a/yt_dlp/extractor/planetmarathi.py b/yt_dlp/extractor/planetmarathi.py index 25753fe7ee..94861836a3 100644 --- a/yt_dlp/extractor/planetmarathi.py +++ b/yt_dlp/extractor/planetmarathi.py @@ -20,12 +20,11 @@ class PlanetMarathiIE(InfoExtractor): 'title': 'ek unad divas', 'alt_title': 'चित्रपट', 'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881', - 'season_number': None, 'episode_number': 1, 'duration': 5539, 'upload_date': '20210829', }, - }] # Trailer skipped + }], # Trailer skipped }, { 'url': 'https://www.planetmarathi.com/titles/baap-beep-baap-season-1', 'playlist_mincount': 10, @@ -44,18 +43,19 @@ class PlanetMarathiIE(InfoExtractor): 'duration': 29, 'upload_date': '20210829', }, - }] # Trailers, Episodes, other Character profiles skipped + }], # Trailers, Episodes, other Character profiles skipped }] def _real_extract(self, url): - id = self._match_id(url) + playlist_id = self._match_id(url) entries = [] - json_data = self._download_json(f'https://www.planetmarathi.com/api/v1/titles/{id}/assets', id)['assets'] + json_data = self._download_json( + f'https://www.planetmarathi.com/api/v1/titles/{playlist_id}/assets', playlist_id)['assets'] for asset in json_data: asset_title = asset['mediaAssetName']['en'] if asset_title == 'Movie': - asset_title = id.replace('-', ' ') - asset_id = f'{asset["sk"]}_{id}'.replace('#', '-') + asset_title = playlist_id.replace('-', ' ') + asset_id = f'{asset["sk"]}_{playlist_id}'.replace('#', '-') formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id) entries.append({ 'id': asset_id, @@ -69,4 +69,4 @@ def _real_extract(self, url): 'formats': formats, 'subtitles': subtitles, }) - return self.playlist_result(entries, playlist_id=id) + return self.playlist_result(entries, playlist_id=playlist_id) diff --git a/yt_dlp/extractor/platzi.py b/yt_dlp/extractor/platzi.py index b8a4414940..e29f4c2090 100644 --- a/yt_dlp/extractor/platzi.py +++ b/yt_dlp/extractor/platzi.py @@ -1,11 +1,9 @@ +import base64 + from .common import InfoExtractor -from 
..compat import ( - compat_b64decode, - compat_str, -) from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, str_or_none, try_get, @@ -36,7 +34,7 @@ def _perform_login(self, username, password): headers={'Referer': self._LOGIN_URL}) # login succeeded - if 'platzi.com/login' not in urlh.geturl(): + if 'platzi.com/login' not in urlh.url: return login_error = self._webpage_read_content( @@ -48,10 +46,10 @@ def _perform_login(self, username, password): None) for kind in ('error', 'password', 'nonFields'): - error = str_or_none(login.get('%sError' % kind)) + error = str_or_none(login.get(f'{kind}Error')) if error: raise ExtractorError( - 'Unable to login: %s' % error, expected=True) + f'Unable to login: {error}', expected=True) raise ExtractorError('Unable to log in') @@ -120,16 +118,16 @@ def _real_extract(self, url): formats.extend(self._extract_m3u8_formats( format_url, lecture_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, - note='Downloading %s m3u8 information' % server_id, + note=f'Downloading {server_id} m3u8 information', fatal=False)) elif format_id == 'dash': formats.extend(self._extract_mpd_formats( format_url, lecture_id, mpd_id=format_id, - note='Downloading %s MPD manifest' % server_id, + note=f'Downloading {server_id} MPD manifest', fatal=False)) content = str_or_none(desc.get('content')) - description = (clean_html(compat_b64decode(content).decode('utf-8')) + description = (clean_html(base64.b64decode(content).decode('utf-8')) if content else None) duration = int_or_none(material.get('duration'), invscale=60) @@ -168,7 +166,7 @@ class PlatziCourseIE(PlatziBaseIE): @classmethod def suitable(cls, url): - return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + return False if PlatziIE.suitable(url) else super().suitable(url) def _real_extract(self, url): course_name = self._match_id(url) @@ -207,7 +205,7 @@ def _real_extract(self, url): 'chapter_id': chapter_id, }) - course_id = compat_str(try_get(props, lambda x: x['course']['id'])) - course_title = try_get(props, lambda x: x['course']['name'], compat_str) + course_id = str(try_get(props, lambda x: x['course']['id'])) + course_title = try_get(props, lambda x: x['course']['name'], str) return self.playlist_result(entries, course_id, course_title) diff --git a/yt_dlp/extractor/playfm.py b/yt_dlp/extractor/playfm.py deleted file mode 100644 index e895ba480c..0000000000 --- a/yt_dlp/extractor/playfm.py +++ /dev/null @@ -1,70 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - parse_iso8601, -) - - -class PlayFMIE(InfoExtractor): - IE_NAME = 'play.fm' - _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' - - _TEST = { - 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', - 'md5': 'c505f8307825a245d0c7ad1850001f22', - 'info_dict': { - 'id': '71276', - 'ext': 'mp3', - 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', - 'description': '', - 'duration': 5627, - 'timestamp': 1406033781, - 'upload_date': '20140722', - 'uploader': 'Dan Drastic', - 'uploader_id': '71170', - 'view_count': int, - 'comment_count': int, - }, - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - slug = mobj.group('slug') - - recordings = self._download_json( - 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) - - error = 
recordings.get('error') - if isinstance(error, dict): - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('message')), - expected=True) - - audio_url = recordings['audio'] - video_id = compat_str(recordings.get('id') or video_id) - title = recordings['title'] - description = recordings.get('description') - duration = int_or_none(recordings.get('recordingDuration')) - timestamp = parse_iso8601(recordings.get('created_at')) - uploader = recordings.get('page', {}).get('title') - uploader_id = compat_str(recordings.get('page', {}).get('id')) - view_count = int_or_none(recordings.get('playCount')) - comment_count = int_or_none(recordings.get('commentCount')) - categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] - - return { - 'id': video_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - } diff --git a/yt_dlp/extractor/playplustv.py b/yt_dlp/extractor/playplustv.py index 316f220f79..a4439c8bc5 100644 --- a/yt_dlp/extractor/playplustv.py +++ b/yt_dlp/extractor/playplustv.py @@ -1,13 +1,9 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - PUTRequest, -) +from ..networking import PUTRequest +from ..networking.exceptions import HTTPError +from ..utils import ExtractorError, clean_html, int_or_none class PlayPlusTVIE(InfoExtractor): @@ -47,9 +43,9 @@ def _perform_login(self, username, password): try: self._token = self._download_json(req, None)['token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError(self._parse_json( - e.cause.read(), None)['errorMessage'], expected=True) + e.cause.response.read(), None)['errorMessage'], expected=True) raise self._profile = self._call_api('Profiles')['list'][0]['_id'] diff --git a/yt_dlp/extractor/plays.py b/yt_dlp/extractor/plays.py deleted file mode 100644 index 9371f7b237..0000000000 --- a/yt_dlp/extractor/plays.py +++ /dev/null @@ -1,49 +0,0 @@ -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class PlaysTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})' - _TESTS = [{ - 'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', - 'md5': 'dfeac1198506652b5257a62762cec7bc', - 'info_dict': { - 'id': '56af17f56c95335490', - 'ext': 'mp4', - 'title': 'Bjergsen - When you outplay the Azir wall', - 'description': 'Posted by Bjergsen', - } - }, { - 'url': 'https://plays.tv/embeds/56af17f56c95335490', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://plays.tv/video/%s' % video_id, video_id) - - info = self._search_json_ld(webpage, video_id,) - - mpd_url, sources = re.search( - r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', - webpage).groups() - formats = self._extract_mpd_formats( - self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') - for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): - formats.append({ - 'url': self._proto_relative_url(format_url), - 'format_id': 'http-' + format_id, - 
'height': int_or_none(height), - }) - - info.update({ - 'id': video_id, - 'description': self._og_search_description(webpage), - 'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage), - 'formats': formats, - }) - - return info diff --git a/yt_dlp/extractor/playstuff.py b/yt_dlp/extractor/playstuff.py deleted file mode 100644 index b424ba1872..0000000000 --- a/yt_dlp/extractor/playstuff.py +++ /dev/null @@ -1,63 +0,0 @@ -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - smuggle_url, - try_get, -) - - -class PlayStuffIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a', - 'md5': 'c82d3669e5247c64bc382577843e5bd0', - 'info_dict': { - 'id': '6250584958001', - 'ext': 'mp4', - 'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga', - 'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913', - 'uploader_id': '6005208634001', - 'timestamp': 1619491027, - 'upload_date': '20210427', - }, - 'add_ie': ['BrightcoveNew'], - }, { - # geo restricted, bypassable - 'url': 'https://play.stuff.co.nz/details/_6155660351001', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - state = self._parse_json( - self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'), - video_id) - - account_id = try_get( - state, lambda x: x['configurations']['accountId'], - compat_str) or '6005208634001' - player_id = try_get( - state, lambda x: x['configurations']['playerId'], - compat_str) or 'default' - - entries = [] - for item_id, video in state['items'].items(): - if not isinstance(video, dict): - continue - asset_id = try_get( - video, lambda x: x['content']['attributes']['assetId'], - compat_str) - if not asset_id: - continue - entries.append(self.url_result( - smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id), - {'geo_countries': ['NZ']}), - 'BrightcoveNew', video_id)) - - return self.playlist_result(entries, video_id) diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index a635ac92f1..905f8fc2f6 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -1,14 +1,28 @@ import json from .common import InfoExtractor -from ..utils import int_or_none, traverse_obj +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + traverse_obj, + update_url_query, + urlencode_postdata, +) class PlaySuisseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P<id>[0-9]+)' + _NETRC_MACHINE = 'playsuisse' + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)' _TESTS = [ { + # Old URL 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'only_matching': True, + }, + { + # episode in a series + 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', 'info_dict': { 'id': '763211', @@ -21,11 +35,11 @@ class PlaySuisseIE(InfoExtractor): 'season_number': 1, 'episode': 'Knochen', 'episode_number': 1, - 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878' - } - }, - { - 'url': 'https://www.playsuisse.ch/watch/808675/0', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + }, + }, { + # film + 'url': 
'https://www.playsuisse.ch/watch/808675', 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', 'info_dict': { 'id': '808675', @@ -33,27 +47,61 @@ class PlaySuisseIE(InfoExtractor): 'title': 'Der Läufer', 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', 'duration': 5280, - 'episode': 'Der Läufer', - 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783' - } - }, - { - 'url': 'https://www.playsuisse.ch/watch/817193/0', - 'md5': '1d6c066f92cd7fffd8b28a53526d6b59', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + }, + }, { + # series (treated as a playlist) + 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'id': '817193', - 'ext': 'mp4', - 'title': 'Die Einweihungsparty', - 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930', - 'duration': 1380, - 'series': 'Nr. 47', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Die Einweihungsparty', - 'episode_number': 1, - 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44' - } - } + 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', + 'id': '1115687', + 'series': 'They all came out to Montreux', + 'title': 'They all came out to Montreux', + }, + 'playlist': [{ + 'info_dict': { + 'description': 'md5:f2462744834b959a31adc6292380cda2', + 'duration': 3180, + 'episode': 'Folge 1', + 'episode_number': 1, + 'id': '1112663', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 1', + 'ext': 'mp4', + }, + }, { + 'info_dict': { + 'description': 'md5:9dfd308699fe850d3bce12dc1bad9b27', + 'duration': 2935, + 'episode': 'Folge 2', + 'episode_number': 2, + 'id': '1112661', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 2', + 'ext': 'mp4', + }, + }, { + 'info_dict': { + 'description': 'md5:14a93a3356b2492a8f786ab2227ef602', + 'duration': 2994, + 'episode': 'Folge 3', + 'episode_number': 3, + 'id': '1112664', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 3', + 'ext': 'mp4', + }, + }], + }, ] _GRAPHQL_QUERY = ''' @@ -94,22 +142,60 @@ class PlaySuisseIE(InfoExtractor): id url }''' + _LOGIN_BASE_URL = 'https://login.srgssr.ch/srgssrlogin.onmicrosoft.com' + _LOGIN_PATH = 'B2C_1A__SignInV2' + _ID_TOKEN = None + + def _perform_login(self, username, password): + login_page = self._download_webpage( + 'https://www.playsuisse.ch/api/sso/login', None, note='Downloading login page', + query={'x': 'x', 'locale': 'de', 'redirectUrl': 'https://www.playsuisse.ch/'}) + settings = self._search_json(r'var\s+SETTINGS\s*=', login_page, 'settings', None) + + csrf_token = settings['csrf'] + query = {'tx': settings['transId'], 'p': self._LOGIN_PATH} + + status = traverse_obj(self._download_json( + f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/SelfAsserted', None, 'Logging in', + query=query, headers={'X-CSRF-TOKEN': csrf_token}, data=urlencode_postdata({ + 'request_type': 'RESPONSE', + 'signInName': username, + 'password': password, + }), expected_status=400), ('status', {int_or_none})) + if status == 400: + raise ExtractorError('Invalid username or password', expected=True) + + urlh = self._request_webpage( + f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/api/CombinedSigninAndSignup/confirmed', + None, 'Downloading ID token', query={ + 'rememberMe': 'false', + 'csrf_token': csrf_token, + **query, + 
'diags': '', + }) + + self._ID_TOKEN = traverse_obj(parse_qs(urlh.url), ('id_token', 0)) + if not self._ID_TOKEN: + raise ExtractorError('Login failed') def _get_media_data(self, media_id): # NOTE In the web app, the "locale" header is used to switch between languages, # However this doesn't seem to take effect when passing the header here. response = self._download_json( - 'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql', + 'https://www.playsuisse.ch/api/graphql', media_id, data=json.dumps({ 'operationName': 'AssetWatch', 'query': self._GRAPHQL_QUERY, - 'variables': {'assetId': media_id} - }).encode('utf-8'), + 'variables': {'assetId': media_id}, + }).encode(), headers={'Content-Type': 'application/json', 'locale': 'de'}) return response['data']['assetV2'] def _real_extract(self, url): + if not self._ID_TOKEN: + self.raise_login_required(method='password') + media_id = self._match_id(url) media_data = self._get_media_data(media_id) info = self._extract_single(media_data) @@ -128,7 +214,8 @@ def _extract_single(self, media_data): if not media.get('url') or media.get('type') != 'HLS': continue f, subs = self._extract_m3u8_formats_and_subtitles( - media['url'], media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) + update_url_query(media['url'], {'id_token': self._ID_TOKEN}), + media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) formats.extend(f) self._merge_subtitles(subs, target=subtitles) @@ -142,6 +229,6 @@ def _extract_single(self, media_data): 'subtitles': subtitles, 'series': media_data.get('seriesName'), 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name'), + 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, 'episode_number': int_or_none(media_data.get('episodeNumber')), } diff --git a/yt_dlp/extractor/playtvak.py b/yt_dlp/extractor/playtvak.py index c418f88cb1..12e2fa255b 100644 --- a/yt_dlp/extractor/playtvak.py +++ b/yt_dlp/extractor/playtvak.py @@ -1,8 +1,6 @@ +import urllib.parse + from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, -) from ..utils import ( ExtractorError, int_or_none, @@ -27,7 +25,7 @@ class PlaytvakIE(InfoExtractor): 'timestamp': 1438732860, 'upload_date': '20150805', 'is_live': False, - } + }, }, { # live video test 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', 'info_dict': { @@ -64,7 +62,7 @@ class PlaytvakIE(InfoExtractor): 'timestamp': 1438969140, 'upload_date': '20150807', 'is_live': False, - } + }, }, { # lidovky.cz 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', @@ -77,7 +75,7 @@ class PlaytvakIE(InfoExtractor): 'timestamp': 1439052180, 'upload_date': '20150808', 'is_live': False, - } + }, }, { # metro.cz 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', 'md5': '84fc1deedcac37b7d4a6ccae7c716668', @@ -90,7 +88,7 @@ class PlaytvakIE(InfoExtractor): 'timestamp': 1415725500, 'upload_date': '20141111', 'is_live': False, - } + }, }, { 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', 'only_matching': True, @@ -104,16 +102,16 @@ def _real_extract(self, url): info_url = self._html_search_regex( r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') - parsed_url = 
compat_urlparse.urlparse(info_url) + parsed_url = urllib.parse.urlparse(info_url) - qs = compat_urlparse.parse_qs(parsed_url.query) + qs = urllib.parse.parse_qs(parsed_url.query) qs.update({ 'reklama': ['0'], 'type': ['js'], }) - info_url = compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + info_url = urllib.parse.urlunparse( + parsed_url._replace(query=urllib.parse.urlencode(qs, True))) json_info = self._download_json( info_url, video_id, @@ -136,7 +134,7 @@ def _real_extract(self, url): continue format_ = fmt['format'] - format_id = '%s_%s' % (format_, fmt['quality']) + format_id = '{}_{}'.format(format_, fmt['quality']) preference = None if format_ in ('mp4', 'webm'): diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py deleted file mode 100644 index 1e0989d0aa..0000000000 --- a/yt_dlp/extractor/playvid.py +++ /dev/null @@ -1,90 +0,0 @@ -import re -import urllib.parse - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ExtractorError, clean_html - - -class PlayvidIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' - _TESTS = [{ - 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', - 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', - 'info_dict': { - 'id': 'RnmBNgtrrJu', - 'ext': 'mp4', - 'title': 'md5:9256d01c6317e3f703848b5906880dc8', - 'duration': 82, - 'age_limit': 18, - }, - 'skip': 'Video removed due to ToS', - }, { - 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', - 'md5': '39d49df503ad7b8f23a4432cbf046477', - 'info_dict': { - 'id': 'hwb0GpNkzgH', - 'ext': 'mp4', - 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', - 'age_limit': 18, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - m_error = re.search( - r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage) - if m_error: - raise ExtractorError(clean_html(m_error.group('msg')), expected=True) - - video_title = None - duration = None - video_thumbnail = None - formats = [] - - # most of the information is stored in the flashvars - flashvars = self._html_search_regex( - r'flashvars="(.+?)"', webpage, 'flashvars') - - infos = compat_urllib_parse_unquote(flashvars).split(r'&') - for info in infos: - videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info) - if videovars_match: - key = videovars_match.group(1) - val = videovars_match.group(2) - - if key == 'title': - video_title = urllib.parse.unquote_plus(val) - if key == 'duration': - try: - duration = int(val) - except ValueError: - pass - if key == 'big_thumb': - video_thumbnail = val - - videourl_match = re.match( - r'^video_urls\]\[(?P<resolution>[0-9]+)p', key) - if videourl_match: - height = int(videourl_match.group('resolution')) - formats.append({ - 'height': height, - 'url': val, - }) - - # Extract title - should be in the flashvars; if not, look elsewhere - if video_title is None: - video_title = self._html_extract_title(webpage) - - return { - 'id': video_id, - 'formats': formats, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'duration': duration, - 'description': None, - 'age_limit': 18 - } diff --git a/yt_dlp/extractor/playwire.py b/yt_dlp/extractor/playwire.py index 1057bff3a0..8539a4b56c 100644 --- a/yt_dlp/extractor/playwire.py +++ b/yt_dlp/extractor/playwire.py @@ -48,7 +48,7 @@ def 
_real_extract(self, url): publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id') player = self._download_json( - 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id), + f'http://config.playwire.com/{publisher_id}/videos/v2/{video_id}/zeus.json', video_id) title = player['settings']['title'] diff --git a/yt_dlp/extractor/pluralsight.py b/yt_dlp/extractor/pluralsight.py index 809b65608f..d3f03f7eec 100644 --- a/yt_dlp/extractor/pluralsight.py +++ b/yt_dlp/extractor/pluralsight.py @@ -3,15 +3,12 @@ import os import random import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( - dict_get, ExtractorError, + dict_get, float_or_none, int_or_none, parse_duration, @@ -27,7 +24,7 @@ class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' - _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE + _GRAPHQL_EP = f'{_API_BASE}/player/api/graphql' _GRAPHQL_HEADERS = { 'Content-Type': 'application/json;charset=UTF-8', } @@ -95,8 +92,8 @@ def _download_course_rpc(self, course_id, url, display_id): response = self._download_json( self._GRAPHQL_EP, display_id, data=json.dumps({ 'query': self._GRAPHQL_COURSE_TMPL % course_id, - 'variables': {} - }).encode('utf-8'), headers=self._GRAPHQL_HEADERS) + 'variables': {}, + }).encode(), headers=self._GRAPHQL_HEADERS) course = try_get( response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'], @@ -105,7 +102,7 @@ def _download_course_rpc(self, course_id, url, display_id): return course raise ExtractorError( - '%s said: %s' % (self.IE_NAME, response['error']['message']), + '{} said: {}'.format(self.IE_NAME, response['error']['message']), expected=True) @@ -176,7 +173,7 @@ def _perform_login(self, username, password): 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + post_url = urllib.parse.urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( post_url, None, 'Logging in', @@ -187,7 +184,7 @@ def _perform_login(self, username, password): r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', response, 'error message', default=None) if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError(f'Unable to login: {error}', expected=True) if all(not re.search(p, response) for p in ( r'__INITIAL_STATE__', r'["\']currentUser["\']', @@ -196,13 +193,12 @@ def _perform_login(self, username, password): BLOCKED = 'Your account has been blocked due to suspicious activity' if BLOCKED in response: raise ExtractorError( - 'Unable to login: %s' % BLOCKED, expected=True) + f'Unable to login: {BLOCKED}', expected=True) MUST_AGREE = 'To continue using Pluralsight, you must agree to' if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')): raise ExtractorError( - 'Unable to login: %s some documents. Go to pluralsight.com, ' - 'log in and agree with what Pluralsight requires.' - % MUST_AGREE, expected=True) + f'Unable to login: {MUST_AGREE} some documents. 
Go to pluralsight.com, ' + 'log in and agree with what Pluralsight requires.', expected=True) raise ExtractorError('Unable to log in') @@ -210,8 +206,7 @@ def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_ captions = None if clip_id: captions = self._download_json( - '%s/transcript/api/v1/caption/json/%s/%s' - % (self._API_BASE, clip_id, lang), video_id, + f'{self._API_BASE}/transcript/api/v1/caption/json/{clip_id}/{lang}', video_id, 'Downloading captions JSON', 'Unable to download captions JSON', fatal=False) if not captions: @@ -222,9 +217,9 @@ def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_ 'm': name, } captions = self._download_json( - '%s/player/retrieve-captions' % self._API_BASE, video_id, + f'{self._API_BASE}/player/retrieve-captions', video_id, 'Downloading captions JSON', 'Unable to download captions JSON', - fatal=False, data=json.dumps(captions_post).encode('utf-8'), + fatal=False, data=json.dumps(captions_post).encode(), headers={'Content-Type': 'application/json;charset=utf-8'}) if captions: return { @@ -234,7 +229,7 @@ def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_ }, { 'ext': 'srt', 'data': self._convert_subtitles(duration, captions), - }] + }], } @staticmethod @@ -255,10 +250,8 @@ def _convert_subtitles(duration, subs): continue srt += os.linesep.join( ( - '%d' % num, - '%s --> %s' % ( - srt_subtitles_timecode(start), - srt_subtitles_timecode(end)), + f'{num}', + f'{srt_subtitles_timecode(start)} --> {srt_subtitles_timecode(end)}', text, os.linesep, )) @@ -272,10 +265,10 @@ def _real_extract(self, url): clip_idx = qs.get('clip', [None])[0] course_name = qs.get('course', [None])[0] - if any(not f for f in (author, name, clip_idx, course_name,)): + if any(not f for f in (author, name, clip_idx, course_name)): raise ExtractorError('Invalid URL', expected=True) - display_id = '%s-%s' % (name, clip_idx) + display_id = f'{name}-{clip_idx}' course = self._download_course(course_name, url, display_id) @@ -291,7 +284,7 @@ def _real_extract(self, url): clip_index = clip_.get('index') if clip_index is None: continue - if compat_str(clip_index) == clip_idx: + if str(clip_index) == clip_idx: clip = clip_ break @@ -308,14 +301,14 @@ def _real_extract(self, url): 'high-widescreen': {'width': 1280, 'height': 720}, } - QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen') quality_key = qualities(QUALITIES_PREFERENCE) AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) ALLOWED_QUALITIES = ( - AllowedQuality('webm', ['high', ]), - AllowedQuality('mp4', ['low', 'medium', 'high', ]), + AllowedQuality('webm', ['high']), + AllowedQuality('mp4', ['low', 'medium', 'high']), ) # Some courses also offer widescreen resolution for high quality (see @@ -359,23 +352,23 @@ def guess_allowed_qualities(): 'mediaType': ext, 'quality': '%dx%d' % (f['width'], f['height']), } - format_id = '%s-%s' % (ext, quality) + format_id = f'{ext}-{quality}' try: viewclip = self._download_json( self._GRAPHQL_EP, display_id, - 'Downloading %s viewclip graphql' % format_id, + f'Downloading {format_id} viewclip graphql', data=json.dumps({ 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post, - 'variables': {} - }).encode('utf-8'), + 'variables': {}, + }).encode(), headers=self._GRAPHQL_HEADERS)['data']['viewClip'] except ExtractorError: # Still works but most likely will go soon viewclip = self._download_json( - 
'%s/video/clips/viewclip' % self._API_BASE, display_id, - 'Downloading %s viewclip JSON' % format_id, fatal=False, - data=json.dumps(clip_post).encode('utf-8'), + f'{self._API_BASE}/video/clips/viewclip', display_id, + f'Downloading {format_id} viewclip JSON', fatal=False, + data=json.dumps(clip_post).encode(), headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start @@ -404,7 +397,7 @@ def guess_allowed_qualities(): clip_f.update({ 'url': clip_url, 'ext': ext, - 'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id, + 'format_id': f'{format_id}-{cdn}' if cdn else format_id, 'quality': quality_key(quality), 'source_preference': int_or_none(clip_url_data.get('rank')), }) @@ -472,7 +465,7 @@ def _real_extract(self, url): if clip_index is None: continue clip_url = update_url_query( - '%s/player' % self._API_BASE, query={ + f'{self._API_BASE}/player', query={ 'mode': 'live', 'course': course_name, 'author': author, diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py index caffeb21df..234ee987b6 100644 --- a/yt_dlp/extractor/plutotv.py +++ b/yt_dlp/extractor/plutotv.py @@ -1,11 +1,8 @@ import re +import urllib.parse import uuid from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( ExtractorError, float_or_none, @@ -16,6 +13,7 @@ class PlutoTVIE(InfoExtractor): + _WORKING = False _VALID_URL = r'''(?x) https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand /(?P<video_type>movies|series) @@ -30,14 +28,14 @@ class PlutoTVIE(InfoExtractor): _INFO_QUERY_PARAMS = { 'appName': 'web', 'appVersion': 'na', - 'clientID': compat_str(uuid.uuid1()), + 'clientID': str(uuid.uuid1()), 'clientModelNumber': 'na', 'serverSideAds': 'false', 'deviceMake': 'unknown', 'deviceModel': 'web', 'deviceType': 'web', 'deviceVersion': 'unknown', - 'sid': compat_str(uuid.uuid1()), + 'sid': str(uuid.uuid1()), } _TESTS = [ { @@ -53,21 +51,21 @@ class PlutoTVIE(InfoExtractor): 'season_number': 2, 'episode_number': 3, 'duration': 3600, - } + }, }, { 'url': 'https://pluto.tv/on-demand/series/i-love-money/season/1/', 'playlist_count': 11, 'info_dict': { 'id': '5de6c582e9379ae4912dedbd', 'title': 'I Love Money - Season 1', - } + }, }, { 'url': 'https://pluto.tv/on-demand/series/i-love-money/', 'playlist_count': 26, 'info_dict': { 'id': '5de6c582e9379ae4912dedbd', 'title': 'I Love Money', - } + }, }, { 'url': 'https://pluto.tv/on-demand/movies/arrival-2015-1-1', 'md5': '3cead001d317a018bf856a896dee1762', @@ -77,7 +75,7 @@ class PlutoTVIE(InfoExtractor): 'title': 'Arrival', 'description': 'When mysterious spacecraft touch down across the globe, an elite team - led by expert translator Louise Banks (Academy Award® nominee Amy Adams) – races against time to decipher their intent.', 'duration': 9000, - } + }, }, { 'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1', 'only_matching': True, @@ -94,8 +92,8 @@ class PlutoTVIE(InfoExtractor): 'title': 'Attack of the Killer Tomatoes', 'description': 'A group of scientists band together to save the world from mutated tomatoes that KILL! 
(1978)',
             'duration': 5700,
-            }
-        }
+            },
+        },
     ]
 
     def _to_ad_free_formats(self, video_id, formats, subtitles):
@@ -111,14 +109,14 @@ def _to_ad_free_formats(self, video_id, formats, subtitles):
                 re.MULTILINE)
             if first_segment_url:
                 m3u8_urls.add(
-                    compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8'))
+                    urllib.parse.urljoin(first_segment_url.group(1), '0-end/master.m3u8'))
                 continue
             first_segment_url = re.search(
                 r'^(https?://.*/).+\-0+[0-1]0\.ts$', res,
                 re.MULTILINE)
             if first_segment_url:
                 m3u8_urls.add(
-                    compat_urlparse.urljoin(first_segment_url.group(1), 'master.m3u8'))
+                    urllib.parse.urljoin(first_segment_url.group(1), 'master.m3u8'))
                 continue
 
         for m3u8_url in m3u8_urls:
diff --git a/yt_dlp/extractor/podbayfm.py b/yt_dlp/extractor/podbayfm.py
index 2a26fd2b36..0141eca909 100644
--- a/yt_dlp/extractor/podbayfm.py
+++ b/yt_dlp/extractor/podbayfm.py
@@ -1,28 +1,40 @@
 from .common import InfoExtractor
-from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call
+from ..utils import (
+    OnDemandPagedList,
+    clean_html,
+    int_or_none,
+    jwt_decode_hs256,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
 
 
-def result_from_props(props, episode_id=None):
+def result_from_props(props):
     return {
-        'id': props.get('podcast_id') or episode_id,
-        'title': props.get('title'),
-        'url': props['mediaURL'],
+        **traverse_obj(props, {
+            'id': ('_id', {str}),
+            'title': ('title', {str}),
+            'url': ('mediaURL', {url_or_none}),
+            'description': ('description', {clean_html}),
+            'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
+            'timestamp': ('timestamp', {int_or_none}),
+            'duration': ('duration', {int_or_none}),
+        }),
         'ext': 'mp3',
-        'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']),
-        'timestamp': props.get('timestamp'),
-        'duration': int_or_none(props.get('duration')),
+        'vcodec': 'none',
     }
 
 
 class PodbayFMIE(InfoExtractor):
-    _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$'
+    _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
-        'md5': '98b41285dcf7989d105a4ed0404054cf',
+        'md5': '895ac8505de349515f5ee8a4a3195c93',
        'info_dict': {
-            'id': '1647338400',
+            'id': '62306451f4a48e58d0c4d6a8',
             'title': 'Part One: Kissinger',
             'ext': 'mp3',
+            'description': r're:^We begin our epic six part series on Henry Kissinger.+',
             'thumbnail': r're:^https?://.*\.jpg',
             'timestamp': 1647338400,
             'duration': 5001,
@@ -34,24 +46,25 @@ def _real_extract(self, url):
         episode_id = self._match_id(url)
         webpage = self._download_webpage(url, episode_id)
         data = self._search_nextjs_data(webpage, episode_id)
-        return result_from_props(data['props']['pageProps']['episode'], episode_id)
+        return result_from_props(data['props']['pageProps']['episode'])
 
 
 class PodbayFMChannelIE(InfoExtractor):
-    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$'
+    _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
     _TESTS = [{
         'url': 'https://podbay.fm/p/behind-the-bastards',
         'info_dict': {
             'id': 'behind-the-bastards',
             'title': 'Behind the Bastards',
         },
+        'playlist_mincount': 21,
     }]
     _PAGE_SIZE = 10
 
     def _fetch_page(self, channel_id, pagenum):
         return self._download_json(
             f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
-            channel_id)['podcast']
+            channel_id, f'Downloading channel JSON page {pagenum + 1}')['podcast']
 
     @staticmethod
     def _results_from_page(channel_id, page):
diff --git a/yt_dlp/extractor/podchaser.py 
b/yt_dlp/extractor/podchaser.py index 290c488171..4570f0f175 100644 --- a/yt_dlp/extractor/podchaser.py +++ b/yt_dlp/extractor/podchaser.py @@ -29,8 +29,8 @@ class PodchaserIE(InfoExtractor): 'duration': 3708, 'timestamp': 1636531259, 'upload_date': '20211110', - 'rating': 4.0 - } + 'average_rating': 4.0, + }, }, { 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', 'info_dict': { @@ -38,15 +38,15 @@ class PodchaserIE(InfoExtractor): 'title': 'The Bone Zone', 'description': 'Podcast by The Bone Zone', }, - 'playlist_count': 275 + 'playlist_count': 275, }, { 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', 'info_dict': { 'id': '699349', 'title': 'Sean Carroll\'s Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas', - 'description': 'md5:2cbd8f4749891a84dc8235342e0b5ff1' + 'description': 'md5:2cbd8f4749891a84dc8235342e0b5ff1', }, - 'playlist_mincount': 225 + 'playlist_mincount': 225, }] @staticmethod @@ -59,7 +59,7 @@ def _parse_episode(episode, podcast): 'thumbnail': episode.get('image_url'), 'duration': str_to_int(episode.get('length')), 'timestamp': unified_timestamp(episode.get('air_date')), - 'rating': float_or_none(episode.get('rating')), + 'average_rating': float_or_none(episode.get('rating')), 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), 'tags': traverse_obj(podcast, ('tags', ..., 'text')), 'series': podcast.get('title'), @@ -77,9 +77,9 @@ def _fetch_page(self, podcast_id, podcast, page): 'count': self._PAGE_SIZE, 'sort_order': 'SORT_ORDER_RECENT', 'filters': { - 'podcast_id': podcast_id + 'podcast_id': podcast_id, }, - 'options': {} + 'options': {}, }).encode()) for episode in json_response['entities']: diff --git a/yt_dlp/extractor/podomatic.py b/yt_dlp/extractor/podomatic.py index 985bfae9dd..b5af82471a 100644 --- a/yt_dlp/extractor/podomatic.py +++ b/yt_dlp/extractor/podomatic.py @@ -5,6 +5,7 @@ class PodomaticIE(InfoExtractor): + _WORKING = False IE_NAME = 'podomatic' _VALID_URL = r'''(?x) (?P<proto>https?):// @@ -25,7 +26,7 @@ class PodomaticIE(InfoExtractor): 'uploader_id': 'scienceteachingtips', 'title': '64. 
When the Moon Hits Your Eye', 'duration': 446, - } + }, }, { 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', 'md5': 'd2cf443931b6148e27638650e2638297', @@ -36,7 +37,7 @@ class PodomaticIE(InfoExtractor): 'uploader_id': 'ostbahnhof', 'title': 'Einunddreizig', 'duration': 3799, - } + }, }, { 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', 'only_matching': True, @@ -47,16 +48,15 @@ def _real_extract(self, url): video_id = mobj.group('id') channel = mobj.group('channel') or mobj.group('channel_2') - json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' - + '?permalink=true&rtmp=0') % - (mobj.group('proto'), channel, video_id)) + json_url = ('{}://{}.podomatic.com/entry/embed_params/{}?permalink=true&rtmp=0'.format( + mobj.group('proto'), channel, video_id)) data_json = self._download_webpage( json_url, video_id, 'Downloading video info') data = json.loads(data_json) video_url = data['downloadLink'] if not video_url: - video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) + video_url = '{}/{}'.format(data['streamer'].replace('rtmp', 'http'), data['mediaLocation']) uploader = data['podcast'] title = data['title'] thumbnail = data['imageLocation'] diff --git a/yt_dlp/extractor/pokemon.py b/yt_dlp/extractor/pokemon.py index 0911893d43..1769684f72 100644 --- a/yt_dlp/extractor/pokemon.py +++ b/yt_dlp/extractor/pokemon.py @@ -48,7 +48,7 @@ def _real_extract(self, url): video_id, display_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id or display_id) video_data = extract_attributes(self._search_regex( - r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + r'(<[^>]+data-video-id="{}"[^>]*>)'.format(video_id if video_id else '[a-z0-9]{32}'), webpage, 'video data element')) video_id = video_data['data-video-id'] title = video_data.get('data-video-title') or self._html_search_meta( @@ -57,7 +57,7 @@ def _real_extract(self, url): return { '_type': 'url_transparent', 'id': video_id, - 'url': 'limelight:media:%s' % video_id, + 'url': f'limelight:media:{video_id}', 'title': title, 'description': video_data.get('data-video-summary'), 'thumbnail': video_data.get('data-video-poster'), @@ -80,13 +80,13 @@ class PokemonWatchIE(InfoExtractor): 'ext': 'mp4', 'title': 'Lillier and the Staff!', 'description': 'md5:338841b8c21b283d24bdc9b568849f04', - } + }, }, { 'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2', - 'only_matching': True + 'only_matching': True, }, { 'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07', - 'only_matching': True + 'only_matching': True, }] def _extract_media(self, channel_array, video_id): @@ -102,7 +102,7 @@ def _real_extract(self, url): info = { '_type': 'url', 'id': video_id, - 'url': 'limelight:media:%s' % video_id, + 'url': f'limelight:media:{video_id}', 'ie_key': 'LimelightMedia', } @@ -120,7 +120,7 @@ def _real_extract(self, url): if video_data is None: raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) + f'Video {video_id} does not exist', expected=True) info['_type'] = 'url_transparent' images = video_data.get('images') diff --git a/yt_dlp/extractor/pokergo.py b/yt_dlp/extractor/pokergo.py index 5c7baadf24..72cbce0a0c 100644 --- a/yt_dlp/extractor/pokergo.py +++ b/yt_dlp/extractor/pokergo.py @@ -5,6 +5,7 @@ ExtractorError, try_get, ) +from ..utils.traversal import traverse_obj class 
PokerGoBaseIE(InfoExtractor): @@ -49,26 +50,27 @@ class PokerGoIE(PokerGoBaseIE): 'episode': 'Episode 2', 'display_id': '2a70ec4e-4a80-414b-97ec-725d9b72a7dc', }, - 'params': {'skip_download': True} + 'params': {'skip_download': True}, }] def _real_extract(self, url): - id = self._match_id(url) - data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/videos/{id}', id, - headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] + video_id = self._match_id(url) + data_json = self._download_json( + f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/videos/{video_id}', video_id, + headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] v_id = data_json['source'] thumbnails = [{ 'url': image['url'], 'id': image.get('label'), 'width': image.get('width'), - 'height': image.get('height') + 'height': image.get('height'), } for image in data_json.get('images') or [] if image.get('url')] - series_json = next(dct for dct in data_json.get('show_tags') or [] if dct.get('video_id') == id) or {} + series_json = traverse_obj(data_json, ('show_tags', lambda _, v: v['video_id'] == video_id, any)) or {} return { '_type': 'url_transparent', - 'display_id': id, + 'display_id': video_id, 'title': data_json.get('title'), 'description': data_json.get('description'), 'duration': data_json.get('duration'), @@ -76,7 +78,7 @@ def _real_extract(self, url): 'season_number': series_json.get('season'), 'episode_number': series_json.get('episode_number'), 'series': try_get(series_json, lambda x: x['tag']['name']), - 'url': f'https://cdn.jwplayer.com/v2/media/{v_id}' + 'url': f'https://cdn.jwplayer.com/v2/media/{v_id}', } @@ -91,9 +93,10 @@ class PokerGoCollectionIE(PokerGoBaseIE): }, }] - def _entries(self, id): - data_json = self._download_json(f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/collections/{id}?include=entities', - id, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] + def _entries(self, playlist_id): + data_json = self._download_json( + f'https://api.pokergo.com/v2/properties/{self._PROPERTY_ID}/collections/{playlist_id}?include=entities', + playlist_id, headers={'authorization': f'Bearer {self._AUTH_TOKEN}'})['data'] for video in data_json.get('collection_video') or []: video_id = video.get('id') if video_id: @@ -102,5 +105,5 @@ def _entries(self, id): ie=PokerGoIE.ie_key(), video_id=video_id) def _real_extract(self, url): - id = self._match_id(url) - return self.playlist_result(self._entries(id), playlist_id=id) + playlist_id = self._match_id(url) + return self.playlist_result(self._entries(playlist_id), playlist_id=playlist_id) diff --git a/yt_dlp/extractor/polsatgo.py b/yt_dlp/extractor/polsatgo.py index 1524a1fb9f..d4a0d6ab87 100644 --- a/yt_dlp/extractor/polsatgo.py +++ b/yt_dlp/extractor/polsatgo.py @@ -1,12 +1,12 @@ -from uuid import uuid4 import json +import uuid from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, try_get, url_or_none, - ExtractorError, ) @@ -33,7 +33,7 @@ def _extract_formats(self, sources, video_id): continue yield { 'url': url, - 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1])) + 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1])), } def _real_extract(self, url): @@ -47,11 +47,11 @@ def _real_extract(self, url): 'id': video_id, 'title': media['displayInfo']['title'], 'formats': formats, - 'age_limit': int_or_none(media['displayInfo']['ageGroup']) + 'age_limit': int_or_none(media['displayInfo']['ageGroup']), } def 
_call_api(self, endpoint, media_id, method, params): - rand_uuid = str(uuid4()) + rand_uuid = str(uuid.uuid4()) res = self._download_json( f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id, note=f'Downloading {method} JSON metadata', @@ -77,7 +77,7 @@ def _call_api(self, endpoint, media_id, method, params): 'clientId': rand_uuid, 'cpid': 1, }, - }).encode('utf-8'), + }).encode(), headers={'Content-type': 'application/json'}) if not res.get('result'): if res['error']['code'] == 13404: diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 68c4a2afd0..6fb21e156d 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -2,26 +2,23 @@ import json import math import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, - compat_urlparse -) from ..utils import ( - determine_ext, - extract_attributes, ExtractorError, InAdvancePagedList, + determine_ext, + extract_attributes, int_or_none, js_to_json, parse_iso8601, strip_or_none, traverse_obj, - unified_timestamp, unescapeHTML, + unified_timestamp, url_or_none, + urljoin, ) @@ -39,12 +36,12 @@ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): media_urls.add(media_url) entry = base_data.copy() entry.update({ - 'id': compat_str(media['id']), + 'id': str(media['id']), 'url': media_url, 'duration': int_or_none(media.get('length')), 'vcodec': 'none' if media.get('provider') == 'audio' else None, }) - entry_title = compat_urllib_parse_unquote(media['desc']) + entry_title = urllib.parse.unquote(media['desc']) if entry_title: entry['title'] = entry_title yield entry @@ -70,7 +67,7 @@ class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): 'timestamp': 1592654400, 'upload_date': '20200620', 'duration': 1430, - 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$', }, }], }, { @@ -130,10 +127,11 @@ def _real_extract(self, url): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioIE(InfoExtractor): - # new next.js sites, excluding radiokierowcow.pl - _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio(?:24)?\.pl/artykul/(?P<id>\d+)' +class PolskieRadioIE(PolskieRadioBaseExtractor): + # new next.js sites + _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)' _TESTS = [{ + # articleData, attachments 'url': 'https://jedynka.polskieradio.pl/artykul/1587943', 'info_dict': { 'id': '1587943', @@ -148,6 +146,31 @@ class PolskieRadioIE(InfoExtractor): 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', }, }], + }, { + # post, legacy html players + 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager', + 'info_dict': { + 'id': '2589163', + 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?', + 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2577880', + 'ext': 'mp3', + 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a', + 'duration': 321, + }, + }], + }, { + # data, legacy + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0', + }, + 'playlist_count': 3, }, { 'url': 'https://trojka.polskieradio.pl/artykul/1632955', 'only_matching': True, @@ -166,7 +189,8 @@ def _real_extract(self, url): webpage = 
         webpage = self._download_webpage(url, playlist_id)
 
         article_data = traverse_obj(
-            self._search_nextjs_data(webpage, playlist_id), ('props', 'pageProps', 'data', 'articleData'))
+            self._search_nextjs_data(webpage, playlist_id), (
+                'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False)
 
         title = strip_or_none(article_data['title'])
 
@@ -178,7 +202,13 @@ def _real_extract(self, url):
             'id': self._search_regex(
                 r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'),
             'title': strip_or_none(entry.get('description')) or title,
-        } for entry in article_data.get('attachments') or () if entry['fileType'] in ('Audio', )]
+        } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )]
+
+        if not entries:
+            # some legacy articles have no json attachments, but players in body
+            entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, {
+                'title': title,
+            })
 
         return self.playlist_result(entries, playlist_id, title, description)
 
@@ -214,6 +244,15 @@ class PolskieRadioAuditionIE(InfoExtractor):
             'thumbnail': r're:https://static\.prsa\.pl/images/.+',
         },
         'playlist_mincount': 722,
+    }, {
+        # some articles were "promoted to main page" and thus link to old frontend
+        'url': 'https://trojka.polskieradio.pl/audycja/305',
+        'info_dict': {
+            'id': '305',
+            'title': 'Co w mowie piszczy?',
+            'thumbnail': r're:https://static\.prsa\.pl/images/.+',
+        },
+        'playlist_count': 1523,
     }]
 
     def _call_lp3(self, path, query, video_id, note):
@@ -222,14 +261,14 @@ def _call_lp3(self, path, query, video_id, note):
             query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'})
 
     def _entries(self, playlist_id, has_episodes, has_articles):
-        for i in itertools.count(1) if has_episodes else []:
+        for i in itertools.count(0) if has_episodes else []:
             page = self._call_lp3(
                 'AudioArticle/GetListByCategoryId', {
                     'categoryId': playlist_id,
                     'PageSize': 10,
                     'skip': i,
                     'format': 400,
-                }, playlist_id, f'Downloading episode list page {i}')
+                }, playlist_id, f'Downloading episode list page {i + 1}')
             if not traverse_obj(page, 'data'):
                 break
             for episode in page['data']:
@@ -241,20 +280,19 @@ def _entries(self, playlist_id, has_episodes, has_articles):
                     'timestamp': parse_iso8601(episode.get('datePublic')),
                 }
 
-        for i in itertools.count(1) if has_articles else []:
+        for i in itertools.count(0) if has_articles else []:
             page = self._call_lp3(
                 'Article/GetListByCategoryId', {
                     'categoryId': playlist_id,
                     'PageSize': 9,
                     'skip': i,
                     'format': 400,
-                }, playlist_id, f'Downloading article list page {i}')
+                }, playlist_id, f'Downloading article list page {i + 1}')
             if not traverse_obj(page, 'data'):
                 break
             for article in page['data']:
                 yield {
                     '_type': 'url_transparent',
-                    'ie_key': PolskieRadioIE.ie_key(),
                     'id': str(article['id']),
                     'url': article['url'],
                     'title': article.get('shortTitle'),
@@ -282,24 +320,51 @@ def _real_extract(self, url):
 class PolskieRadioCategoryIE(InfoExtractor):
     # legacy sites
     IE_NAME = 'polskieradio:category'
-    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
     _TESTS = [{
-        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
-        'only_matching': True
-    }, {
         'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
         'info_dict': {
             'id': '4143',
             'title': 'Kierunek Kraków',
         },
-        'playlist_mincount': 61
+        'playlist_mincount': 61,
     }, {
         'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
         'info_dict': {
             'id': '214',
             'title': 'Muzyka',
         },
-        'playlist_mincount': 61
+        'playlist_mincount': 61,
+    }, {
+        # billennium tabs
+        'url': 'https://www.polskieradio.pl/8/2385',
+        'info_dict': {
+            'id': '2385',
+            'title': 'Droga przez mąkę',
+        },
+        'playlist_mincount': 111,
+    }, {
+        'url': 'https://www.polskieradio.pl/10/4930',
+        'info_dict': {
+            'id': '4930',
+            'title': 'Teraz K-pop!',
+        },
+        'playlist_mincount': 392,
+    }, {
+        # post back pages, audio content directly without articles
+        'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
+        'info_dict': {
+            'id': '7376',
+            'title': 'Nowa mowa',
+        },
+        'playlist_mincount': 244,
+    }, {
+        'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
+        'info_dict': {
+            'id': '175458',
+            'title': 'Krzysztof Dziuba',
+        },
+        'playlist_mincount': 420,
     }, {
         'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
         'only_matching': True,
@@ -311,25 +376,61 @@ def suitable(cls, url):
 
     def _entries(self, url, page, category_id):
         content = page
+        is_billennium_tabs = 'onclick="TB_LoadTab(' in page
+        is_post_back = 'onclick="__doPostBack(' in page
+        pagination = page if is_billennium_tabs else None
         for page_num in itertools.count(2):
             for a_entry, entry_id in re.findall(
-                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
+                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                     content):
                 entry = extract_attributes(a_entry)
-                href = entry.get('href')
-                if not href:
-                    continue
-                yield self.url_result(
-                    compat_urlparse.urljoin(url, href), PolskieRadioLegacyIE,
-                    entry_id, entry.get('title'))
-            mobj = re.search(
-                r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
-                content)
-            if not mobj:
-                break
-            next_url = compat_urlparse.urljoin(url, mobj.group('url'))
-            content = self._download_webpage(
-                next_url, category_id, 'Downloading page %s' % page_num)
+                if entry.get('href'):
+                    yield self.url_result(
+                        urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title'))
+            for a_entry in re.findall(r'<span data-media=({[^ ]+})', content):
+                yield traverse_obj(self._parse_json(a_entry, category_id), {
+                    'url': 'file',
+                    'id': 'uid',
+                    'duration': 'length',
+                    'title': ('title', {urllib.parse.unquote}),
+                    'description': ('desc', {urllib.parse.unquote}),
+                })
+            if is_billennium_tabs:
+                params = self._search_json(
+                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
+                    pagination, 'next page params', category_id, default=None, close_objects=1,
+                    contains_pattern='.+', transform_source=lambda x: f'[{js_to_json(unescapeHTML(x))}')
+                if not params:
+                    break
+                tab_content = self._download_json(
+                    'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
+                    category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
+                    data=json.dumps(dict(zip((
+                        'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
+                        'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
+                        'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber',
+                    ), params))).encode())['d']
+                content, pagination = tab_content['Content'], tab_content.get('PagerContent')
+            elif is_post_back:
+                target = self._search_regex(
+                    r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)',
+                    content, 'pagination postback target', group='target', default=None)
+                if not target:
+                    break
+                content = self._download_webpage(
+                    url, category_id, f'Downloading page {page_num}',
+                    data=urllib.parse.urlencode({
+                        **self._hidden_inputs(content),
+                        '__EVENTTARGET': target,
+                        '__EVENTARGUMENT': 'Next',
+                    }).encode())
+            else:
+                next_url = urljoin(url, self._search_regex(
+                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
+                    content, 'next page url', group='url', default=None))
+                if not next_url:
+                    break
+                content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')
 
     def _real_extract(self, url):
         category_id = self._match_id(url)
@@ -337,7 +438,7 @@ def _real_extract(self, url):
         if PolskieRadioAuditionIE.suitable(urlh.url):
             return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id)
         title = self._html_search_regex(
-            r'<title>([^<]+) - [^<]+ - [^<]+</title>',
+            r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
             webpage, 'title', fatal=False)
         return self.playlist_result(
             self._entries(url, webpage, category_id),
@@ -409,7 +510,7 @@ def _real_extract(self, url):
             })
 
         return {
-            'id': compat_str(channel['id']),
+            'id': str(channel['id']),
             'formats': formats,
             'title': channel.get('name') or channel.get('streamName'),
             'display_id': channel_url,
@@ -501,44 +602,8 @@ def _real_extract(self, url):
             podcast_id, 'Downloading podcast metadata',
             data=json.dumps({
                 'guids': [podcast_id],
-            }).encode('utf-8'),
+            }).encode(),
             headers={
                 'Content-Type': 'application/json',
             })
         return self._parse_episode(data[0])
-
-
-class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
-    _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
-    IE_NAME = 'polskieradio:kierowcow'
-
-    _TESTS = [{
-        'url': 'https://radiokierowcow.pl/artykul/2694529',
-        'info_dict': {
-            'id': '2694529',
-            'title': 'Zielona fala reliktem przeszłości?',
-            'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
-        },
-        'playlist_count': 3,
-    }]
-
-    def _real_extract(self, url):
-        media_id = self._match_id(url)
-        webpage = self._download_webpage(url, media_id)
-        nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId']
-        article = self._download_json(
-            f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}',
-            media_id)
-        data = article['pageProps']['data']
-        title = data['title']
-        entries = self._extract_webpage_player_entries(data['content'], media_id, {
-            'title': title,
-        })
-
-        return {
-            '_type': 'playlist',
-            'id': media_id,
-            'entries': entries,
-            'title': title,
-            'description': data.get('lead'),
-        }
diff --git a/yt_dlp/extractor/popcorntimes.py b/yt_dlp/extractor/popcorntimes.py
index ddc5ec8c8e..cfece86ee7 100644
--- a/yt_dlp/extractor/popcorntimes.py
+++ b/yt_dlp/extractor/popcorntimes.py
@@ -1,5 +1,6 @@
+import base64
+
 from .common import InfoExtractor
-from ..compat import compat_b64decode
 from ..utils import int_or_none
 
 
@@ -49,7 +50,7 @@ def _real_extract(self, url):
             c_ord -= 26
             loc_b64 += chr(c_ord)
 
-        video_url = compat_b64decode(loc_b64).decode('utf-8')
+        video_url = base64.b64decode(loc_b64).decode('utf-8')
 
         description = self._html_search_regex(
             r'(?s)<div[^>]+class=["\']pt-movie-desc[^>]+>(.+?)</div>', webpage,
diff --git a/yt_dlp/extractor/popcorntv.py b/yt_dlp/extractor/popcorntv.py
index 77984626f1..2897bb4648 100644
--- a/yt_dlp/extractor/popcorntv.py
+++ b/yt_dlp/extractor/popcorntv.py
@@ -37,7 +37,7 @@ def _real_extract(self, url):
         m3u8_url = extract_attributes(
             self._search_regex(
                 r'(<a[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)',
-                webpage, 'content'
+                webpage, 'content',
             ))['href']
 
         formats = self._extract_m3u8_formats(
diff --git a/yt_dlp/extractor/porn91.py b/yt_dlp/extractor/porn91.py
deleted file mode 100644
index 7d16a16319..0000000000
--- a/yt_dlp/extractor/porn91.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import urllib.parse
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    int_or_none,
-    parse_duration,
-    remove_end,
-    unified_strdate,
-    ExtractorError,
-)
-
-
-class Porn91IE(InfoExtractor):
-    IE_NAME = '91porn'
-    _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/view_video.php\?([^#]+&)?viewkey=(?P<id>\w+)'
-
-    _TESTS = [{
-        'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
-        'md5': 'd869db281402e0ef4ddef3c38b866f86',
-        'info_dict': {
-            'id': '7e42283b4f5ab36da134',
-            'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
-            'description': 'md5:1ff241f579b07ae936a54e810ad2e891',
-            'ext': 'mp4',
-            'duration': 431,
-            'upload_date': '20150520',
-            'comment_count': int,
-            'view_count': int,
-            'age_limit': 18,
-        }
-    }, {
-        'url': 'https://91porn.com/view_video.php?viewkey=7ef0cf3d362c699ab91c',
-        'md5': 'f8fd50540468a6d795378cd778b40226',
-        'info_dict': {
-            'id': '7ef0cf3d362c699ab91c',
-            'title': '真实空乘,冲上云霄第二部',
-            'description': 'md5:618bf9652cafcc66cd277bd96789baea',
-            'ext': 'mp4',
-            'duration': 248,
-            'upload_date': '20221119',
-            'comment_count': int,
-            'view_count': int,
-            'age_limit': 18,
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        self._set_cookie('91porn.com', 'language', 'cn_CN')
-
-        webpage = self._download_webpage(
-            'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id)
-
-        if '视频不存在,可能已经被删除或者被举报为不良内容!' in webpage:
-            raise ExtractorError('91 Porn says: Video does not exist', expected=True)
-
-        daily_limit = self._search_regex(
-            r'作为游客,你每天只可观看([\d]+)个视频', webpage, 'exceeded daily limit', default=None, fatal=False)
-        if daily_limit:
-            raise ExtractorError(f'91 Porn says: Daily limit {daily_limit} videos exceeded', expected=True)
-
-        video_link_url = self._search_regex(
-            r'document\.write\(\s*strencode2\s*\(\s*((?:"[^"]+")|(?:\'[^\']+\'))', webpage, 'video link')
-        video_link_url = self._search_regex(
-            r'src=["\']([^"\']+)["\']', urllib.parse.unquote(video_link_url), 'unquoted video link')
-
-        formats, subtitles = self._get_formats_and_subtitle(video_link_url, video_id)
-
-        return {
-            'id': video_id,
-            'title': remove_end(self._html_extract_title(webpage).replace('\n', ''), 'Chinese homemade video').strip(),
-            'formats': formats,
-            'subtitles': subtitles,
-            'upload_date': unified_strdate(self._search_regex(
-                r'(\d{4}-\d{2}-\d{2})', webpage, 'upload_date', fatal=False)),
-            'description': self._html_search_regex(
-                r'<span class="more title">\s*([^<]+)', webpage, 'description', fatal=False),
-            'duration': parse_duration(self._search_regex(
-                r'时长:\s*<span[^>]*>\s*(\d+(?::\d+){1,2})', webpage, 'duration', fatal=False)),
-            'comment_count': int_or_none(self._search_regex(
-                r'留言:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)),
-            'view_count': int_or_none(self._search_regex(
-                r'热度:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'view count', fatal=False)),
-            'age_limit': 18,
-        }
-
-    def _get_formats_and_subtitle(self, video_link_url, video_id):
-        ext = determine_ext(video_link_url)
-        if ext == 'm3u8':
-            formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_link_url, video_id, ext='mp4')
-        else:
-            formats = [{'url': video_link_url, 'ext': ext}]
-            subtitles = {}
-
-        return formats, subtitles
diff --git a/yt_dlp/extractor/pornbox.py b/yt_dlp/extractor/pornbox.py
new file mode 100644
index 0000000000..e15244dac0
--- /dev/null
+++ b/yt_dlp/extractor/pornbox.py
@@ -0,0 +1,113 @@
+from .common import InfoExtractor
+from ..compat import functools
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    qualities,
+    str_or_none,
+    traverse_obj,
+    url_or_none,
+)
+
+
+class PornboxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://pornbox.com/application/watch-page/212108',
+        'md5': '3ff6b6e206f263be4c5e987a3162ac6e',
+        'info_dict': {
+            'id': '212108',
+            'ext': 'mp4',
+            'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49',
+            'uploader': 'Lily Strong',
+            'timestamp': 1665871200,
+            'upload_date': '20221015',
+            'age_limit': 18,
+            'availability': 'needs_auth',
+            'duration': 1505,
+            'cast': ['Lily Strong', 'John Strong'],
+            'tags': 'count:11',
+            'description': 'md5:589c7f33e183aa8aa939537300efb859',
+            'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$',
+        },
+    }, {
+        'url': 'https://pornbox.com/application/watch-page/216045',
+        'info_dict': {
+            'id': '216045',
+            'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2',
+            'description': 'md5:3e631dcaac029f15ed434e402d1b06c7',
+            'uploader': 'VK Studio',
+            'timestamp': 1618264800,
+            'upload_date': '20210412',
+            'age_limit': 18,
+            'availability': 'premium_only',
+            'duration': 2710,
+            'cast': 'count:3',
+            'tags': 'count:29',
+            'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$',
+            'subtitles': 'count:6',
+        },
+        'params': {
+            'skip_download': True,
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': [
+            'You are either not logged in or do not have access to this scene',
+            'No video formats found', 'Requested format is not available'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id)
+
+        subtitles = {country_code: [{
+            'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}',
+            'ext': 'srt',
+        }] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))}
+
+        is_free_scene = traverse_obj(
+            public_data, ('price', 'is_available_for_free', {bool}), default=False)
+
+        metadata = {
+            'id': video_id,
+            **traverse_obj(public_data, {
+                'title': ('scene_name', {str.strip}),
+                'description': ('small_description', {str.strip}),
+                'uploader': 'studio',
+                'duration': ('runtime', {parse_duration}),
+                'cast': (('models', 'male_models'), ..., 'model_name'),
+                'thumbnail': ('player_poster', {url_or_none}),
+                'tags': ('niches', ..., 'niche'),
+            }),
+            'age_limit': 18,
+            'timestamp': parse_iso8601(traverse_obj(
+                public_data, ('studios', 'release_date'), 'publish_date')),
+            'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene),
+            'subtitles': subtitles,
+        }
+
+        if not public_data.get('is_purchased') or not is_free_scene:
+            self.raise_login_required(
+                'You are either not logged in or do not have access to this scene', metadata_available=True)
+            return metadata
+
+        media_id = traverse_obj(public_data, (
+            'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False)
+        if not media_id:
+            self.raise_no_formats('Could not find stream id', video_id=video_id)
+
+        stream_data = self._download_json(
+            f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls')
+
+        get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
+        metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
+            'url': 'src',
+            'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
+            'format_id': ('quality', {str_or_none}),
+            'quality': ('quality', {get_quality}),
+            'width': ('size', {lambda x: int(x[:-1])}),
+        }))
+
+        return metadata
diff --git a/yt_dlp/extractor/porncom.py b/yt_dlp/extractor/porncom.py
deleted file mode 100644
index c8ef240d70..0000000000
--- a/yt_dlp/extractor/porncom.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
-    int_or_none,
-    js_to_json,
-    parse_filesize,
-    str_to_int,
-)
-
-
-class PornComIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339',
-        'md5': '3f30ce76267533cd12ba999263156de7',
-        'info_dict': {
-            'id': '2603339',
-            'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec',
-            'ext': 'mp4',
-            'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 551,
-            'view_count': int,
-            'age_limit': 18,
-            'categories': list,
-            'tags': list,
-        },
-    }, {
-        'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id') or video_id
-
-        webpage = self._download_webpage(url, display_id)
-
-        config = self._parse_json(
-            self._search_regex(
-                (r'=\s*({.+?})\s*;\s*v1ar\b',
-                 r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='),
-                webpage, 'config', default='{}'),
-            display_id, transform_source=js_to_json, fatal=False)
-
-        if config:
-            title = config['title']
-            formats = [{
-                'url': stream['url'],
-                'format_id': stream.get('id'),
-                'height': int_or_none(self._search_regex(
-                    r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None))
-            } for stream in config['streams'] if stream.get('url')]
-            thumbnail = (compat_urlparse.urljoin(
-                config['thumbCDN'], config['poster'])
-                if config.get('thumbCDN') and config.get('poster') else None)
-            duration = int_or_none(config.get('length'))
-        else:
-            title = self._search_regex(
-                (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'),
-                webpage, 'title')
-            formats = [{
-                'url': compat_urlparse.urljoin(url, format_url),
-                'format_id': '%sp' % height,
-                'height': int(height),
-                'filesize_approx': parse_filesize(filesize),
-            } for format_url, height, filesize in re.findall(
-                r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<',
-                webpage)]
-            thumbnail = None
-            duration = None
-
-        view_count = str_to_int(self._search_regex(
-            (r'Views:\s*</span>\s*<span>\s*([\d,.]+)',
-             r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage,
-            'view count', fatal=False))
-
-        def extract_list(kind):
-            s = self._search_regex(
-                (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(),
-                 r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()),
-                webpage, kind, fatal=False)
-            return re.findall(r'<a[^>]+>([^<]+)</a>', s or '')
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'view_count': view_count,
-            'formats': formats,
-            'age_limit': 18,
-            'categories': extract_list('categories'),
-            'tags': extract_list('tags'),
-        }
diff --git a/yt_dlp/extractor/pornez.py b/yt_dlp/extractor/pornez.py
deleted file mode 100644
index 3a22cb8210..0000000000
--- a/yt_dlp/extractor/pornez.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from .common import InfoExtractor
-from ..utils import int_or_none, urljoin
-
-
-class PornezIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/'
-    _TEST = {
-        'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/',
-        'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc',
-        'info_dict': {
-            'id': '344819',
-            'ext': 'mp4',
-            'title': r'mistresst funny_penis_names wmv',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'age_limit': 18,
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        iframe_src = self._html_search_regex(
-            r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe', fatal=True)
-        iframe_src = urljoin('https://pornez.net', iframe_src)
-        title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None)
-        if title is None:
-            title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True)
-        thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'title', default=None)
-        webpage = self._download_webpage(iframe_src, video_id)
-        entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0]
-        for format in entries['formats']:
-            height = self._search_regex(r'_(\d+)\.m3u8', format['url'], 'height')
-            format['format_id'] = '%sp' % height
-            format['height'] = int_or_none(height)
-
-        entries.update({
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'age_limit': 18
-        })
-        return entries
diff --git a/yt_dlp/extractor/pornflip.py b/yt_dlp/extractor/pornflip.py
index 51a9cf38f7..bc684fd6fc 100644
--- a/yt_dlp/extractor/pornflip.py
+++ b/yt_dlp/extractor/pornflip.py
@@ -1,9 +1,5 @@
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    parse_duration,
-    parse_iso8601
-)
+from ..utils import int_or_none, parse_duration, parse_iso8601
 
 
 class PornFlipIE(InfoExtractor):
@@ -47,7 +43,7 @@ class PornFlipIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(
-            'https://{}/sv/{}'.format(self._HOST, video_id), video_id, headers={'host': self._HOST})
+            f'https://{self._HOST}/sv/{video_id}', video_id, headers={'host': self._HOST})
         description = self._html_search_regex(r'&p\[summary\]=(.*?)\s*&p', webpage, 'description', fatal=False)
         duration = self._search_regex(r'"duration":\s+"([^"]+)",', webpage, 'duration', fatal=False)
         view_count = self._search_regex(r'"interactionCount":\s+"([^"]+)"', webpage, 'view_count', fatal=False)
diff --git a/yt_dlp/extractor/pornhd.py b/yt_dlp/extractor/pornhd.py
deleted file mode 100644
index c8a1ec80b5..0000000000
--- a/yt_dlp/extractor/pornhd.py
+++ /dev/null
@@ -1,116 +0,0 @@
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-    int_or_none,
-    js_to_json,
-    merge_dicts,
-    urljoin,
-)
-
-
-class PornHdIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
-    _TESTS = [{
-        'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
-        'md5': '87f1540746c1d32ec7a2305c12b96b25',
-        'info_dict': {
-            'id': '9864',
-            'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
-            'ext': 'mp4',
-            'title': 'Restroom selfie masturbation',
-            'description': 'md5:3748420395e03e31ac96857a8f125b2b',
-            'thumbnail': r're:^https?://.*\.jpg',
-            'view_count': int,
-            'like_count': int,
-            'age_limit': 18,
-        },
-        'skip': 'HTTP Error 404: Not Found',
-    }, {
-        'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
-        'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de',
-        'info_dict': {
-            'id': '1962',
-            'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
-            'ext': 'mp4',
-            'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759',
-            'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
-            'thumbnail': r're:^https?://.*\.jpg',
-            'view_count': int,
-            'like_count': int,
-            'age_limit': 18,
-        },
-    }]
-
-    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        webpage = self._download_webpage(url, display_id or video_id)
-
-        title = self._html_search_regex(
-            [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
-             r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
-
-        sources = self._parse_json(js_to_json(self._search_regex(
-            r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
-            webpage, 'sources', default='{}')), video_id)
-
-        info = {}
-        if not sources:
-            entries = self._parse_html5_media_entries(url, webpage, video_id)
-            if entries:
-                info = entries[0]
-
-        if not sources and not info:
-            message = self._html_search_regex(
-                r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
-                webpage, 'error message', group='value')
-            raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
-
-        formats = []
-        for format_id, video_url in sources.items():
-            video_url = urljoin(url, video_url)
-            if not video_url:
-                continue
-            height = int_or_none(self._search_regex(
-                r'^(\d+)[pP]', format_id, 'height', default=None))
-            formats.append({
-                'url': video_url,
-                'ext': determine_ext(video_url, 'mp4'),
-                'format_id': format_id,
-                'height': height,
-            })
-
-        description = self._html_search_regex(
-            (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>',
-             r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1'),
-            webpage, 'description', fatal=False, group='value') or None
-        view_count = int_or_none(self._html_search_regex(
-            r'(\d+) views\s*<', webpage, 'view count', fatal=False))
-        thumbnail = self._search_regex(
-            r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
-            'thumbnail', default=None, group='url')
-
-        like_count = int_or_none(self._search_regex(
-            (r'(\d+)</span>\s*likes',
-             r'(\d+)\s*<[^>]+>(?:&nbsp;|\s)*\blikes',
-             r'class=["\']save-count["\'][^>]*>\s*(\d+)'),
-            webpage, 'like count', fatal=False))
-
-        return merge_dicts(info, {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'view_count': view_count,
-            'like_count': like_count,
-            'formats': formats,
-            'age_limit': 18,
-        })
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index 5d8d7c100a..679dc63234 100644
--- a/yt_dlp/extractor/pornhub.py
+++ b/yt_dlp/extractor/pornhub.py
@@ -3,11 +3,11 @@
 import math
 import operator
 import re
-import urllib.request
 
 from .common import InfoExtractor
 from .openload import PhantomJSwrapper
-from ..compat import compat_HTTPError, compat_str
+from ..networking import Request
+from ..networking.exceptions import HTTPError
 from ..utils import (
     NO_DEFAULT,
     ExtractorError,
@@ -46,8 +46,8 @@ def dl(*args, **kwargs):
                 r'document\.cookie\s*=\s*["\']RNKEY=',
                 r'document\.location\.reload\(true\)')):
             url_or_request = args[0]
-            url = (url_or_request.get_full_url()
-                   if isinstance(url_or_request, urllib.request.Request)
+            url = (url_or_request.url
+                   if isinstance(url_or_request, Request)
                    else url_or_request)
             phantom = PhantomJSwrapper(self, required_version='2.0')
             phantom.get(url, html=webpage)
@@ -58,6 +58,12 @@ def dl(*args, **kwargs):
     def _real_initialize(self):
         self._logged_in = False
 
+    def _set_age_cookies(self, host):
+        self._set_cookie(host, 'age_verified', '1')
+        self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
+        self._set_cookie(host, 'accessAgeDisclaimerUK', '1')
+        self._set_cookie(host, 'accessPH', '1')
+
     def _login(self, host):
         if self._logged_in:
             return
@@ -74,14 +80,14 @@ def _login(self, host):
         if username is None:
             return
 
-        login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '')
+        login_url = 'https://www.{}/{}login'.format(host, 'premium/' if 'premium' in host else '')
         login_page = self._download_webpage(
-            login_url, None, 'Downloading %s login page' % site)
+            login_url, None, f'Downloading {site} login page')
 
         def is_logged(webpage):
             return any(re.search(p, webpage) for p in (
-                r'class=["\']signOut',
-                r'>Sign\s+[Oo]ut\s*<'))
+                r'id="profileMenuDropdown"',
+                r'class="ph-icon-logout"'))
 
         if is_logged(login_page):
             self._logged_in = True
@@ -90,13 +96,13 @@ def is_logged(webpage):
         login_form = self._hidden_inputs(login_page)
 
         login_form.update({
-            'username': username,
+            'email': username,
             'password': password,
         })
 
         response = self._download_json(
-            'https://www.%s/front/authenticate' % host, None,
-            'Logging in to %s' % site,
+            f'https://www.{host}/front/authenticate', None,
+            f'Logging in to {site}',
             data=urlencode_postdata(login_form),
             headers={
                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
@@ -111,23 +117,23 @@ def is_logged(webpage):
         message = response.get('message')
         if message is not None:
             raise ExtractorError(
-                'Unable to login: %s' % message, expected=True)
+                f'Unable to login: {message}', expected=True)
 
         raise ExtractorError('Unable to log in')
 
 
 class PornHubIE(PornHubBaseIE):
     IE_DESC = 'PornHub and Thumbzilla'
-    _VALID_URL = r'''(?x)
+    _VALID_URL = rf'''(?x)
                     https?://
                     (?:
                         (?:[^/]+\.)?
-                        %s
+                        {PornHubBaseIE._PORNHUB_HOST_RE}
                         /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                         (?:www\.)?thumbzilla\.com/video/
                     )
                     (?P<id>[\da-z]+)
-                    ''' % PornHubBaseIE._PORNHUB_HOST_RE
+                    '''
     _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
@@ -189,8 +195,8 @@ class PornHubIE(PornHubBaseIE):
             'categories': list,
             'subtitles': {
                 'en': [{
-                    "ext": 'srt'
-                }]
+                    'ext': 'srt',
+                }],
             },
         },
         'params': {
@@ -259,7 +265,7 @@ class PornHubIE(PornHubBaseIE):
     }]
 
     def _extract_count(self, pattern, webpage, name):
-        return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None))
+        return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None))
 
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
@@ -267,14 +273,13 @@ def _real_extract(self, url):
         video_id = mobj.group('id')
 
         self._login(host)
-
-        self._set_cookie(host, 'age_verified', '1')
+        self._set_age_cookies(host)
 
         def dl_webpage(platform):
             self._set_cookie(host, 'platform', platform)
             return self._download_webpage(
-                'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
-                video_id, 'Downloading %s webpage' % platform)
+                f'https://www.{host}/view_video.php?viewkey={video_id}',
+                video_id, f'Downloading {platform} webpage')
 
         webpage = dl_webpage('pc')
 
@@ -285,7 +290,7 @@ def dl_webpage(platform):
         if error_msg:
             error_msg = re.sub(r'\s+', ' ', error_msg)
             raise ExtractorError(
-                'PornHub said: %s' % error_msg,
+                f'PornHub said: {error_msg}',
                 expected=True, video_id=video_id)
 
         if any(re.search(p, webpage) for p in (
@@ -326,7 +331,7 @@ def dl_webpage(platform):
             if not isinstance(definition, dict):
                 continue
             video_url = definition.get('videoUrl')
-            if not video_url or not isinstance(video_url, compat_str):
+            if not video_url or not isinstance(video_url, str):
                 continue
             if video_url in video_urls_set:
                 continue
@@ -386,7 +391,7 @@ def parse_quality_items(quality_items):
         if not video_urls:
             FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
             js_vars = extract_js_vars(
-                webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
+                webpage, r'(var\s+(?:{})_.+)'.format('|'.join(FORMAT_PREFIXES)),
                 default=None)
             if js_vars:
                 for key, format_url in js_vars.items():
@@ -397,7 +402,7 @@ def parse_quality_items(quality_items):
         if not video_urls and re.search(
                 r'<[^>]+\bid=["\']lockedPlayer', webpage):
             raise ExtractorError(
-                'Video %s is locked' % video_id, expected=True)
+                f'Video {video_id} is locked', expected=True)
 
         if not video_urls:
             js_vars = extract_js_vars(
@@ -464,8 +469,8 @@ def add_format(format_url, height=None):
 
         def extract_vote_count(kind, name):
             return self._extract_count(
-                (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)' % kind,
-                 r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind),
+                (rf'<span[^>]+\bclass="votes{kind}"[^>]*>([\d,\.]+)',
+                 rf'<span[^>]+\bclass=["\']votes{kind}["\'][^>]*\bdata-rating=["\'](\d+)'),
                 webpage, name)
 
         view_count = self._extract_count(
@@ -477,8 +482,8 @@ def extract_vote_count(kind, name):
 
         def extract_list(meta_key):
             div = self._search_regex(
-                r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
-                % meta_key, webpage, meta_key, default=None)
+                rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
+                webpage, meta_key, default=None)
             if div:
                 return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
 
@@ -522,7 +527,7 @@ def _extract_entries(self, webpage, host):
 
         return [
             self.url_result(
-                'http://www.%s/%s' % (host, video_url),
+                f'http://www.{host}/{video_url}',
                 PornHubIE.ie_key(), video_title=title)
             for video_url, title in orderedSet(re.findall(
                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
@@ -531,7 +536,7 @@ def _extract_entries(self, webpage, host):
 
 
 class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
+    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph',
         'playlist_mincount': 118,
@@ -568,7 +573,8 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         user_id = mobj.group('id')
-        videos_url = '%s/videos' % mobj.group('url')
+        videos_url = '{}/videos'.format(mobj.group('url'))
+        self._set_age_cookies(mobj.group('host'))
         page = self._extract_page(url)
         if page:
             videos_url = update_url_query(videos_url, {'page': page})
@@ -592,12 +598,12 @@ def _entries(self, url, host, item_id):
         VIDEOS = '/videos'
 
         def download_page(base_url, num, fallback=False):
-            note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '')
+            note = 'Downloading page {}{}'.format(num, ' (switch to fallback)' if fallback else '')
             return self._download_webpage(
                 base_url, item_id, note, query={'page': num})
 
         def is_404(e):
-            return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
+            return isinstance(e.cause, HTTPError) and e.cause.status == 404
 
         base_url = url
         has_page = page is not None
@@ -633,12 +639,13 @@ def _real_extract(self, url):
         item_id = mobj.group('id')
 
         self._login(host)
+        self._set_age_cookies(host)
 
         return self.playlist_result(self._entries(url, host, item_id), item_id)
 
 
 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
+    _VALID_URL = rf'https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
         'only_matching': True,
@@ -740,11 +747,11 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
     def suitable(cls, url):
         return (False
                 if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
-                else super(PornHubPagedVideoListIE, cls).suitable(url))
+                else super().suitable(url))
 
 
 class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
+    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
    _TESTS = [{
         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
         'info_dict': {
@@ -761,7 +768,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
 
 
 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/playlist/(?P<id>[^/?#&]+))' % PornHubBaseIE._PORNHUB_HOST_RE
+    _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/playlist/(?P<id>[^/?#&]+))'
     _TESTS = [{
         'url': 'https://www.pornhub.com/playlist/44121572',
         'info_dict': {
@@ -789,8 +796,8 @@ def _entries(self, url, host, item_id):
         page_entries = self._extract_entries(webpage, host)
 
         def download_page(page_num):
-            note = 'Downloading page {}'.format(page_num)
-            page_url = 'https://www.{}/playlist/viewChunked'.format(host)
+            note = f'Downloading page {page_num}'
+            page_url = f'https://www.{host}/playlist/viewChunked'
             return self._download_webpage(page_url, item_id, note, query={
                 'id': playlist_id,
                 'page': page_num,
@@ -803,8 +810,7 @@ def download_page(page_num):
             page_entries = self._extract_entries(webpage, host)
             if not page_entries:
                 break
-            for e in page_entries:
-                yield e
+            yield from page_entries
 
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
@@ -812,5 +818,6 @@ def _real_extract(self, url):
         item_id = mobj.group('id')
 
         self._login(host)
+        self._set_age_cookies(host)
 
         return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)
diff --git a/yt_dlp/extractor/pornotube.py b/yt_dlp/extractor/pornotube.py
index e0960f4c6f..80c9b278df 100644
--- a/yt_dlp/extractor/pornotube.py
+++ b/yt_dlp/extractor/pornotube.py
@@ -20,7 +20,7 @@ class PornotubeIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$',
             'timestamp': 1417582800,
             'age_limit': 18,
-        }
+        },
     }
 
     def _real_extract(self, url):
@@ -29,25 +29,24 @@ def _real_extract(self, url):
         token = self._download_json(
             'https://api.aebn.net/auth/v2/origins/authenticate',
             video_id, note='Downloading token',
-            data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'),
+            data=json.dumps({'credentials': 'Clip Application'}).encode(),
             headers={
                 'Content-Type': 'application/json',
                 'Origin': 'http://www.pornotube.com',
             })['tokenKey']
 
         video_url = self._download_json(
-            'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id,
+            f'https://api.aebn.net/delivery/v1/clips/{video_id}/MP4',
             video_id, note='Downloading delivery information',
             headers={'Authorization': token})['mediaUrl']
 
         FIELDS = (
             'title', 'description', 'startSecond', 'endSecond', 'publishDate',
-            'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber'
+            'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber',
         )
 
         info = self._download_json(
-            'https://api.aebn.net/content/v2/clips/%s?fields=%s'
-            % (video_id, ','.join(FIELDS)), video_id,
+            'https://api.aebn.net/content/v2/clips/{}?fields={}'.format(video_id, ','.join(FIELDS)), video_id,
             note='Downloading metadata',
             headers={'Authorization': token})
diff --git a/yt_dlp/extractor/pornovoisines.py b/yt_dlp/extractor/pornovoisines.py
index aa48da06b9..587b3cdf79 100644
--- a/yt_dlp/extractor/pornovoisines.py
+++ b/yt_dlp/extractor/pornovoisines.py
@@ -1,12 +1,13 @@
 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
     float_or_none,
+    int_or_none,
     unified_strdate,
 )
 
 
 class PornoVoisinesIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)'
 
     _TEST = {
@@ -28,9 +29,9 @@ class PornoVoisinesIE(InfoExtractor):
             'subtitles': {
                 'fr': [{
                     'ext': 'vtt',
-                }]
+                }],
             },
-        }
+        },
     }
 
     def _real_extract(self, url):
@@ -39,7 +40,7 @@ def _real_extract(self, url):
         display_id = mobj.group('display_id')
 
         settings_url = self._download_json(
-            'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id,
+            f'http://www.pornovoisines.com/api/video/{video_id}/getsettingsurl/',
             video_id, note='Getting settings URL')['video_settings_url']
         settings = self._download_json(settings_url, video_id)['data']
diff --git a/yt_dlp/extractor/pornoxo.py b/yt_dlp/extractor/pornoxo.py
index 5104d8a49b..fa31546441 100644
--- a/yt_dlp/extractor/pornoxo.py
+++ b/yt_dlp/extractor/pornoxo.py
@@ -5,6 +5,7 @@
 
 
 class PornoXOIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html'
     _TEST = {
         'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html',
@@ -18,7 +19,7 @@ class PornoXOIE(InfoExtractor):
             'categories': list,  # NSFW
             'thumbnail': r're:https?://.*\.jpg$',
             'age_limit': 18,
-        }
+        },
     }
 
     def _real_extract(self, url):
diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py
index 2eb327fba1..b0d6475fe4 100644
--- a/yt_dlp/extractor/pr0gramm.py
+++ b/yt_dlp/extractor/pr0gramm.py
@@ -1,97 +1,203 @@
-import re
+import datetime as dt
+import functools
+import json
+import urllib.parse
 
 from .common import InfoExtractor
-from ..utils import merge_dicts
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    make_archive_id,
+    mimetype2ext,
+    str_or_none,
+    urljoin,
+)
+from ..utils.traversal import traverse_obj
 
 
-class Pr0grammStaticIE(InfoExtractor):
-    # Possible urls:
-    # https://pr0gramm.com/static/5466437
-    _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'https://pr0gramm.com/static/5466437',
-        'md5': '52fa540d70d3edc286846f8ca85938aa',
-        'info_dict': {
-            'id': '5466437',
-            'ext': 'mp4',
-            'title': 'pr0gramm-5466437 by g11st',
-            'uploader': 'g11st',
-            'upload_date': '20221221',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        # Fetch media sources
-        entries = self._parse_html5_media_entries(url, webpage, video_id)
-        media_info = entries[0]
-
-        # Fetch author
-        uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
-
-        # Fetch approx upload timestamp from filename
-        # Have None-defaults in case the extraction fails
-        uploadDay = None
-        uploadMon = None
-        uploadYear = None
-        uploadTimestr = None
-        # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
-        m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
-
-        if (m):
-            # Up to a day of accuracy should suffice...
-            uploadDay = m.groupdict().get('day')
-            uploadMon = m.groupdict().get('mon')
-            uploadYear = m.groupdict().get('year')
-            uploadTimestr = uploadYear + uploadMon + uploadDay
-
-        return merge_dicts({
-            'id': video_id,
-            'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
-            'uploader': uploader,
-            'upload_date': uploadTimestr
-        }, media_info)
-
-
-# This extractor is for the primary url (used for sharing, and appears in the
-# location bar) Since this page loads the DOM via JS, yt-dl can't find any
-# video information here. So let's redirect to a compatibility version of
-# the site, which does contain the