diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 5df13ad9b5..3b0ef323d7 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -77,3 +77,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 644c87a7ed..c8702c3569 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -89,3 +89,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 59d0474c28..5a6d2b0fbd 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -85,3 +85,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index e207396737..a17770f614 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -70,3 +70,11 @@ body: render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index e06db9ccf8..c600a9dcb6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -64,3 +64,11 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 571223a9c5..57bc9daf51 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -70,3 +70,11 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you to download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12ec5b0d8c..495d3c6306 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -240,7 +240,7 @@ jobs: permissions: contents: read actions: write # For cleaning up cache - runs-on: macos-12 + runs-on: macos-13 steps: - uses: actions/checkout@v4 @@ -266,7 +266,7 @@ jobs: # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi - python3 -m pip install -U delocate + python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do @@ -346,7 +346,7 @@ jobs: macos_legacy: needs: process if: inputs.macos_legacy - runs-on: macos-12 + runs-on: macos-13 steps: - uses: actions/checkout@v4 @@ -409,7 +409,7 @@ jobs: run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | @@ -469,7 +469,7 @@ jobs: run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.7.0-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.10.0-py3-none-any.whl" - name: Prepare run: | diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 21a64efa96..a5cb6c9707 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -59,4 +59,4 @@ jobs: continue-on-error: False run: | python3 -m yt_dlp -v || true # Print debug head - python3 ./devscripts/run_tests.py core + python3 ./devscripts/run_tests.py --pytest-args '--reruns 2 --reruns-delay 3.0' core diff --git a/.github/workflows/issue-lockdown.yml b/.github/workflows/issue-lockdown.yml new file mode 100644 index 0000000000..4b973e2e61 --- /dev/null +++ b/.github/workflows/issue-lockdown.yml @@ -0,0 +1,21 @@ +name: Issue Lockdown +on: + issues: + types: [opened] + +permissions: + issues: write + +jobs: + lockdown: + name: Issue Lockdown + if: vars.ISSUE_LOCKDOWN + runs-on: ubuntu-latest + steps: + - name: "Lock new issue" + env: + GH_TOKEN: ${{ github.token }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + REPOSITORY: ${{ github.repository }} + run: | + gh issue lock "${ISSUE_NUMBER}" -R "${REPOSITORY}" diff --git a/.github/workflows/quick-test.yml 
b/.github/workflows/quick-test.yml index fe2a7e9239..cce7cbac1e 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -15,12 +15,12 @@ jobs: with: python-version: '3.8' - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include test + run: python3 ./devscripts/install_deps.py -o --include test - name: Run tests timeout-minutes: 15 run: | python3 -m yt_dlp -v || true - python3 ./devscripts/run_tests.py core + python3 ./devscripts/run_tests.py --pytest-args '--reruns 2 --reruns-delay 3.0' core check: name: Code check if: "!contains(github.event.head_commit.message, 'ci skip all')" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fa5ad7e515..8d0bc4026a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -204,7 +204,7 @@ jobs: git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" git add -u git commit -m "Release ${{ env.version }}" \ - -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all :ci run dl" + -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all" git push origin --force ${{ github.event.ref }}:release - name: Get target commitish @@ -325,7 +325,7 @@ jobs: "(https://github.com/yt-dlp/yt-dlp-master-builds/releases/latest \"Master builds\")"' || '' }} > ./RELEASE_NOTES printf '\n\n' >> ./RELEASE_NOTES cat >> ./RELEASE_NOTES << EOF - #### A description of the various files are in the [README](https://github.com/${{ github.repository }}#release-files) + #### A description of the various files is in the [README](https://github.com/${{ github.repository }}#release-files) --- $(python ./devscripts/make_changelog.py -vv --collapsible) EOF diff --git a/.github/workflows/sanitize-comment.yml b/.github/workflows/sanitize-comment.yml new file mode 100644 index 0000000000..45c87cdd47 --- /dev/null +++ b/.github/workflows/sanitize-comment.yml @@ -0,0 +1,17 @@ +name: Sanitize comment + +on: + issue_comment: + types: [created, edited] + +permissions: + issues: write + +jobs: + sanitize-comment: + name: Sanitize comment + if: vars.SANITIZE_COMMENT && !github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - name: Sanitize comment + uses: yt-dlp/sanitize-comment@v1 diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 489ab7da8b..bcdf6a0c24 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -657,3 +657,24 @@ luvyana szantnerb hugepower scribblemaniac +Codenade +Demon000 +Deukhoofd +grqz +hibes +Khaoklong51 +kieraneglin +lengzuo +naglis +ndyanx +otovalek +quad +rakslice +sahilsinghss73 +tony-hn +xingchensong +BallzCrasher +coreywright +eric321 +poyhen +tetra-fox diff --git a/Changelog.md b/Changelog.md index 0b96ab29cd..10fd437fa1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,118 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.10.07 + +#### Core changes +- **cookies**: [Fix cookie load error handling](https://github.com/yt-dlp/yt-dlp/commit/e59c82a74cda5139eb3928c75b0bd45484dbe7f0) ([#11140](https://github.com/yt-dlp/yt-dlp/issues/11140)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **applepodcasts**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6328e2e67a4e126e08af382e6a387073082d5c5f) ([#10903](https://github.com/yt-dlp/yt-dlp/issues/10903)) by [coreywright](https://github.com/coreywright) +- **cwtv**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/4b7bec66d8100978b82bb24110ed44e2a7749931) ([#11135](https://github.com/yt-dlp/yt-dlp/issues/11135)) by [kclauhk](https://github.com/kclauhk) +- **instagram** + - [Do not hardcode user-agent](https://github.com/yt-dlp/yt-dlp/commit/079a7bc334281d3c13d347770ae5f9f2b7da471a) ([#11155](https://github.com/yt-dlp/yt-dlp/issues/11155)) by [poyhen](https://github.com/poyhen) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/cf85cba5d9496bd2689e1070005b4d1b4cd3dc6d) ([#11156](https://github.com/yt-dlp/yt-dlp/issues/11156)) by [tetra-fox](https://github.com/tetra-fox) +- **noodlemagazine**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ccb23e1bac9768d1c70535beb744e668ed4a2720) ([#11144](https://github.com/yt-dlp/yt-dlp/issues/11144)) by [BallzCrasher](https://github.com/BallzCrasher) +- **patreon**: [Extract all m3u8 formats for locked posts](https://github.com/yt-dlp/yt-dlp/commit/f91645aceaf13926cf35be2c1dfef61b3aab97fb) ([#11138](https://github.com/yt-dlp/yt-dlp/issues/11138)) by [bashonly](https://github.com/bashonly) +- **youtube**: [Change default player clients to `ios,mweb`](https://github.com/yt-dlp/yt-dlp/commit/de2062753a188060d76f587e45becce61fe399f9) ([#11190](https://github.com/yt-dlp/yt-dlp/issues/11190)) by [seproDev](https://github.com/seproDev) + +#### Postprocessor changes +- **xattrmetadata**: [Try to write each attribute](https://github.com/yt-dlp/yt-dlp/commit/3a193346eeb27ac2959ff30c370adb899ec94732) ([#11115](https://github.com/yt-dlp/yt-dlp/issues/11115)) by [eric321](https://github.com/eric321) + +#### Misc. changes +- **ci**: [Rerun failed tests](https://github.com/yt-dlp/yt-dlp/commit/b31b81d85f00601710d4fac590c3e4efb4133283) ([#11143](https://github.com/yt-dlp/yt-dlp/issues/11143)) by [Grub4K](https://github.com/Grub4K) +- **cleanup**: Miscellaneous: [1a176d8](https://github.com/yt-dlp/yt-dlp/commit/1a176d874e6772cd898ce507379ea388e96ee3f7) by [bashonly](https://github.com/bashonly) + +### 2024.09.27 + +#### Important changes +- **The minimum *recommended* Python version has been raised to 3.9** +Since Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/10086) + +#### Core changes +- [Allow `none` arg to negate `--convert-subs` and `--convert-thumbnails`](https://github.com/yt-dlp/yt-dlp/commit/c08e0b20b5edd8957b8318716bc14e896d1b96f4) ([#11066](https://github.com/yt-dlp/yt-dlp/issues/11066)) by [kieraneglin](https://github.com/kieraneglin) +- [Fix format sorting bug with vp9.2 vcodec](https://github.com/yt-dlp/yt-dlp/commit/8f4ea14680c7865d8ffac10a9174205d1d84ada7) ([#10884](https://github.com/yt-dlp/yt-dlp/issues/10884)) by [rakslice](https://github.com/rakslice) +- [Raise minimum recommended Python version to 3.9](https://github.com/yt-dlp/yt-dlp/commit/cca534cd9e6850c70244f225a4a1895ef4bcdbec) ([#11098](https://github.com/yt-dlp/yt-dlp/issues/11098)) by [bashonly](https://github.com/bashonly) +- **cookies**: [Improve error message for Windows `--cookies-from-browser chrome` issue](https://github.com/yt-dlp/yt-dlp/commit/b397a64691421ace5df09457c2a764821a2dc6f2) ([#11090](https://github.com/yt-dlp/yt-dlp/issues/11090)) by [seproDev](https://github.com/seproDev) +- **utils**: `mimetype2ext`: [Recognize `aacp` as `aac`](https://github.com/yt-dlp/yt-dlp/commit/cc85596d5b59f0c14e9381b3675f619c1e12e597) ([#10860](https://github.com/yt-dlp/yt-dlp/issues/10860)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- [Fix JW Player format parsing](https://github.com/yt-dlp/yt-dlp/commit/409f8e9e3b4bde81ef76fc563256f876d2ff8099) ([#10956](https://github.com/yt-dlp/yt-dlp/issues/10956)) by [seproDev](https://github.com/seproDev) +- [Handle decode errors when reading responses](https://github.com/yt-dlp/yt-dlp/commit/325001317d97f4545d66fac44c4ba772c6f45f22) ([#10868](https://github.com/yt-dlp/yt-dlp/issues/10868)) by [bashonly](https://github.com/bashonly) +- **abc.net.au**: iview, showseries: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7f909046f4dc0fba472b4963145aef6e0d42491b) ([#11101](https://github.com/yt-dlp/yt-dlp/issues/11101)) by [bashonly](https://github.com/bashonly) +- **adn**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/cc88a54bb1ef285154775f8a6a413335ce4c71ce) ([#10749](https://github.com/yt-dlp/yt-dlp/issues/10749)) by [infanf](https://github.com/infanf) +- **asobistage**: [Support redirected URLs](https://github.com/yt-dlp/yt-dlp/commit/a7d3235c84dac57a127cbe0ff38f7f7c2fdd8fa0) ([#10768](https://github.com/yt-dlp/yt-dlp/issues/10768)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **bandcamp**: user: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0176547f16a3642cd71627126e9dfc24981e20) ([#10328](https://github.com/yt-dlp/yt-dlp/issues/10328)) by [bashonly](https://github.com/bashonly), [quad](https://github.com/quad) +- **beacon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b4760c778d0c92c6e3f2bc8346cd72c8f08595ae) ([#9901](https://github.com/yt-dlp/yt-dlp/issues/9901)) by [Deukhoofd](https://github.com/Deukhoofd) +- **bilibili** + - [Fix chapters and subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/a2000bc85730c950351d78bb818493dc39dca3cb) ([#11099](https://github.com/yt-dlp/yt-dlp/issues/11099)) by [bashonly](https://github.com/bashonly) + - [Fix festival URL support](https://github.com/yt-dlp/yt-dlp/commit/b43bd864851f2862e26caa85461c5d825d49d463) ([#10740](https://github.com/yt-dlp/yt-dlp/issues/10740)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **biliintl**: [Fix referer 
header](https://github.com/yt-dlp/yt-dlp/commit/a06bb586795ebab87a2356923acfc674d6f0e152) ([#11003](https://github.com/yt-dlp/yt-dlp/issues/11003)) by [Khaoklong51](https://github.com/Khaoklong51) +- **dropbox**: [Fix password-protected video support](https://github.com/yt-dlp/yt-dlp/commit/63da31b3b29af90062d8a72a905ffe4b5e499042) ([#10735](https://github.com/yt-dlp/yt-dlp/issues/10735)) by [ndyanx](https://github.com/ndyanx) +- **ertgr**: [Fix video extraction](https://github.com/yt-dlp/yt-dlp/commit/416686ed0cf792ec44ab059f3b229dd776077e14) ([#11091](https://github.com/yt-dlp/yt-dlp/issues/11091)) by [seproDev](https://github.com/seproDev) +- **eurosport**: [Support local URL variants](https://github.com/yt-dlp/yt-dlp/commit/f0bb28504c8c2b75ee3e5796aed50de2a7f90a1b) ([#10785](https://github.com/yt-dlp/yt-dlp/issues/10785)) by [seproDev](https://github.com/seproDev) +- **facebook** + - ads: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d62fef7e07d454c0d2ba2d69fb96d691dba1ded0) ([#10704](https://github.com/yt-dlp/yt-dlp/issues/10704)) by [kclauhk](https://github.com/kclauhk) + - reel: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/0e1b941c6b2caa688b0d3332e723d16dbafa4311) by [lengzuo](https://github.com/lengzuo) +- **germanupa**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/124f058b546d652a359c67025bb479789bfbef0b) ([#10538](https://github.com/yt-dlp/yt-dlp/issues/10538)) by [grqz](https://github.com/grqz) +- **hgtvde**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a555389c9bb32e589e00b4664974423fb7b04dcd) ([#10992](https://github.com/yt-dlp/yt-dlp/issues/10992)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas) +- **huya**: video: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/25c1cdaa2650563494d3bf00a38f72d0d9486bff) ([#10686](https://github.com/yt-dlp/yt-dlp/issues/10686)) by [hugepower](https://github.com/hugepower) +- **iprima**: [Fix zoom URL support](https://github.com/yt-dlp/yt-dlp/commit/4a27b8f092f7f7c10b7a334d3535c97c2af02f0a) ([#10959](https://github.com/yt-dlp/yt-dlp/issues/10959)) by [otovalek](https://github.com/otovalek) +- **khanacademy**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0fba08485b6445b72b5b63ae23ca2a73fa5d967f) ([#10913](https://github.com/yt-dlp/yt-dlp/issues/10913)) by [seproDev](https://github.com/seproDev) +- **kick** + - clips: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/0aa4426e9a35f7f8e184f1f2082b3b313c1448f7) ([#11107](https://github.com/yt-dlp/yt-dlp/issues/11107)) by [bashonly](https://github.com/bashonly) + - vod: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/173d54c151b987409e3eb09552d8d89ed8fc50f7) ([#10988](https://github.com/yt-dlp/yt-dlp/issues/10988)) by [bashonly](https://github.com/bashonly), [grqz](https://github.com/grqz) +- **kika**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e6f48ca80821939c1fd11ec2a0cdbf2fba9b258a) ([#5788](https://github.com/yt-dlp/yt-dlp/issues/5788)) by [1100101](https://github.com/1100101) +- **lnkgo**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/fa83d0b36bc43d30fe9241c1e923f4614864b758) ([#10904](https://github.com/yt-dlp/yt-dlp/issues/10904)) by [naglis](https://github.com/naglis) +- **loom**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/7509d692b37a7ec6230ea75bfe1e44a8de5eefce) ([#10760](https://github.com/yt-dlp/yt-dlp/issues/10760)) by [kclauhk](https://github.com/kclauhk) +- **mediaklikk**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/e2b3634e299be9c16a247ece3b1858d83889c324) ([#11083](https://github.com/yt-dlp/yt-dlp/issues/11083)) by [szantnerb](https://github.com/szantnerb) +- **mojevideo**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/28b0ecba2af5b4919f198474b3d00a76ef322c31) ([#11019](https://github.com/yt-dlp/yt-dlp/issues/11019)) by [04-pasha-04](https://github.com/04-pasha-04), [pzhlkj6612](https://github.com/pzhlkj6612) +- **niconico**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/eabb4680fdb09ba1f48d174a700a2e3b43f82add) ([#11103](https://github.com/yt-dlp/yt-dlp/issues/11103)) by [bashonly](https://github.com/bashonly) +- **nzz**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4a9bc8c3630378bc29f0266126b503f6190c0430) ([#10461](https://github.com/yt-dlp/yt-dlp/issues/10461)) by [1-Byte](https://github.com/1-Byte) +- **patreoncampaign**: [Support API URLs](https://github.com/yt-dlp/yt-dlp/commit/232e6db30c474d1b387e405342f34173ceeaf832) ([#10734](https://github.com/yt-dlp/yt-dlp/issues/10734)) by [bashonly](https://github.com/bashonly), [hibes](https://github.com/hibes) +- **pinterest**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c8c078fe28b0ffc15ef9646346c00c592fe71a78) ([#10867](https://github.com/yt-dlp/yt-dlp/issues/10867)) by [bashonly](https://github.com/bashonly), [sahilsinghss73](https://github.com/sahilsinghss73) +- **radiko**: [Extract unique `id` values](https://github.com/yt-dlp/yt-dlp/commit/c8d096c5ce111411fbdbe2abb8fed54f317a6182) ([#10726](https://github.com/yt-dlp/yt-dlp/issues/10726)) by [garret1317](https://github.com/garret1317) +- **rtp**: [Support more subpages](https://github.com/yt-dlp/yt-dlp/commit/d02df303d8e49390599db9f34482697e4d1cf5b2) ([#10787](https://github.com/yt-dlp/yt-dlp/issues/10787)) by [Demon000](https://github.com/Demon000) +- **rumblechannel**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ad0b857f459a6d390fbf124183916218c52f223a) ([#11049](https://github.com/yt-dlp/yt-dlp/issues/11049)) by [tony-hn](https://github.com/tony-hn) +- **rutube**: [Support livestreams](https://github.com/yt-dlp/yt-dlp/commit/41be32e78c3845000dbac188ffb90ea3ea7c4dfa) ([#10844](https://github.com/yt-dlp/yt-dlp/issues/10844)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **samplefocus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/46f4c80bc363ee8116c33d37f65202e6c3470954) ([#10947](https://github.com/yt-dlp/yt-dlp/issues/10947)) by [seproDev](https://github.com/seproDev) +- **screenrec**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36f9e602ad55679764bc75a4f67f7562b1d6adcf) ([#10917](https://github.com/yt-dlp/yt-dlp/issues/10917)) by [naglis](https://github.com/naglis) +- **sen**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/41a241ca6ffb95b3d9aaf4f42106ca8cba9af1a6) ([#10952](https://github.com/yt-dlp/yt-dlp/issues/10952)) by [seproDev](https://github.com/seproDev) +- **servus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/300c91274f7ea5b1b0528fc5ee11cf1a61d4079e) ([#10944](https://github.com/yt-dlp/yt-dlp/issues/10944)) by [seproDev](https://github.com/seproDev) +- **snapchatspotlight**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37417e4f934fd8909788b493d017777155b0ae5) ([#11030](https://github.com/yt-dlp/yt-dlp/issues/11030)) by [seproDev](https://github.com/seproDev) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5a8a05aebb49693e78e1123015837ed5e961ff76) 
([#11010](https://github.com/yt-dlp/yt-dlp/issues/11010)) by [diman8](https://github.com/diman8) +- **tenplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d8d473002b654ab0e7b97ead869f58b4361eeae1) ([#10928](https://github.com/yt-dlp/yt-dlp/issues/10928)) by [aarubui](https://github.com/aarubui) +- **tiktok**: [Fix web formats extraction](https://github.com/yt-dlp/yt-dlp/commit/3ad0b7f422d547204df687b6d0b2d9110fff3990) ([#11074](https://github.com/yt-dlp/yt-dlp/issues/11074)) by [bashonly](https://github.com/bashonly) +- **twitter**: spaces: [Support video spaces](https://github.com/yt-dlp/yt-dlp/commit/bef1d4d6fc9493fda7f75e2289c07c507d10092f) ([#10789](https://github.com/yt-dlp/yt-dlp/issues/10789)) by [bashonly](https://github.com/bashonly) +- **vidflex**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e978c312d6550a6ae4c9df18001afb1b420cb72f) ([#10002](https://github.com/yt-dlp/yt-dlp/issues/10002)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **vimeo** + - [Always try to extract original format](https://github.com/yt-dlp/yt-dlp/commit/4115c24d157c5b5f63089d75c4e0f51d1f8b4489) ([#10721](https://github.com/yt-dlp/yt-dlp/issues/10721)) by [bashonly](https://github.com/bashonly) (With fixes in [e8e6a98](https://github.com/yt-dlp/yt-dlp/commit/e8e6a982a1b659eed434d225d7922f632bac6568) by [seproDev](https://github.com/seproDev)) + - [Fix HLS audio format sorting](https://github.com/yt-dlp/yt-dlp/commit/a1b4ac2b8ed8e6eaa56044d439f1e0d00c2ba218) ([#11082](https://github.com/yt-dlp/yt-dlp/issues/11082)) by [fireattack](https://github.com/fireattack) +- **watchespn**: [Improve auth support](https://github.com/yt-dlp/yt-dlp/commit/7adff8caf152dcf96d03aff69ed8545c0a63567c) ([#10910](https://github.com/yt-dlp/yt-dlp/issues/10910)) by [ischmidt20](https://github.com/ischmidt20) +- **wistia**: [Support password-protected videos](https://github.com/yt-dlp/yt-dlp/commit/9f5c9a90898c5a1e672922d9cd799716c73cee34) ([#11100](https://github.com/yt-dlp/yt-dlp/issues/11100)) by [bashonly](https://github.com/bashonly) +- **ximalaya**: [Add VIP support](https://github.com/yt-dlp/yt-dlp/commit/3dfd720d098b4d49d69cfc77e6376f22bcd90934) ([#10832](https://github.com/yt-dlp/yt-dlp/issues/10832)) by [seproDev](https://github.com/seproDev), [xingchensong](https://github.com/xingchensong) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/3aa0156e05662923d130ddbc1c82596e38c01a00) ([#10950](https://github.com/yt-dlp/yt-dlp/issues/10950)) by [seproDev](https://github.com/seproDev) +- **yleareena**: [Support podcasts](https://github.com/yt-dlp/yt-dlp/commit/48d629d461e05b1b19f5e53dc959bb9ebe95da42) ([#11104](https://github.com/yt-dlp/yt-dlp/issues/11104)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `po_token`, `visitor_data`, `data_sync_id` extractor args](https://github.com/yt-dlp/yt-dlp/commit/3a3bd00037e9908e87da4fa9f2ad772aa34dc60e) ([#10648](https://github.com/yt-dlp/yt-dlp/issues/10648)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [seproDev](https://github.com/seproDev) (With fixes in [fa2be9a](https://github.com/yt-dlp/yt-dlp/commit/fa2be9a7c63babede07480151363e54eee5702bd) by [bashonly](https://github.com/bashonly)) + - [Support excluding `player_client`s in extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/49f3741a820ed142f6866317c2e7d247b130960e) ([#10710](https://github.com/yt-dlp/yt-dlp/issues/10710)) by [bashonly](https://github.com/bashonly) + - clip: [Prioritize `https` 
formats](https://github.com/yt-dlp/yt-dlp/commit/1d84b780cf33a1d84756825ac23f990a905703df) ([#11102](https://github.com/yt-dlp/yt-dlp/issues/11102)) by [bashonly](https://github.com/bashonly) + - tab: [Fix shorts tab extraction](https://github.com/yt-dlp/yt-dlp/commit/9431777b4c37129a6093080c77ca59960afbb9d7) ([#10938](https://github.com/yt-dlp/yt-dlp/issues/10938)) by [seproDev](https://github.com/seproDev) + +#### Networking changes +- [Fix handler not being added to RequestError](https://github.com/yt-dlp/yt-dlp/commit/d1c4d88b2d912e8da5e76db455562ca63b1af690) ([#10955](https://github.com/yt-dlp/yt-dlp/issues/10955)) by [coletdjnz](https://github.com/coletdjnz) +- [Pin `curl-cffi` version to < 0.7.2](https://github.com/yt-dlp/yt-dlp/commit/5bb1aa04dafce13ba9de707ea53169fab58b5207) ([#11092](https://github.com/yt-dlp/yt-dlp/issues/11092)) by [bashonly](https://github.com/bashonly) +- **Request Handler**: websockets: [Upgrade websockets to 13.0](https://github.com/yt-dlp/yt-dlp/commit/6f9e6537434562d513d0c9b68ced8a61ade94a64) ([#10815](https://github.com/yt-dlp/yt-dlp/issues/10815)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Bump PyInstaller version pin to `>=6.10.0`](https://github.com/yt-dlp/yt-dlp/commit/fb8b7f226d251e521a89b23c415e249e5b788e5c) ([#10709](https://github.com/yt-dlp/yt-dlp/issues/10709)) by [bashonly](https://github.com/bashonly) + - [Pin `delocate` version for `macos`](https://github.com/yt-dlp/yt-dlp/commit/7e41628ff523b3fe373b0981a5db441358980dab) ([#10901](https://github.com/yt-dlp/yt-dlp/issues/10901)) by [bashonly](https://github.com/bashonly) +- **ci** + - [Add comment sanitization workflow](https://github.com/yt-dlp/yt-dlp/commit/b6200bdcf3a9415ae36859188f9a57e3e461c696) ([#10915](https://github.com/yt-dlp/yt-dlp/issues/10915)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Add issue tracker anti-spam protection](https://github.com/yt-dlp/yt-dlp/commit/ad9a8115aa29a1a95c961b16fcf129a228d98f50) ([#10861](https://github.com/yt-dlp/yt-dlp/issues/10861)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [c6387ab](https://github.com/yt-dlp/yt-dlp/commit/c6387abc1af9842bb0541288a5610abba9b1ab51) by [bashonly](https://github.com/bashonly), [Codenade](https://github.com/Codenade), [coletdjnz](https://github.com/coletdjnz), [grqz](https://github.com/grqz), [Grub4K](https://github.com/Grub4K), [pzhlkj6612](https://github.com/pzhlkj6612), [seproDev](https://github.com/seproDev) + ### 2024.08.06 #### Core changes diff --git a/README.md b/README.md index ca32e09bfb..1cafe51d51 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ #### Impersonation The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. -* [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) +* [**curl_cffi**](https://github.com/lexiforest/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lexiforest/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/lexiforest/curl_cffi/blob/main/LICENSE) * Can be installed with the `curl-cffi` group, e.g. 
`pip install "yt-dlp[default,curl-cffi]"` * Currently included in `yt-dlp.exe`, `yt-dlp_linux` and `yt-dlp_macos` builds @@ -278,7 +278,7 @@ ### Related scripts * **`devscripts/update-version.py`** - Update the version number based on the current date. * **`devscripts/set-variant.py`** - Set the build variant of the executable. * **`devscripts/make_changelog.py`** - Create a markdown changelog using short commit messages and update `CONTRIBUTORS` file. -* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS=1` if you wish to forcefully disable lazy extractor loading. +* **`devscripts/make_lazy_extractors.py`** - Create lazy extractors. Running this before building the binaries (any variant) will improve their startup performance. Set the environment variable `YTDLP_NO_LAZY_EXTRACTORS` to something nonempty to forcefully disable lazy extractor loading. Note: See their `--help` for more info. @@ -459,17 +459,17 @@ ## Video Selection: conditions. Use a "\" to escape "&" or quotes if needed. If used multiple times, the filter matches if at least one of the - conditions is met. E.g. --match-filter - !is_live --match-filter "like_count>?100 & + conditions is met. E.g. --match-filters + !is_live --match-filters "like_count>?100 & description~='(?i)\bcats \& dogs\b'" matches only videos that are not live OR those that have a like count more than 100 (or the like field is not available) and also has a description that contains the phrase "cats & - dogs" (caseless). Use "--match-filter -" to + dogs" (caseless). Use "--match-filters -" to interactively ask whether to download each video - --no-match-filters Do not use any --match-filter (default) + --no-match-filters Do not use any --match-filters (default) --break-match-filters FILTER Same as "--match-filters" but stops the download process when a video is rejected --no-break-match-filters Do not use any --break-match-filters (default) @@ -490,7 +490,7 @@ ## Video Selection: encountering a file that is in the archive (default) --break-per-input Alters --max-downloads, --break-on-existing, - --break-match-filter, and autonumber to + --break-match-filters, and autonumber to reset per input URL --no-break-per-input --break-on-existing and similar options terminates the entire download queue @@ -999,12 +999,16 @@ ## Post-Processing Options: be used multiple times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format - (currently supported: ass, lrc, srt, vtt) - (Alias: --convert-subtitles) + (currently supported: ass, lrc, srt, vtt). + Use "--convert-subs none" to disable + conversion (default) (Alias: --convert- + subtitles) --convert-thumbnails FORMAT Convert the thumbnails to another format (currently supported: jpg, png, webp). You can specify multiple rules using similar - syntax as --remux-video + syntax as "--remux-video". Use "--convert- + thumbnails none" to disable conversion + (default) --split-chapters Split video into multiple files based on internal chapters. The "chapter:" prefix can be used with "--paths" and "--output" to set @@ -1767,7 +1771,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. 
See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web_creator` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,mweb` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=all,-web` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1777,6 +1781,9 @@ #### youtube * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning +* `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` +* `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) +* `po_token`: Proof of Origin (PO) Token(s) to use for requesting video playback. 
Comma separated list of PO Tokens in the format `CLIENT+PO_TOKEN`, e.g. `youtube:po_token=web+XXX,android+YYY` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) @@ -1788,6 +1795,7 @@ #### generic * `key_query`: Passthrough the master m3u8 URL query to its HLS AES-128 decryption key URI if no value is provided, or else apply the query string given as `key_query=VALUE`. Note that this will have no effect if the key URI is provided via the `hls_key` extractor-arg. Does not apply to ffmpeg * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist * `is_live`: Bypass live HLS detection and manually set `live_status` - a value of `false` will set `not_live`, any other value (or no value) will set `is_live` +* `impersonate`: Target(s) to try and impersonate with the initial webpage request; e.g. `safari,chrome-110`. By default any available target will be used. Use `false` to disable impersonation #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` @@ -1890,6 +1898,7 @@ # PLUGINS myplugin.py yt-dlp looks for these `yt_dlp_plugins` namespace folders in many locations (see below) and loads in plugins from **all** of them. +Set the environment variable `YTDLP_NO_PLUGINS` to something nonempty to disable loading plugins entirely. See the [wiki for some known plugins](https://github.com/yt-dlp/yt-dlp/wiki/Plugins) @@ -2177,9 +2186,9 @@ ### New features * **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` -* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filter` etc +* **Other new options**: Many new options have been added such as `--alias`, `--print`, `--concat-playlist`, `--wait-for-video`, `--retry-sleep`, `--sleep-requests`, `--convert-thumbnails`, `--force-download-archive`, `--force-overwrites`, `--break-match-filters` etc -* **Improvements**: Regex and other operators in `--format`/`--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc +* **Improvements**: Regex and other operators in `--format`/`--match-filters`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio, multiple `--config-locations`, `--exec` at different stages, etc * **Plugins**: Extractors and PostProcessors can be loaded from an external file. 
See [plugins](#plugins) for details @@ -2220,7 +2229,7 @@ ### Differences in default behavior * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` * yt-dlp's sanitization of invalid characters in filenames is different/smarter than in youtube-dl. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior * ~~yt-dlp tries to parse the external downloader outputs into the standard progress output if possible (Currently implemented: [aria2c](https://github.com/yt-dlp/yt-dlp/issues/5931)). You can use `--compat-options no-external-downloader-progress` to get the downloader output as-is~~ -* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filter` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this +* yt-dlp versions between 2021.09.01 and 2023.01.02 applies `--match-filters` to nested playlists. This was an unintentional side-effect of [8f18ac](https://github.com/yt-dlp/yt-dlp/commit/8f18aca8717bb0dd49054555af8d386e5eda3a88) and is fixed in [d7b460](https://github.com/yt-dlp/yt-dlp/commit/d7b460d0e5fc710950582baed2e3fc616ed98a80). Use `--compat-options playlist-match-filter` to revert this * yt-dlp versions between 2021.11.10 and 2023.06.21 estimated `filesize_approx` values for fragmented/manifest formats. This was added for convenience in [f2fe69](https://github.com/yt-dlp/yt-dlp/commit/f2fe69c7b0d208bdb1f6292b4ae92bc1e1a7444a), but was reverted in [0dff8e](https://github.com/yt-dlp/yt-dlp/commit/0dff8e4d1e6e9fb938f4256ea9af7d81f42fd54f) due to the potentially extreme inaccuracy of the estimated values. Use `--compat-options manifest-filesize-approx` to keep extracting the estimated values * yt-dlp uses modern http client backends such as `requests`. Use `--compat-options prefer-legacy-http-handler` to prefer the legacy http handler (`urllib`) to be used for standard http requests. * The sub-modules `swfinterp`, `casefold` are removed. @@ -2266,11 +2275,11 @@ #### Redundant options --get-thumbnail --print thumbnail -e, --get-title --print title -g, --get-url --print urls - --match-title REGEX --match-filter "title ~= (?i)REGEX" - --reject-title REGEX --match-filter "title !~= (?i)REGEX" - --min-views COUNT --match-filter "view_count >=? COUNT" - --max-views COUNT --match-filter "view_count <=? COUNT" - --break-on-reject Use --break-match-filter + --match-title REGEX --match-filters "title ~= (?i)REGEX" + --reject-title REGEX --match-filters "title !~= (?i)REGEX" + --min-views COUNT --match-filters "view_count >=? COUNT" + --max-views COUNT --match-filters "view_count <=? 
COUNT" + --break-on-reject Use --break-match-filters --user-agent UA --add-header "User-Agent:UA" --referer URL --add-header "Referer:URL" --playlist-start NUMBER -I NUMBER: diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 5189de2d77..e7f553a5f2 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -185,5 +185,16 @@ "action": "add", "when": "6075a029dba70a89675ae1250e7cdfd91f0eba41", "short": "[priority] Security: [[ie/douyutv] Do not use dangerous javascript source/URL](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-3v33-3wmw-3785)\n - A dependency on potentially malicious third-party JavaScript code has been removed from the Douyu extractors" + }, + { + "action": "add", + "when": "fb8b7f226d251e521a89b23c415e249e5b788e5c", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.9**\nSince Python 3.8 will reach end-of-life in October 2024, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" + }, + { + "action": "change", + "when": "b31b81d85f00601710d4fac590c3e4efb4133283", + "short": "[ci] Rerun failed tests (#11143)", + "authors": ["Grub4K"] } ] diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index a5d59f3c03..8135689c7e 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -46,6 +46,14 @@ render: shell validations: required: true + - type: markdown + attributes: + value: | + > [!CAUTION] + > ### GitHub is experiencing a high volume of malicious spam comments. + > ### If you receive any replies asking you download a file, do NOT follow the download links! + > + > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. '''.strip() NO_SKIP = ''' diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index d74ea202f0..d288d84296 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -2,7 +2,6 @@ # Allow direct execution import os -import shutil import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -34,18 +33,14 @@ class {name}({bases}): def main(): + os.environ['YTDLP_NO_PLUGINS'] = 'true' + os.environ['YTDLP_NO_LAZY_EXTRACTORS'] = 'true' + lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - if os.path.exists(lazy_extractors_filename): - os.remove(lazy_extractors_filename) - _ALL_CLASSES = get_all_ies() # Must be before import - - import yt_dlp.plugins + from yt_dlp.extractor.extractors import _ALL_CLASSES from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor - # Filter out plugins - _ALL_CLASSES = [cls for cls in _ALL_CLASSES if not cls.__module__.startswith(f'{yt_dlp.plugins.PACKAGE_NAME}.')] - DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( MODULE_TEMPLATE, @@ -58,20 +53,6 @@ def main(): write_file(lazy_extractors_filename, f'{module_src}\n') -def get_all_ies(): - PLUGINS_DIRNAME = 'ytdlp_plugins' - BLOCKED_DIRNAME = f'{PLUGINS_DIRNAME}_blocked' - if os.path.exists(PLUGINS_DIRNAME): - # os.rename cannot be used, e.g. in Docker. 
See https://github.com/yt-dlp/yt-dlp/pull/4958 - shutil.move(PLUGINS_DIRNAME, BLOCKED_DIRNAME) - try: - from yt_dlp.extractor.extractors import _ALL_CLASSES - finally: - if os.path.exists(BLOCKED_DIRNAME): - shutil.move(BLOCKED_DIRNAME, PLUGINS_DIRNAME) - return _ALL_CLASSES - - def extra_ie_code(ie, base=None): for var in STATIC_CLASS_PROPERTIES: val = getattr(ie, var) diff --git a/devscripts/run_tests.py b/devscripts/run_tests.py index c605aa62cf..eb614fe591 100755 --- a/devscripts/run_tests.py +++ b/devscripts/run_tests.py @@ -16,7 +16,7 @@ def parse_args(): parser = argparse.ArgumentParser(description='Run selected yt-dlp tests') parser.add_argument( - 'test', help='a extractor tests, or one of "core" or "download"', nargs='*') + 'test', help='an extractor test, test path, or one of "core" or "download"', nargs='*') parser.add_argument( '-k', help='run a test matching EXPRESSION. Same as "pytest -k"', metavar='EXPRESSION') parser.add_argument( @@ -27,7 +27,6 @@ def parse_args(): def run_tests(*tests, pattern=None, ci=False): run_core = 'core' in tests or (not pattern and not tests) run_download = 'download' in tests - tests = list(map(fix_test_name, tests)) pytest_args = args.pytest_args or os.getenv('HATCH_TEST_ARGS', '') arguments = ['pytest', '-Werror', '--tb=short', *shlex.split(pytest_args)] @@ -41,7 +40,9 @@ def run_tests(*tests, pattern=None, ci=False): arguments.extend(['-m', 'download']) else: arguments.extend( - f'test/test_download.py::TestDownload::test_{test}' for test in tests) + test if '/' in test + else f'test/test_download.py::TestDownload::test_{fix_test_name(test)}' + for test in tests) print(f'Running {arguments}', flush=True) try: diff --git a/pyproject.toml b/pyproject.toml index fe6894a428..200a9c99ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ dependencies = [ default = [] curl-cffi = [ "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", - "curl-cffi>=0.5.10,!=0.6.*,<0.8; os_name!='nt' and implementation_name=='cpython'", + "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", ] secretstorage = [ "cffi", @@ -76,13 +76,14 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.5.0", + "ruff~=0.6.0", ] test = [ "pytest~=8.1", + "pytest-rerunfailures~=14.0", ] pyinstaller = [ - "pyinstaller>=6.7.0", # for compat with setuptools>=70 + "pyinstaller>=6.10.0", # Windows temp cleanup fixed in 6.10.0 ] py2exe = [ "py2exe>=0.12", @@ -162,7 +163,6 @@ lint-fix = "ruff check --fix {args:.}" features = ["test"] dependencies = [ "pytest-randomly~=3.15", - "pytest-rerunfailures~=14.0", "pytest-xdist[psutil]~=3.5", ] diff --git a/supportedsites.md b/supportedsites.md index e3bbe03ec7..e23d395fde 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -143,6 +143,7 @@ # Supported sites - **BBVTV**: [*bbvtv*](## "netrc machine") - **BBVTVLive**: [*bbvtv*](## "netrc machine") - **BBVTVRecordings**: [*bbvtv*](## "netrc machine") + - **BeaconTv** - **BeatBumpPlaylist** - **BeatBumpVideo** - **Beatport** @@ -505,6 +506,7 @@ # Supported sites - **gem.cbc.ca:playlist** - **Genius** - **GeniusLyrics** + - **Germanupa**: germanupa.de - **GetCourseRu**: [*getcourseru*](## "netrc machine") - **GetCourseRuPlayer** - **Gettr** @@ -580,6 +582,7 @@ # Supported sites - **HungamaAlbumPlaylist** - **HungamaSong** - **huya:live**: huya.com + - **huya:video**: 虎牙视频 - **Hypem** - **Hytale** - **Icareus** @@ -660,6 +663,7 @@ # Supported sites - **kick:vod** - **Kicker** - **KickStarter** + - **Kika**: KiKA.de - 
**kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -722,7 +726,6 @@ # Supported sites - **livestream:original** - **Livestreamfails** - **Lnk** - - **LnkGo** - **loc**: Library of Congress - **loom** - **loom:folder** @@ -756,7 +759,7 @@ # Supported sites - **Masters** - **MatchTV** - **MBN**: mbn.co.kr (매일방송) - - **MDR**: MDR.DE and KiKA + - **MDR**: MDR.DE - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** @@ -811,6 +814,7 @@ # Supported sites - **MNetTVLive**: [*mnettv*](## "netrc machine") - **MNetTVRecordings**: [*mnettv*](## "netrc machine") - **MochaVideo** + - **Mojevideo**: mojevideo.sk - **Mojvideo** - **Monstercat** - **MonsterSirenHypergryphMusic** @@ -1285,12 +1289,14 @@ # Supported sites - **Screencast** - **Screencastify** - **ScreencastOMatic** + - **ScreenRec** - **ScrippsNetworks** - **scrippsnetworks:watch** - **Scrolller** - **SCTE**: [*scte*](## "netrc machine") (**Currently broken**) - **SCTECourse**: [*scte*](## "netrc machine") (**Currently broken**) - **sejm** + - **Sen** - **SenalColombiaLive**: (**Currently broken**) - **SenateGov** - **SenateISVP** @@ -1327,6 +1333,7 @@ # Supported sites - **SlidesLive** - **Slutload** - **Smotrim** + - **SnapchatSpotlight** - **Snotr** - **Sohu** - **SohuV** @@ -1608,6 +1615,7 @@ # Supported sites - **videomore:season** - **videomore:video** - **VideoPress** + - **Vidflex** - **Vidio**: [*vidio*](## "netrc machine") - **VidioLive**: [*vidio*](## "netrc machine") - **VidioPremier**: [*vidio*](## "netrc machine") @@ -1736,7 +1744,7 @@ # Supported sites - **XiaoHongShu**: 小红书 - **ximalaya**: 喜马拉雅FM - **ximalaya:album**: 喜马拉雅FM 专辑 - - **xinpianchang**: xinpianchang.com (**Currently broken**) + - **Xinpianchang**: 新片场 - **XMinus**: (**Currently broken**) - **XNXX** - **Xstream** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1847c4ffd8..a99e624080 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -236,6 +236,35 @@ def test_format_selection_video(self): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_by_vcodec_sort(self): + formats = [ + {'format_id': 'av1-format', 'ext': 'mp4', 'vcodec': 'av1', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-hdr-format', 'ext': 'mp4', 'vcodec': 'vp09.02.50.10.01.09.18.09.00', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'vp9-sdr-format', 'ext': 'mp4', 'vcodec': 'vp09.00.50.08', 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'h265-format', 'ext': 'mp4', 'vcodec': 'h265', 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9.2']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-hdr-format') + + ydl = YDL({'format': 'bestvideo', 'format_sort': ['+vcodec:vp9']}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'vp9-sdr-format') + def test_format_selection_string_ops(self): 
formats = [ {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, diff --git a/test/test_networking.py b/test/test_networking.py index 826f11a561..d96624af18 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -822,6 +822,24 @@ def test_remove_logging_handler(self, handler, logger_name): rh.close() assert len(logging_handlers) == before_count + def test_wrap_request_errors(self): + class TestRequestHandler(RequestHandler): + def _validate(self, request): + if request.headers.get('x-fail'): + raise UnsupportedRequest('test error') + + def _send(self, request: Request): + raise RequestError('test error') + + with TestRequestHandler(logger=FakeLogger()) as rh: + with pytest.raises(UnsupportedRequest, match='test error') as exc_info: + rh.validate(Request('http://example.com', headers={'x-fail': '1'})) + assert exc_info.value.handler is rh + + with pytest.raises(RequestError, match='test error') as exc_info: + rh.send(Request('http://example.com')) + assert exc_info.value.handler is rh + @pytest.mark.parametrize('handler', ['Urllib'], indirect=True) class TestUrllibRequestHandler(TestRequestHandlerBase): diff --git a/test/test_traversal.py b/test/test_traversal.py index 5d9fbe1d16..9179dadda4 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -4,8 +4,18 @@ import pytest -from yt_dlp.utils import dict_get, int_or_none, str_or_none -from yt_dlp.utils.traversal import traverse_obj +from yt_dlp.utils import ( + ExtractorError, + determine_ext, + dict_get, + int_or_none, + str_or_none, +) +from yt_dlp.utils.traversal import ( + traverse_obj, + require, + subs_list_to_dict, +) _TEST_DATA = { 100: 100, @@ -420,6 +430,71 @@ def test_traversal_morsel(self): assert traverse_obj(morsel, [(None,), any]) == morsel, \ 'Morsel should not be implicitly changed to dict on usage' + def test_traversal_filter(self): + data = [None, False, True, 0, 1, 0.0, 1.1, '', 'str', {}, {0: 0}, [], [1]] + + assert traverse_obj(data, [..., filter]) == [True, 1, 1.1, 'str', {0: 0}, [1]], \ + '`filter` should filter falsy values' + + +class TestTraversalHelpers: + def test_traversal_require(self): + with pytest.raises(ExtractorError): + traverse_obj(_TEST_DATA, ['None', {require('value')}]) + assert traverse_obj(_TEST_DATA, ['str', {require('value')}]) == 'str', \ + '`require` should pass through non `None` values' + + def test_subs_list_to_dict(self): + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.vtt'}, + {'name': 'en', 'url': 'https://example.com/subs/en1.ass'}, + {'name': 'en', 'url': 'https://example.com/subs/en2.ass'}, + ], [..., { + 'id': 'name', + 'url': 'url', + }, all, {subs_list_to_dict}]) == { + 'de': [{'url': 'https://example.com/subs/de.vtt'}], + 'en': [ + {'url': 'https://example.com/subs/en1.ass'}, + {'url': 'https://example.com/subs/en2.ass'}, + ], + }, 'function should build subtitle dict from list of subtitles' + assert traverse_obj([ + {'name': 'de', 'url': 'https://example.com/subs/de.ass'}, + {'name': 'de'}, + {'name': 'en', 'content': 'content'}, + {'url': 'https://example.com/subs/en'}, + ], [..., { + 'id': 'name', + 'data': 'content', + 'url': 'url', + }, all, {subs_list_to_dict}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass'}], + 'en': [{'data': 'content'}], + }, 'subs with mandatory items missing should be filtered' + assert traverse_obj([ + {'url': 'https://example.com/subs/de.ass', 'name': 'de'}, + {'url': 'https://example.com/subs/en', 'name': 'en'}, + ], [..., { + 'id': 'name', + 'ext': ['url', {lambda x: 
determine_ext(x, default_ext=None)}], + 'url': 'url', + }, all, {subs_list_to_dict(ext='ext')}]) == { + 'de': [{'url': 'https://example.com/subs/de.ass', 'ext': 'ass'}], + 'en': [{'url': 'https://example.com/subs/en', 'ext': 'ext'}], + }, '`ext` should set default ext but leave existing value untouched' + assert traverse_obj([ + {'name': 'en', 'url': 'https://example.com/subs/en2', 'prio': True}, + {'name': 'en', 'url': 'https://example.com/subs/en1', 'prio': False}, + ], [..., { + 'id': 'name', + 'quality': ['prio', {int}], + 'url': 'url', + }, all, {subs_list_to_dict(ext='ext')}]) == {'en': [ + {'url': 'https://example.com/subs/en1', 'ext': 'ext'}, + {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, + ]}, '`quality` key should sort subtitle list accordingly' + class TestDictGet: def test_dict_get(self): diff --git a/test/test_utils.py b/test/test_utils.py index a2b4593527..d4b846f56f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -221,9 +221,10 @@ def test_sanitize_ids(self): self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') def test_sanitize_path(self): - if sys.platform != 'win32': - return + with unittest.mock.patch('sys.platform', 'win32'): + self._test_sanitize_path() + def _test_sanitize_path(self): self.assertEqual(sanitize_path('abc'), 'abc') self.assertEqual(sanitize_path('abc/def'), 'abc\\def') self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') @@ -256,6 +257,11 @@ def test_sanitize_path(self): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + self.assertEqual(sanitize_path('\\abc'), '\\abc') + self.assertEqual(sanitize_path('C:abc'), 'C:abc') + self.assertEqual(sanitize_path('C:abc\\..\\'), 'C:..') + self.assertEqual(sanitize_path('C:\\abc:%(title)s.%(ext)s'), 'C:\\abc#%(title)s.%(ext)s') + def test_sanitize_url(self): self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar') self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') @@ -921,6 +927,11 @@ def test_parse_codecs(self): 'acodec': 'none', 'dynamic_range': 'HDR10', }) + self.assertEqual(parse_codecs('vp09.02.50.10.01.09.18.09.00'), { + 'vcodec': 'vp09.02.50.10.01.09.18.09.00', + 'acodec': 'none', + 'dynamic_range': 'HDR10', + }) self.assertEqual(parse_codecs('av01.0.12M.10.0.110.09.16.09.0'), { 'vcodec': 'av01.0.12M.10.0.110.09.16.09.0', 'acodec': 'none', diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9691a1ea7c..eea1065036 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -27,7 +27,7 @@ from .cache import Cache from .compat import urllib # isort: split from .compat import compat_os_name, urllib_req_to_req -from .cookies import LenientSimpleCookie, load_cookies +from .cookies import CookieLoadError, LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor @@ -1624,7 +1624,7 @@ def wrapper(self, *args, **kwargs): while True: try: return func(self, *args, **kwargs) - except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): + except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError): raise except ReExtractInfo as e: if e.expected: @@ -3580,6 +3580,8 @@ def __download_wrapper(self, func): def wrapper(*args, **kwargs): try: res = func(*args, **kwargs) + except CookieLoadError: + raise except UnavailableVideoError as e: self.report_error(e) except DownloadCancelled as 
e: @@ -4068,6 +4070,10 @@ def get_encoding(stream): write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') + if os.environ.get('YTDLP_NO_PLUGINS'): + write_debug('Plugins are forcibly disabled') + return + for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): display_list = ['{}{}'.format( klass.__name__, '' if klass.__name__ == name else f' as {name}') @@ -4113,8 +4119,14 @@ def proxies(self): @functools.cached_property def cookiejar(self): """Global cookiejar instance""" - return load_cookies( - self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + try: + return load_cookies( + self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) + except CookieLoadError as error: + cause = error.__context__ + # compat: <=py3.9: `traceback.format_exception` has a different signature + self.report_error(str(cause), tb=''.join(traceback.format_exception(None, cause, cause.__traceback__))) + raise @property def _opener(self): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index c0b8e3b507..f598b6c2fe 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -15,7 +15,7 @@ import traceback from .compat import compat_os_name -from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS, CookieLoadError from .downloader.external import get_external_downloader from .extractor import list_extractor_classes from .extractor.adobepass import MSO_INFO @@ -235,6 +235,11 @@ def validate_minmax(min_val, max_val, min_name, max_name=None): validate_regex('format sorting', f, FormatSorter.regex) # Postprocessor formats + if opts.convertsubtitles == 'none': + opts.convertsubtitles = None + if opts.convertthumbnails == 'none': + opts.convertthumbnails = None + validate_regex('merge output format', opts.merge_output_format, r'({0})(/({0}))*'.format('|'.join(map(re.escape, FFmpegMergerPP.SUPPORTED_EXTS)))) validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) @@ -1079,7 +1084,7 @@ def main(argv=None): _IN_CLI = True try: _exit(*variadic(_real_main(argv))) - except DownloadError: + except (CookieLoadError, DownloadError): _exit(1) except SameFileError as e: _exit(f'ERROR: {e}') diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 070d2fcb98..4a69c576be 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -34,6 +34,7 @@ from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( DownloadError, + YoutubeDLError, Popen, error_to_str, expand_path, @@ -86,24 +87,31 @@ def _create_progress_bar(logger): return printer +class CookieLoadError(YoutubeDLError): + pass + + def load_cookies(cookie_file, browser_specification, ydl): - cookie_jars = [] - if browser_specification is not None: - browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) - cookie_jars.append( - extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) + try: + cookie_jars = [] + if browser_specification is not None: + browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) + cookie_jars.append( + extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) - if cookie_file is not None: - is_filename = is_path_like(cookie_file) - if is_filename: - cookie_file = expand_path(cookie_file) 
+ if cookie_file is not None: + is_filename = is_path_like(cookie_file) + if is_filename: + cookie_file = expand_path(cookie_file) - jar = YoutubeDLCookieJar(cookie_file) - if not is_filename or os.access(cookie_file, os.R_OK): - jar.load() - cookie_jars.append(jar) + jar = YoutubeDLCookieJar(cookie_file) + if not is_filename or os.access(cookie_file, os.R_OK): + jar.load() + cookie_jars.append(jar) - return _merge_cookie_jars(cookie_jars) + return _merge_cookie_jars(cookie_jars) + except Exception: + raise CookieLoadError('failed to load cookies') def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None): @@ -1053,8 +1061,9 @@ class DATA_BLOB(ctypes.Structure): ctypes.byref(blob_out), # pDataOut ) if not ret: - logger.warning('failed to decrypt with DPAPI', only_once=True) - return None + message = 'Failed to decrypt with DPAPI. See https://github.com/yt-dlp/yt-dlp/issues/10927 for more info' + logger.error(message) + raise DownloadError(message) # force exit result = ctypes.string_at(blob_out.pbData, blob_out.cbData) ctypes.windll.kernel32.LocalFree(blob_out.pbData) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ae2372915b..6c1ec403c8 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -508,7 +508,7 @@ def _call_downloader(self, tmpfilename, info_dict): env = None proxy = self.params.get('proxy') if proxy: - if not re.match(r'^[\da-zA-Z]+://', proxy): + if not re.match(r'[\da-zA-Z]+://', proxy): proxy = f'http://{proxy}' if proxy.startswith('socks'): @@ -559,7 +559,7 @@ def _call_downloader(self, tmpfilename, info_dict): selected_formats = info_dict.get('requested_formats') or [info_dict] for i, fmt in enumerate(selected_formats): - is_http = re.match(r'^https?://', fmt['url']) + is_http = re.match(r'https?://', fmt['url']) cookies = self.ydl.cookiejar.get_cookies_for_url(fmt['url']) if is_http else [] if cookies: args.extend(['-cookies', ''.join( diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 000a3cabb7..1f8dfb4ec8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -217,6 +217,7 @@ BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, ) +from .beacon import BeaconTvIE from .beatbump import ( BeatBumpPlaylistIE, BeatBumpVideoIE, @@ -730,6 +731,7 @@ GeniusIE, GeniusLyricsIE, ) +from .germanupa import GermanupaIE from .getcourseru import ( GetCourseRuIE, GetCourseRuPlayerIE, @@ -823,7 +825,10 @@ HungamaIE, HungamaSongIE, ) -from .huya import HuyaLiveIE +from .huya import ( + HuyaLiveIE, + HuyaVideoIE, +) from .hypem import HypemIE from .hypergryph import MonsterSirenHypergryphMusicIE from .hytale import HytaleIE @@ -946,6 +951,7 @@ ) from .kicker import KickerIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -1037,10 +1043,7 @@ LivestreamShortenerIE, ) from .livestreamfails import LivestreamfailsIE -from .lnkgo import ( - LnkGoIE, - LnkIE, -) +from .lnk import LnkIE from .loom import ( LoomFolderIE, LoomIE, @@ -1165,6 +1168,7 @@ ) from .mlssoccer import MLSSoccerIE from .mocha import MochaVideoIE +from .mojevideo import MojevideoIE from .mojvideo import MojvideoIE from .monstercat import MonstercatIE from .motherless import ( @@ -1811,6 +1815,7 @@ from .screencast import ScreencastIE from .screencastify import ScreencastifyIE from .screencastomatic import ScreencastOMaticIE +from .screenrec 
import ScreenRecIE from .scrippsnetworks import ( ScrippsNetworksIE, ScrippsNetworksWatchIE, @@ -1821,6 +1826,7 @@ SCTECourseIE, ) from .sejmpl import SejmIE +from .sen import SenIE from .senalcolombia import SenalColombiaLiveIE from .senategov import ( SenateGovIE, @@ -1876,6 +1882,7 @@ from .slideslive import SlidesLiveIE from .slutload import SlutloadIE from .smotrim import SmotrimIE +from .snapchat import SnapchatSpotlightIE from .snotr import SnotrIE from .sohu import ( SohuIE, @@ -2312,6 +2319,7 @@ VideomoreVideoIE, ) from .videopress import VideoPressIE +from .vidflex import VidflexIE from .vidio import ( VidioIE, VidioLiveIE, diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 7518ba6f0d..7296be73b3 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -387,17 +387,27 @@ class ABCIViewShowSeriesIE(InfoExtractor): 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$', }, 'playlist_count': 15, + 'skip': 'This program is not currently available in ABC iview', + }, { + 'url': 'https://iview.abc.net.au/show/inbestigators', + 'info_dict': { + 'id': '175343-1', + 'title': 'Series 1', + 'description': 'md5:b9976935a6450e5b78ce2a940a755685', + 'series': 'The Inbestigators', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.+\.jpg', + }, + 'playlist_count': 17, }] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - webpage_data = self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', - webpage, 'initial state') - video_data = self._parse_json( - unescapeHTML(webpage_data).encode().decode('unicode_escape'), show_id) - video_data = video_data['route']['pageData']['_embedded'] + video_data = self._search_json( + r'window\.__INITIAL_STATE__\s*=\s*[\'"]', webpage, 'initial state', show_id, + transform_source=lambda x: x.encode().decode('unicode_escape'), + end_pattern=r'[\'"]\s*;')['route']['pageData']['_embedded'] highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl']) if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'): diff --git a/yt_dlp/extractor/academicearth.py b/yt_dlp/extractor/academicearth.py index d9691cb5c6..b997a02885 100644 --- a/yt_dlp/extractor/academicearth.py +++ b/yt_dlp/extractor/academicearth.py @@ -4,7 +4,7 @@ class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)' + _VALID_URL = r'https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)' IE_NAME = 'AcademicEarth:Course' _TEST = { 'url': 'http://academicearth.org/playlists/laws-of-nature/',
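Note on the AcademicEarth change just above, a cleanup this patch repeats for radiofrance, reverbnation, and tele13 further down: yt-dlp applies _VALID_URL with re.match, which is already anchored at the start of the string, so the leading '^' was dead weight. A minimal sketch of why the anchor is redundant (URL hypothetical):

import re

# re.match only ever matches at position 0, so '^' adds nothing here
pattern = r'https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
m = re.match(pattern, 'https://academicearth.org/playlists/laws-of-nature/')
assert m and m.group('id') == 'laws-of-nature'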
diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index eb7e597e52..7cc15ec7b6 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -1355,6 +1355,7 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + _MODERN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0' _MVPD_CACHE = 'ap-mvpd' _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' @@ -1454,7 +1455,11 @@ def extract_redirect_url(html, url=None, fatal=False): 'no_iframe': 'false', 'domain_name': 'adobe.com', 'redirect_url': url, - }) + }, headers={ + # yt-dlp's default user-agent is usually too old for Comcast_SSO + # See: https://github.com/yt-dlp/yt-dlp/issues/10848 + 'User-Agent': self._MODERN_USER_AGENT, + } if mso_id == 'Comcast_SSO' else None) elif not self._cookies_passed: raise_mvpd_required() diff --git a/yt_dlp/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py index bd301e904a..b99d24e0eb 100644 --- a/yt_dlp/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py @@ -1,27 +1,42 @@ from .common import InfoExtractor from ..utils import ( - clean_html, clean_podcast_url, - get_element_by_class, int_or_none, parse_iso8601, - try_get, ) +from ..utils.traversal import traverse_obj class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/ferreck-dawn-to-the-break-of-dawn-117/id1625658232?i=1000665010654', + 'md5': '82cc219b8cc1dcf8bfc5a5e99b23b172', + 'info_dict': { + 'id': '1000665010654', + 'ext': 'mp3', + 'title': 'Ferreck Dawn - To The Break of Dawn 117', + 'episode': 'Ferreck Dawn - To The Break of Dawn 117', + 'description': 'md5:1fc571102f79dbd0a77bfd71ffda23bc', + 'upload_date': '20240812', + 'timestamp': 1723449600, + 'duration': 3596, + 'series': 'Ferreck Dawn - To The Break of Dawn', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', + }, + }, { 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': '41dc31cd650143e530d9423b6b5a344f', + 'md5': 'baf8a6b8b8aa6062dbb4639ed73d0052', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', + 'episode': '207 - Whitney Webb Returns', + 'episode_number': 207, 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', 'timestamp': 1593932400, - 'duration': 6454, + 'duration': 5369, 'series': 'The Tim Dillon Show', 'thumbnail': 're:.+[.](png|jpe?g|webp)', }, @@ -39,47 +54,24 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - episode_data = {} - ember_data = {} - # new page type 2021-11 - amp_data = self._parse_json(self._search_regex( - r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', - webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} - amp_data = try_get(amp_data, - lambda a: self._parse_json( - next(a[x] for x in iter(a) if episode_id in x), - episode_id), - dict) or {} - amp_data = amp_data.get('d') or [] - episode_data = try_get( - amp_data, - lambda a: next(x for x in a - if x['type'] == 'podcast-episodes' and x['id'] == episode_id), - dict) - if not episode_data: - # try pre 2021-11 page type: TODO: consider deleting if no longer used - ember_data = self._parse_json(self._search_regex( - r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) or {} - ember_data = ember_data.get(episode_id) or ember_data - episode_data = try_get(ember_data, lambda x: x['data'], dict) - episode = episode_data['attributes'] - description = episode.get('description') or {} - - series = None - for inc in (amp_data or ember_data.get('included') or []): - if inc.get('type') == 'media/podcast': - series = try_get(inc, lambda x: x['attributes']['name']) - series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
+ server_data = self._search_json( + r'<script [^>]*\bid=["\']serialized-server-data["\'][^>]*>', webpage, + 'server data', episode_id, contains_pattern=r'\[{(?s:.+)}\]')[0]['data'] diff --git a/yt_dlp/extractor/nzz.py b/yt_dlp/extractor/nzz.py --- a/yt_dlp/extractor/nzz.py +++ b/yt_dlp/extractor/nzz.py + def _entries(self, webpage, page_id): + for script in re.findall(r'(?s)<script[^>]*>(.+?)</script>', webpage): + settings = self._search_json(r'var\s+settings\s*=[^{]*', script, 'settings', page_id, fatal=False) + if entry := self._parse_jwplayer_data(settings, page_id): + yield entry + def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - entries = [] - for player_element in re.findall( - r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): - player_params = extract_attributes(player_element) - if player_params.get('data-type') not in ('kaltura_singleArticle',): - self.report_warning('Unsupported player type') - continue - entry_id = player_params['data-id'] - entries.append(self.url_result( - 'kaltura:1750922:' + entry_id, 'Kaltura', entry_id)) - - return self.playlist_result(entries, page_id) + return self.playlist_result(self._entries(webpage, page_id), page_id)
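The NZZ rewrite above trades an eagerly built list for a generator passed straight to playlist_result, with the walrus operator dropping scripts that hold no player data. A minimal sketch of the pattern, where parse_script is a hypothetical stand-in for self._parse_jwplayer_data:

def _entries(scripts):
    for script in scripts:
        if entry := parse_script(script):  # skip scripts that parse to nothing
            yield entry

Nothing is parsed until the entries are actually iterated, which keeps per-item work lazy.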
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 4489d533a6..4d668cd37d 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -1,3 +1,4 @@ +import functools import itertools import urllib.parse @@ -22,13 +23,19 @@ class PatreonBaseIE(InfoExtractor): - USER_AGENT = 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)' + @functools.cached_property + def patreon_user_agent(self): + # Patreon mobile UA is needed to avoid triggering Cloudflare anti-bot protection. + # Newer UA yields higher res m3u8 formats for locked posts, but gives 401 if not logged-in + if self._get_cookies('https://www.patreon.com/').get('session_id'): + return 'Patreon/72.2.28 (Android; Android 14; Scale/2.10)' + return 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)' def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None): if headers is None: headers = {} if 'User-Agent' not in headers: - headers['User-Agent'] = self.USER_AGENT + headers['User-Agent'] = self.patreon_user_agent if query: query.update({'json-api-version': 1.0}) @@ -48,6 +55,7 @@ def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None class PatreonIE(PatreonBaseIE): + IE_NAME = 'patreon' _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.patreon.com/creation?hid=743933', @@ -111,6 +119,7 @@ class PatreonIE(PatreonBaseIE): 'comment_count': int, 'channel_is_verified': True, 'chapters': 'count:4', + 'timestamp': 1423689666, }, 'params': { 'noplaylist': True, @@ -221,6 +230,7 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': r're:^https?://.+', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # multiple attachments/embeds 'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977', @@ -326,8 +336,13 @@ def _real_extract(self, url): if embed_url and (urlh := self._request_webpage( embed_url, video_id, 'Checking embed URL', headers=headers, fatal=False, errnote=False, expected_status=403)): + # Vimeo's Cloudflare anti-bot protection will return HTTP status 200 for 404, so we need + # to check for "Sorry, we couldn&rsquo;t find that page" in the meta description tag + meta_description = clean_html(self._html_search_meta( + 'description', self._webpage_read_content(urlh, embed_url, video_id, fatal=False), default=None)) # Password-protected vids.io embeds return 403 errors w/o --video-password or session cookie - if urlh.status != 403 or VidsIoIE.suitable(embed_url): + if ((urlh.status != 403 and meta_description != 'Sorry, we couldn’t find that page') + or VidsIoIE.suitable(embed_url)): entries.append(self.url_result(smuggle_url(embed_url, headers))) post_file = traverse_obj(attributes, ('post_file', {dict})) @@ -419,15 +434,19 @@ def _get_comments(self, post_id): class PatreonCampaignIE(PatreonBaseIE): - - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m|api/campaigns)/(?P<campaign_id>\d+)|(?P<vanity>[-\w]+))' + IE_NAME = 'patreon:campaign' + _VALID_URL = r'''(?x) + https?://(?:www\.)?patreon\.com/(?: + (?:m|api/campaigns)/(?P<campaign_id>\d+)| + (?P<vanity>(?!creation[?/]|posts/|rss[?/])[\w-]+) + )(?:/posts)?/?(?:$|[?#])''' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { 'title': 'Cognitive Dissonance Podcast', 'channel_url': 'https://www.patreon.com/dissonancepod', 'id': '80642', - 'description': 'md5:eb2fa8b83da7ab887adeac34da6b7af7', + 'description': r're:(?s).*We produce a weekly news podcast focusing on stories that deal with skepticism and religion.*', 'channel_id': '80642', 'channel': 'Cognitive Dissonance Podcast', 'age_limit': 0, @@ -445,7 +464,7 @@ class PatreonCampaignIE(PatreonBaseIE): 'id': '4767637', 'channel_id': '4767637', 'channel_url': 'https://www.patreon.com/notjustbikes', - 'description': 'md5:9f4b70051216c4d5c58afe580ffc8d0f', + 'description': r're:(?s).*Not Just Bikes started as a way to explain why we chose to live in the Netherlands.*', 'age_limit': 0, 'channel': 'Not Just Bikes', 'uploader_url': 'https://www.patreon.com/notjustbikes', @@ -462,7 +481,7 @@ class PatreonCampaignIE(PatreonBaseIE): 'id': '4243769', 'channel_id': '4243769', 'channel_url': 'https://www.patreon.com/secondthought', - 'description': 'md5:69c89a3aba43efdb76e85eb023e8de8b', + 'description': r're:(?s).*Second Thought is an educational YouTube channel.*', 'age_limit': 0, 'channel': 'Second Thought', 'uploader_url': 'https://www.patreon.com/secondthought', @@ -482,10 +501,6 @@ class PatreonCampaignIE(PatreonBaseIE): 'only_matching': True, }] - @classmethod - def suitable(cls, url): - return False if PatreonIE.suitable(url) else super().suitable(url) - def _entries(self, campaign_id): cursor = None params = { @@ -512,7 +527,7 @@ def _real_extract(self, url): campaign_id, vanity = self._match_valid_url(url).group('campaign_id', 'vanity') if campaign_id is None: - webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.USER_AGENT}) + webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.patreon_user_agent}) campaign_id = self._search_nextjs_data( webpage, vanity)['props']['pageProps']['bootstrapEnvelope']['pageBootstrap']['campaign']['data']['id']
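With the rewritten campaign pattern above, URL routing lives entirely in the regex: the negative lookahead on the vanity branch rejects post and RSS URLs, so the old suitable() override that deferred to PatreonIE is no longer needed. A quick illustration (inputs hypothetical):

import re

VANITY = r'(?!creation[?/]|posts/|rss[?/])[\w-]+'
assert re.match(VANITY, 'dissonancepod')  # plain vanity slug is accepted
assert not re.match(VANITY, 'posts/99')   # post URLs fall through to PatreonIE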
diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 07f249498c..f0b38893b2 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -109,7 +109,7 @@ def _extract_video(self, data, extract_formats=True): class PinterestIE(PinterestBaseIE): - _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P<id>\d+)' + _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?:[\w-]+--)?(?P<id>\d+)' _TESTS = [{ # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', @@ -174,6 +174,25 @@ class PinterestIE(PinterestBaseIE): }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', 'only_matching': True, + }, + { + 'url': 'https://pinterest.com/pin/dive-into-serenity-blue-lagoon-pedi-nails-for-a-tranquil-and-refreshing-spa-experience-video-in-2024--2885187256207927', + 'info_dict': { + 'id': '2885187256207927', + 'ext': 'mp4', + 'title': 'Dive into Serenity: Blue Lagoon Pedi Nails for a Tranquil and Refreshing Spa Experience! 💙💅', + 'description': 'md5:5da41c767d2317e42e49b663b0b2150f', + 'uploader': 'Glamour Artistry |Everyday Outfits, Luxury Fashion & Nail Designs', + 'uploader_id': '1142999717836434688', + 'upload_date': '20240702', + 'timestamp': 1719939156, + 'duration': 7.967, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': ['#BlueLagoonPediNails', '#SpaExperience'], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 679dc63234..e1e9777e8e 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -628,8 +628,7 @@ def is_404(e): page_entries = self._extract_entries(webpage, host) if not page_entries: break - for e in page_entries: - yield e + yield from page_entries if not self._has_more(webpage): break diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index ff21963541..9d90439841 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -16,7 +16,7 @@ class RadioFranceIE(InfoExtractor): - _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' + _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' IE_NAME = 'radiofrance' _TEST = { diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index bc3e5f7eee..b633dc48af 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -1,3 +1,4 @@ +import json import urllib.parse from .common import InfoExtractor @@ -17,7 +18,7 @@ class RedditIE(InfoExtractor): _NETRC_MACHINE = 'reddit' - _VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' + _VALID_URL = r'https?://(?:\w+\.)?reddit(?:media)?\.com/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -251,15 +252,15 @@ def _get_subtitles(self, video_id): return {'en': [{'url': caption_url}]} def _real_extract(self, url): - host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') + slug, video_id = self._match_valid_url(url).group('slug', 'id') - data = self._download_json( - f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403) - if not data: - fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com' - self.to_screen(f'{host} request failed, retrying with {fallback_host}') + try: data = self._download_json( - f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403) + f'https://www.reddit.com/{slug}/.json', video_id, expected_status=403) + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError): + self.raise_login_required('Account authentication is required') + raise if traverse_obj(data, 'error') == 403: reason = data.get('reason') diff --git a/yt_dlp/extractor/reverbnation.py b/yt_dlp/extractor/reverbnation.py index ddf8c3753f..f3bcc2c328 100644 --- a/yt_dlp/extractor/reverbnation.py +++ b/yt_dlp/extractor/reverbnation.py @@ -6,7 +6,7 @@ class ReverbNationIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' + _VALID_URL = r'https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$' _TESTS = [{ 'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa', 'md5': 'c0aaf339bcee189495fdf5a8c8ba8645',
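The Reddit hunk above keys its login prompt off the cause of the wrapped error: when Reddit serves an HTML login wall instead of JSON, the failure surfaces as an ExtractorError whose .cause is a json.JSONDecodeError. A hedged sketch of that inspection pattern, with fetch_json as a hypothetical stand-in for _download_json:

import json

from yt_dlp.utils import ExtractorError

def fetch_json(raw):
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        raise ExtractorError('Failed to parse JSON', cause=err)

try:
    fetch_json('<html>please sign in</html>')
except ExtractorError as e:
    if isinstance(e.cause, json.JSONDecodeError):
        print('Account authentication is required')  # the extractor calls raise_login_required() here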
diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py index 944e8636ab..26aec2e4cc 100644 --- a/yt_dlp/extractor/rtp.py +++ b/yt_dlp/extractor/rtp.py @@ -8,7 +8,7 @@ class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -19,9 +19,25 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': r're:^https?://.*\.jpg', }, + }, { + 'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril', + 'md5': '9a81ed53f2b2197cfa7ed455b12f8ade', + 'info_dict': { + 'id': 'e757904', + 'ext': 'mp4', + 'title': '25 Curiosidades, 25 de Abril', + 'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', + 'only_matching': True, + }, { + 'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', + 'only_matching': True, }] _RX_OBFUSCATION = re.compile(r'''(?xs) @@ -49,17 +65,17 @@ def _real_extract(self, url): f, config = self._search_regex( r'''(?sx) - var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s* + (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)? var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) ''', webpage, 'player config', group=('f', 'config')) - f = self._parse_json( - f, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) config = self._parse_json( config, video_id, lambda data: self.__unobfuscate(data, video_id=video_id)) + f = config['file'] if not f else self._parse_json( + f, video_id, + lambda data: self.__unobfuscate(data, video_id=video_id)) formats = [] if isinstance(f, dict): diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index db780a2cf4..74c7e4f176 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -8,14 +8,17 @@ UnsupportedError, clean_html, determine_ext, + extract_attributes, format_field, get_element_by_class, + get_elements_html_by_class, int_or_none, join_nonempty, parse_count, parse_iso8601, traverse_obj, unescapeHTML, + urljoin, ) @@ -382,8 +385,10 @@ def entries(self, url, playlist_id): if isinstance(e.cause, HTTPError) and e.cause.status == 404: break raise - for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage): - yield self.url_result('https://rumble.com' + video_url) + for video_url in traverse_obj( + get_elements_html_by_class('videostream__link', webpage), (..., {extract_attributes}, 'href'), + ): + yield self.url_result(urljoin('https://rumble.com', video_url)) def _real_extract(self, url): url, playlist_id = self._match_valid_url(url).groups()
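The Rumble hunk above replaces a hand-rolled HTML regex with the project's structured helpers, which tolerate attribute reordering and extra classes. A small demonstration of the combination (sample markup hypothetical):

from yt_dlp.utils import extract_attributes, get_elements_html_by_class
from yt_dlp.utils.traversal import traverse_obj

webpage = '<a class="videostream__link link--full" href="/v5abcd-demo-clip.html">clip</a>'
hrefs = traverse_obj(
    get_elements_html_by_class('videostream__link', webpage),
    (..., {extract_attributes}, 'href'))
assert hrefs == ['/v5abcd-demo-clip.html']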
diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index d389b32091..2c416811af 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -6,6 +6,7 @@ determine_ext, int_or_none, parse_qs, + traverse_obj, try_get, unified_timestamp, url_or_none, @@ -80,6 +81,8 @@ def _extract_formats(self, options, video_id): 'url': format_url, 'format_id': format_id, }) + for hls_url in traverse_obj(options, ('live_streams', 'hls', ..., 'url', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, ext='mp4', fatal=False)) return formats def _download_and_extract_formats(self, video_id, query=None): @@ -90,7 +93,7 @@ def _download_and_extract_formats(self, video_id, query=None): class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:(?:live/)?video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})' _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ @@ -164,6 +167,29 @@ class RutubeIE(RutubeBaseIE): 'uploader': 'Стас Быков', }, 'expected_warnings': ['Unable to download f4m'], + }, { + 'url': 'https://rutube.ru/live/video/c58f502c7bb34a8fcdd976b221fca292/', + 'info_dict': { + 'id': 'c58f502c7bb34a8fcdd976b221fca292', + 'ext': 'mp4', + 'categories': ['Телепередачи'], + 'description': '', + 'thumbnail': 'http://pic.rutubelist.ru/video/14/19/14190807c0c48b40361aca93ad0867c7.jpg', + 'live_status': 'is_live', + 'age_limit': 0, + 'uploader_id': '23460655', + 'timestamp': 1652972968, + 'view_count': int, + 'upload_date': '20220519', + 'title': r're:Первый канал. Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'uploader': 'Первый канал', + }, + }, { + 'url': 'https://rutube.ru/video/5ab908fccfac5bb43ef2b1e4182256b0/', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/live/video/private/c58f502c7bb34a8fcdd976b221fca292/', + 'only_matching': True, }] @classmethod diff --git a/yt_dlp/extractor/samplefocus.py b/yt_dlp/extractor/samplefocus.py index 36ceb0254d..3db3ce1424 100644 --- a/yt_dlp/extractor/samplefocus.py +++ b/yt_dlp/extractor/samplefocus.py @@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_webpage(url, display_id, impersonate=True) sample_id = self._search_regex( r'<input[^>]+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P<id>\d+)', @@ -82,7 +82,15 @@ def extract_count(klass): return { 'id': sample_id, 'title': title, - 'url': mp3_url, + 'formats': [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + 'http_headers': { + 'Referer': url, + }, + }], 'display_id': display_id, 'thumbnail': thumbnail, 'uploader': uploader, diff --git a/yt_dlp/extractor/screenrec.py b/yt_dlp/extractor/screenrec.py new file mode 100644 index 0000000000..64f8d2494a --- /dev/null +++ b/yt_dlp/extractor/screenrec.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor + + +class ScreenRecIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?screenrec\.com/share/(?P<id>\w{10})' + _TESTS = [{ + 'url': 'https://screenrec.com/share/DasLtbknYo', + 'info_dict': { + 'id': 'DasLtbknYo', + 'ext': 'mp4', + 'title': '02.05.2024_03.01.25_REC', + 'description': 'Recorded with ScreenRec', + 'thumbnail': r're:^https?://.*\.gif$', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = self._search_regex( + r'customUrl\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 URL', group='url') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'), + }
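The customUrl regex in the new ScreenRec extractor uses a quote-agnostic capture: group 1 grabs whichever quote opens the value, (?:(?!\1).)+ consumes anything that is not that quote, and the trailing \1 demands the matching closer. A small check (page snippet hypothetical):

import re

pattern = r'customUrl\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1'
page = "player = { customUrl: 'https://cdn.example.com/master.m3u8' };"
assert re.search(pattern, page).group('url') == 'https://cdn.example.com/master.m3u8'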
diff --git a/yt_dlp/extractor/sen.py new file mode 100644 index 0000000000..d8f14ecdc0 --- /dev/null +++ b/yt_dlp/extractor/sen.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import url_or_none +from ..utils.traversal import traverse_obj + + +class SenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sen\.com/video/(?P<id>[0-9a-f-]+)' + _TEST = { + 'url': 'https://www.sen.com/video/eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'md5': 'ff615aca9691053c94f8f10d96cd7884', + 'info_dict': { + 'id': 'eef46eb1-4d79-4e28-be9d-bd937767f8c4', + 'ext': 'mp4', + 'description': 'Florida, 28 Sep 2022', + 'title': 'Hurricane Ian', + 'tags': ['North America', 'Storm', 'Weather'], + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + api_data = self._download_json(f'https://api.sen.com/content/public/video/{video_id}', video_id) + m3u8_url = (traverse_obj(api_data, ( + 'data', 'nodes', lambda _, v: v['id'] == 'player', 'video', 'url', {url_or_none}, any)) + or f'https://vod.sen.com/videos/{video_id}/manifest.m3u8') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4'), + **traverse_obj(api_data, ('data', 'nodes', lambda _, v: v['id'] == 'details', any, 'content', { + 'title': ('title', 'text', {str}), + 'description': ('descriptions', 0, 'text', {str}), + 'tags': ('badges', ..., 'text', {str}), + })), + } diff --git a/yt_dlp/extractor/servus.py b/yt_dlp/extractor/servus.py index 117f180814..841c7ebf33 100644 --- a/yt_dlp/extractor/servus.py +++ b/yt_dlp/extractor/servus.py @@ -27,7 +27,7 @@ class ServusIE(InfoExtractor): 'info_dict': { 'id': 'AA-28BYCQNH92111', 'ext': 'mp4', - 'title': 'Klettersteige in den Alpen', + 'title': 'Vie Ferrate - Klettersteige in den Alpen', 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2823, @@ -38,6 +38,7 @@ class ServusIE(InfoExtractor): 'season_number': 11, 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen', 'episode_number': 8, + 'categories': ['Bergwelten'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -71,8 +72,11 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() + webpage = self._download_webpage(url, video_id) + next_data = self._search_nextjs_data(webpage, video_id, fatal=False) + video = self._download_json( - 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', + 'https://api-player.redbull.com/stv/servus-tv-playnet', video_id, 'Downloading video JSON', query={'videoId': video_id}) if not video.get('videoUrl'): self._report_errors(video) @@ -89,7 +93,7 @@ def _real_extract(self, url): return { 'id': video_id, 'title': video.get('title'), - 'description': self._get_description(video_id) or video.get('description'), + 'description': self._get_description(next_data) or video.get('description'), 'thumbnail': video.get('poster'), 'duration': float_or_none(video.get('duration')), 'timestamp': unified_timestamp(video.get('currentSunrise')), @@ -100,16 +104,19 @@ def _real_extract(self, url): 'episode_number': episode_number, 'formats': formats, 'subtitles': subtitles, + **traverse_obj(next_data, ('props', 'pageProps', 'data', { + 'title': ('title', 'rendered', {str}), + 'timestamp': ('stv_date', 'raw', {int}), + 'duration': ('stv_duration', {float_or_none}), + 'categories': ('category_names', ..., {str}), + })), } - def _get_description(self, video_id): - info = self._download_json( - f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', - video_id, fatal=False) - - return join_nonempty(*traverse_obj(info, ( - ('stv_short_description', 'stv_long_description'), - {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n') + def _get_description(self, next_data): + return join_nonempty(*traverse_obj(next_data, ( + 'props', 'pageProps', 'data', + ('stv_short_description', 'stv_long_description'), {str}, + {lambda x: x.replace('\n\n', '\n')}, {unescapeHTML})), delim='\n\n') def _report_errors(self, video): playability_errors = traverse_obj(video, ('playabilityErrors', ...))
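The Sen extractor above selects API nodes by predicate inside a traverse_obj path and collapses the matching branch to a single value with any. A compact illustration of the idiom (sample data hypothetical):

from yt_dlp.utils.traversal import traverse_obj

nodes = [{'id': 'details'}, {'id': 'player', 'video': {'url': 'https://v.example/m.m3u8'}}]
m3u8_url = traverse_obj(nodes, (lambda _, v: v['id'] == 'player', 'video', 'url', any))
assert m3u8_url == 'https://v.example/m.m3u8'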
diff --git a/yt_dlp/extractor/snapchat.py b/yt_dlp/extractor/snapchat.py new file mode 100644 index 0000000000..732677c190 --- /dev/null +++ b/yt_dlp/extractor/snapchat.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class SnapchatSpotlightIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snapchat\.com/spotlight/(?P<id>\w+)' + + _TESTS = [{ + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'md5': '46c580f63592d0cbb76e974d2f9f0fcc', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYYWtidGhudGZpAX1TKn0JAX1TKnXJAAAAAA', + 'ext': 'mp4', + 'title': 'Views 💕', + 'description': '', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/d/kKJHIR1QAznRKK9jgYYDq\.256\.IRZXSOY', + 'duration': 4.665, + 'timestamp': 1637777831.369, + 'upload_date': '20211124', + 'repost_count': int, + 'uploader': 'shreypatel57', + 'uploader_url': 'https://www.snapchat.com/add/shreypatel57', + }, + }, { + 'url': 'https://www.snapchat.com/spotlight/W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'md5': '4cd9626458c1a0e3e6dbe72c544a9ec2', + 'info_dict': { + 'id': 'W7_EDlXWTBiXAEEniNoMPwAAYcnVjYWdwcGV1AZEaIYn5AZEaIYnrAAAAAQ', + 'ext': 'mp4', + 'title': 'Spotlight Snap', + 'description': 'How he flirt her teacher🤭🤭🤩😍 #kdrama#cdrama #dramaclips #dramaspotlight', + 'thumbnail': r're:https://cf-st\.sc-cdn\.net/i/ztfr6xFs0FOcFhwVczWfj\.256\.IRZXSOY', + 'duration': 10.91, + 'timestamp': 1722720291.307, + 'upload_date': '20240803', + 'view_count': int, + 'repost_count': int, + 'uploader': 'ganda0535', + 'uploader_url': 'https://www.snapchat.com/add/ganda0535', + 'tags': ['#dramaspotlight', '#dramaclips', '#cdrama', '#kdrama'], + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + page_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + video_data = traverse_obj(page_props, ( + 'spotlightFeed', 'spotlightStories', + lambda _, v: v['story']['storyId']['value'] == video_id, 'metadata', any), None) + + return { + 'id': video_id, + 'ext': 'mp4', + **traverse_obj(video_data, ('videoMetadata', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}), + 'view_count': ('viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('shareCount', {int_or_none}), + 'url': ('contentUrl', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'duration': ('durationMs', {lambda x: float_or_none(x, 1000)}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'uploader': ('creator', 'personCreator', 'username', {str}), + 'uploader_url': ('creator', 'personCreator', 'url', {url_or_none}), + })), + **traverse_obj(video_data, { + 'description': ('description', {str}), + 'tags': ('hashtags', ..., {str}), + 'view_count': ('engagementStats', 'viewCount', {int_or_none}, {lambda x: None if x == -1 else x}), + 'repost_count': ('engagementStats', 'shareCount', {int_or_none}), + }), + }
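The Snapchat extractor above leans on two traversal conveniences also used elsewhere in this patch: float_or_none's scale argument for millisecond timestamps, and dict unpacking of traverse_obj results, which omits keys it cannot extract instead of inserting None. A minimal check (values hypothetical):

from yt_dlp.utils import float_or_none
from yt_dlp.utils.traversal import traverse_obj

metadata = {'uploadDateMs': 1637777831369}
info = {'id': 'example', **traverse_obj(metadata, {
    'timestamp': ('uploadDateMs', {lambda x: float_or_none(x, 1000)}),
    'view_count': ('viewCount', {int}),  # key is absent, so it is simply omitted
})}
assert info == {'id': 'example', 'timestamp': 1637777831.369}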
diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 38782abac7..b5df2e1a18 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -472,7 +472,7 @@ def _real_extract(self, url): title = self._og_search_title(webpage) urql_state = self._search_json( - r'window\.svt\.nyh\.urqlState\s*=', webpage, 'json data', display_id) + r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} diff --git a/yt_dlp/extractor/tele13.py b/yt_dlp/extractor/tele13.py index c5ca208fb4..0d721773ed 100644 --- a/yt_dlp/extractor/tele13.py +++ b/yt_dlp/extractor/tele13.py @@ -8,7 +8,7 @@ class Tele13IE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index d8c556acef..07db583470 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,33 +1,31 @@ -import base64 -import datetime as dt import functools import itertools from .common import InfoExtractor from ..networking import HEADRequest -from ..utils import int_or_none, traverse_obj, urlencode_postdata, urljoin +from ..utils import int_or_none, traverse_obj, url_or_none, urljoin class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ - 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'url': 'https://10play.com.au/neighbours/web-extras/season-41/heres-a-first-look-at-mischa-bartons-neighbours-debut/tpv230911hyxnz', 'info_dict': { - 'id': '6226844312001', + 'id': '6336940246112', 'ext': 'mp4', - 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', - 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', - 'duration': 186, - 'season': 'Season 39', - 'season_number': 39, + 'title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'alt_title': 'Here\'s A First Look At Mischa Barton\'s Neighbours Debut', + 'description': 'Neighbours Premieres Monday, September 18 At 4:30pm On 10 And 10 Play And 6:30pm On 10 Peach', + 'duration': 74, + 'season': 'Season 41', + 'season_number': 41, 'series': 'Neighbours', 'thumbnail': r're:https://.*\.jpg', 'uploader': 'Channel 10', 'age_limit': 15, - 'timestamp': 1611810000, - 'upload_date': '20210128', + 'timestamp': 1694386800, + 'upload_date': '20230910', 'uploader_id': '2199827728001', }, 'params': { @@ -35,21 +33,30 @@ class TenPlayIE(InfoExtractor): }, 'skip': 'Only available in Australia', }, { - 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', + 'url': 'https://10play.com.au/neighbours/episodes/season-42/episode-9107/tpv240902nzqyp', 'info_dict': { - 'id': '6192880312001', + 'id': '9000000000091177', 'ext': 'mp4', - 'title': "Todd Sampson's Body Hack - S4 Ep.
2", - 'description': 'md5:fa278820ad90f08ea187f9458316ac74', + 'title': 'Neighbours - S42 Ep. 9107', + 'alt_title': 'Thu 05 Sep', + 'description': 'md5:37a1f4271be34b9ee2b533426a5fbaef', + 'duration': 1388, + 'episode': 'Episode 9107', + 'episode_number': 9107, + 'season': 'Season 42', + 'season_number': 42, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', 'age_limit': 15, - 'timestamp': 1600770600, - 'upload_date': '20200922', + 'timestamp': 1725517860, + 'upload_date': '20240905', 'uploader': 'Channel 10', 'uploader_id': '2199827728001', }, 'params': { 'skip_download': True, }, + 'skip': 'Only available in Australia', }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, @@ -66,55 +73,42 @@ class TenPlayIE(InfoExtractor): 'X': 18, } - def _get_bearer_token(self, video_id): - username, password = self._get_login_info() - if username is None or password is None: - self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = dt.datetime.now().strftime('%Y%m%d000000') - _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') - data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ - 'X-Network-Ten-Auth': _auth_header, - }, data=urlencode_postdata({ - 'email': username, - 'password': password, - })) - return 'Bearer ' + data['jwt']['accessToken'] - def _real_extract(self, url): content_id = self._match_id(url) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) - headers = {} - if data.get('memberGated') is True: - _token = self._get_bearer_token(content_id) - headers = {'Authorization': _token} - - _video_url = self._download_json( - data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers=headers).get('source') - m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).url + video_data = self._download_json( + f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', + content_id, 'Downloading video JSON') + m3u8_url = self._request_webpage( + HEADRequest(video_data['items'][0]['HLSURL']), + content_id, 'Checking stream URL').url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) + # Attempt to get a higher quality stream + m3u8_url = m3u8_url.replace(',150,75,55,0000', ',300,150,75,55,0000') formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') return { + 'id': content_id, 'formats': formats, - 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, - 'id': data.get('altId') or content_id, - 'duration': data.get('duration'), - 'title': data.get('subtitle'), - 'alt_title': data.get('title'), - 'description': data.get('description'), - 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('tvShow'), - 'season_number': int_or_none(data.get('season')), - 'episode_number': int_or_none(data.get('episode')), - 'timestamp': data.get('published'), - 'thumbnail': data.get('imageUrl'), + 'subtitles': {'en': [{'url': data['captionUrl']}]} if url_or_none(data.get('captionUrl')) else None, 'uploader': 'Channel 10', 'uploader_id': '2199827728001', + **traverse_obj(data, { + 'id': ('altId', {str}), + 'duration': ('duration', {int_or_none}), + 'title': ('subtitle', {str}), + 'alt_title': ('title', {str}), + 'description': ('description', 
{str}), + 'age_limit': ('classification', {self._AUS_AGES.get}), + 'series': ('tvShow', {str}), + 'season_number': ('season', {int_or_none}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('published', {int_or_none}), + 'thumbnail': ('imageUrl', {url_or_none}), + }), } diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9d823a3154..f7e103fe9f 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -542,16 +542,12 @@ def _extract_web_formats(self, aweme_detail): **COMMON_FORMAT_INFO, 'format_id': 'download', 'url': self._proto_relative_url(download_url), + 'format_note': 'watermarked', + 'preference': -2, }) self._remove_duplicate_formats(formats) - for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']): - f.update({ - 'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '), - 'preference': f.get('preference') or -2, - }) - # Is it a slideshow with only audio for download? if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})): audio_url = aweme_detail['music']['playUrl'] @@ -565,7 +561,8 @@ def _extract_web_formats(self, aweme_detail): 'vcodec': 'none', }) - return formats + # Filter out broken formats, see https://github.com/yt-dlp/yt-dlp/issues/11034 + return [f for f in formats if urllib.parse.urlparse(f['url']).hostname != 'www.tiktok.com'] def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False): author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), { diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index c13832c6f5..a8865fe649 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -6,11 +6,12 @@ str_or_none, strip_or_none, traverse_obj, + update_url_query, ) class TVerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature|tokyo2020/video|olympic/paris2024/video)/)+(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature)/)+(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'skip': 'videos are only available for 7 days', 'url': 'https://tver.jp/episodes/ep83nf3w4p', 'info_dict': { 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', 'series': '家事ヤロウ!!!', 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', 'channel': 'テレビ朝日', + 'id': 'ep83nf3w4p', + 'ext': 'mp4', + 'onair_label': '5月3日(火)放送分', + 'ext_title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着! テレビ朝日
テレビ朝日 5月3日(火)放送分', }, 'add_ie': ['BrightcoveNew'], - }, { - 'url': 'https://tver.jp/olympic/paris2024/video/6359578055112/', - 'info_dict': { - 'id': '6359578055112', - 'ext': 'mp4', - 'title': '堀米雄斗 金メダルで五輪連覇!「みんなの応援が最後に乗れたカギ」', - 'timestamp': 1722279928, - 'upload_date': '20240729', - 'tags': ['20240729', 'japanese', 'japanmedal', 'paris'], - 'uploader_id': '4774017240001', - 'thumbnail': r're:https?://[^/?#]+boltdns\.net/[^?#]+/1920x1080/match/image\.jpg', - 'duration': 670.571, - }, - 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, }, { 'url': 'https://tver.jp/lp/f0033031', 'only_matching': True, + }, { + 'url': 'https://tver.jp/series/srtxft431v', + 'info_dict': { + 'id': 'srtxft431v', + 'title': '名探偵コナン', + }, + 'playlist': [ + { + 'md5': '779ffd97493ed59b0a6277ea726b389e', + 'info_dict': { + 'id': 'ref:conan-1137-241005', + 'ext': 'mp4', + 'title': '名探偵コナン #1137「行列店、味変の秘密」', + 'uploader_id': '5330942432001', + 'tags': [], + 'channel': '読売テレビ', + 'series': '名探偵コナン', + 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', + 'episode': '#1137「行列店、味変の秘密」', + 'duration': 1469.077, + 'timestamp': 1728030405, + 'upload_date': '20241004', + 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', + 'thumbnail': r're:https://.+\.jpg', + }, + }], + }, { + 'url': 'https://tver.jp/series/sru35hwdd2', + 'info_dict': { + 'id': 'sru35hwdd2', + 'title': '神回だけ見せます!', + }, + 'playlist_count': 11, + }, { + 'url': 'https://tver.jp/series/srkq2shp9d', + 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - _PLATFORM_UID = None - _PLATFORM_TOKEN = None + _HEADERS = {'x-tver-platform-type': 'web'} + _PLATFORM_QUERY = {} def _real_initialize(self): - create_response = self._download_json( - 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', None, - note='Creating session', data=b'device_type=pc', headers={ - 'Origin': 'https://s.tver.jp', - 'Referer': 'https://s.tver.jp/', - 'Content-Type': 'application/x-www-form-urlencoded', + session_info = self._download_json( + 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', + None, 'Creating session', data=b'device_type=pc') + self._PLATFORM_QUERY = traverse_obj(session_info, ('result', { + 'platform_uid': 'platform_uid', + 'platform_token': 'platform_token', + })) + + def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None): + return self._download_json( + f'https://platform-api.tver.jp/service/api/{path}', video_id, note, + fatal=fatal, headers=self._HEADERS, query={ + **self._PLATFORM_QUERY, + **(query or {}), }) - self._PLATFORM_UID = traverse_obj(create_response, ('result', 'platform_uid')) - self._PLATFORM_TOKEN = traverse_obj(create_response, ('result', 'platform_token')) + + def _yield_episode_ids_for_series(self, series_id): + seasons_info = self._download_json( + f'https://service-api.tver.jp/api/v1/callSeriesSeasons/{series_id}', + series_id, 'Downloading seasons info', headers=self._HEADERS) + for season_id in traverse_obj( + seasons_info, ('result', 'contents', lambda _, v: v['type'] == 'season', 'content', 'id', {str})): + episodes_info = self._call_platform_api( + f'v1/callSeasonEpisodes/{season_id}', series_id, f'Downloading season {season_id} episodes info') + yield from traverse_obj(episodes_info, ( + 'result', 'contents', lambda _, v: v['type'] == 'episode', 'content', 'id', {str})) def _real_extract(self, url): video_id, video_type = 
self._match_valid_url(url).group('id', 'type') - if video_type == 'olympic/paris2024/video': - # Player ID is taken from .content.brightcove.E200.pro.pc.account_id: - # https://tver.jp/olympic/paris2024/req/api/hook?q=https%3A%2F%2Folympic-assets.tver.jp%2Fweb-static%2Fjson%2Fconfig.json&d= - return self.url_result(smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % ('4774017240001', video_id), - {'geo_countries': ['JP']}), 'BrightcoveNew') + if video_type == 'series': + series_info = self._call_platform_api( + f'v2/callSeries/{video_id}', video_id, 'Downloading series info') + return self.playlist_from_matches( + self._yield_episode_ids_for_series(video_id), video_id, + traverse_obj(series_info, ('result', 'content', 'content', 'title', {str})), + ie=TVerIE, getter=lambda x: f'https://tver.jp/episodes/{x}') - elif video_type not in {'series', 'episodes'}: + if video_type != 'episodes': webpage = self._download_webpage(url, video_id, note='Resolving to new URL') video_id = self._match_id(self._search_regex( (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), webpage, 'url regex')) - episode_info = self._download_json( - f'https://platform-api.tver.jp/service/api/v1/callEpisode/{video_id}?require_data=mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', - video_id, fatal=False, - query={ - 'platform_uid': self._PLATFORM_UID, - 'platform_token': self._PLATFORM_TOKEN, - }, headers={ - 'x-tver-platform-type': 'web', + episode_info = self._call_platform_api( + f'v1/callEpisode/{video_id}', video_id, 'Downloading episode info', fatal=False, query={ + 'require_data': 'mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', }) episode_content = traverse_obj( episode_info, ('result', 'episode', 'content')) or {} + version = traverse_obj(episode_content, ('version', {str_or_none}), default='5') video_info = self._download_json( - f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, - query={ - 'v': str_or_none(episode_content.get('version')) or '5', - }, headers={ - 'Origin': 'https://tver.jp', - 'Referer': 'https://tver.jp/', - }) + f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', + query={'v': version}, headers={'Referer': 'https://tver.jp/'}) p_id = video_info['video']['accountID'] r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) if not r_id: @@ -110,6 +146,23 @@ def _real_extract(self, url): provider = str_or_none(episode_content.get('productionProviderName')) onair_label = str_or_none(episode_content.get('broadcastDateLabel')) + thumbnails = [ + { + 'id': quality, + 'url': update_url_query( + f'https://statics.tver.jp/images/content/thumbnail/episode/{quality}/{video_id}.jpg', + {'v': version}), + 'width': width, + 'height': height, + } + for quality, width, height in [ + ('small', 480, 270), + ('medium', 640, 360), + ('large', 960, 540), + ('xlarge', 1280, 720), + ] + ] + return { '_type': 'url_transparent', 'title': title, @@ -119,6 +172,7 @@ def _real_extract(self, url): 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': provider, 'description': str_or_none(video_info.get('description')), + 'thumbnails': thumbnails, 'url': smuggle_url( self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), 'ie_key': 'BrightcoveNew', diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 53b4084694..bf9c6348cb 100644 --- a/yt_dlp/extractor/twitcasting.py +++ 
b/yt_dlp/extractor/twitcasting.py @@ -270,7 +270,7 @@ def _real_extract(self, url): class TwitCastingUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<uploader_id>[^/?#]+)/(:?show|archive)/?(?:[#?]|$)' + _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<uploader_id>[^/?#]+)/(?:show|archive)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://twitcasting.tv/natsuiromatsuri/archive/', 'info_dict': { diff --git a/yt_dlp/extractor/vidflex.py b/yt_dlp/extractor/vidflex.py new file mode 100644 index 0000000000..ce0880b472 --- /dev/null +++ b/yt_dlp/extractor/vidflex.py @@ -0,0 +1,148 @@ +import base64 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + join_nonempty, + mimetype2ext, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidflexIE(InfoExtractor): + _DOMAINS_RE = [ + r'[^.]+\.vidflex\.tv', + r'(?:www\.)?acactv\.ca', + r'(?:www\.)?albertalacrossetv\.com', + r'(?:www\.)?cjfltv\.com', + r'(?:www\.)?figureitoutbaseball\.com', + r'(?:www\.)?ocaalive\.com', + r'(?:www\.)?pegasussports\.tv', + r'(?:www\.)?praxisseries\.ca', + r'(?:www\.)?silenticetv\.com', + r'(?:www\.)?tuffhedemantv\.com', + r'(?:www\.)?watchfuntv\.com', + r'live\.ofsaa\.on\.ca', + r'tv\.procoro\.ca', + r'tv\.realcastmedia\.net', + r'tv\.fringetheatre\.ca', + r'video\.haisla\.ca', + r'video\.hockeycanada\.ca', + r'video\.huuayaht\.org', + r'video\.turningpointensemble\.ca', + r'videos\.livingworks\.net', + r'videos\.telusworldofscienceedmonton\.ca', + r'watch\.binghamtonbulldogs\.com', + r'watch\.rekindle\.tv', + r'watch\.wpca\.com', + ] + _VALID_URL = rf'https?://(?:{"|".join(_DOMAINS_RE)})/[a-z]{{2}}(?:-[a-z]{{2}})?/c/[\w-]+\.(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.hockeycanada.ca/en/c/nwt-micd-up-with-jamie-lee-rattray.107486', + 'only_matching': True, + }, { + # m3u8 + https + 'url': 'https://video.hockeycanada.ca/en-us/c/nwt-micd-up-with-jamie-lee-rattray.107486', + 'info_dict': { + 'id': '107486', + 'title': 'NWT: Mic’d up with Jamie Lee Rattray', + 'ext': 'mp4', + 'duration': 115, + 'timestamp': 1634310409, + 'upload_date': '20211015', + 'tags': ['English', '2021', "National Women's Team"], + 'description': 'md5:efb1cf6165b48cc3f5555c4262dd5b23', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://video.hockeycanada.ca/en/c/mwc-remembering-the-wild-ride-in-riga.112307', + 'info_dict': { + 'id': '112307', + 'title': 'MWC: Remembering the wild ride in Riga', + 'ext': 'mp4', + 'duration': 322, + 'timestamp': 1716235607, + 'upload_date': '20240520', + 'tags': ['English', '2024', "National Men's Team", 'IIHF World Championship', 'Fan'], + 'description': r're:.+Canada’s National Men’s Team.+', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + # the same video in French + 'url': 'https://video.hockeycanada.ca/fr/c/cmm-retour-sur-un-parcours-endiable-a-riga.112304', + 'info_dict': { + 'id': '112304', + 'title': 'CMM : Retour sur un parcours endiablé à Riga', + 'ext': 'mp4', + 'duration': 322, + 'timestamp': 1716235545, + 'upload_date': '20240520', + 'tags': ['French', '2024', "National Men's Team", 'IIHF World Championship', 'Fan'], + 'description': 'md5:cf825222882a3dab1cd62cffcf3b4d1f', + 'thumbnail': r're:^https?://wpmedia01-a\.akamaihd\.net/en/asset/public/image/.+', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://myfbcgreenville.vidflex.tv/en/c/may-12th-2024.658', + 'only_matching': True, + }, { + 'url': 'https://www.figureitoutbaseball.com/en/c/fiob-podcast-14-dan-bertolini-ncaa-d1-head-coach-recorded-11-29-2018.1367', + 'only_matching': True, + }, { + 'url': 'https://videos.telusworldofscienceedmonton.ca/en/c/the-aurora-project-timelapse-4.577', + 'only_matching': True, + }, { + 'url': 'https://www.tuffhedemantv.com/en/c/2022-tuff-hedeman-tour-hobbs-nm-january-22.227', + 'only_matching': True, + }, { + 'url': 'https://www.albertalacrossetv.com/en/c/up-floor-ground-balls-one-more.3449', + 'only_matching': True, + }, { + 'url': 'https://www.silenticetv.com/en/c/jp-unlocked-day-in-the-life-of-langley-ha-15u.5197', + 'only_matching': True, + }, { + 'url': 'https://jphl.vidflex.tv/en/c/jp-unlocked-day-in-the-life-of-langley-ha-15u.5197', + 'only_matching': True, + }] +
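+ # A sketch of what the traversal in _real_extract below does with the embedded
+ # player config (hypothetical payload; the real blob comes from the content_api URL):
+ #   blob = {'config': base64.b64encode(json.dumps({'media': {...}}).encode()).decode()}
+ #   traverse_obj(blob, ('config', {base64.b64decode}, {bytes.decode}, {json.loads}, {dict}))
+ #   == {'media': {...}}
+ # Each single-item set applies its callable as a transform, and the final {dict}
+ # acts as an isinstance filter, so anything that did not decode to a dict is dropped.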
+ def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data_url = self._html_search_regex( + r'content_api:\s*(["\'])(?P<url>https?://(?:(?!\1).)+)\1', webpage, 'content api url', group='url') + media_config = traverse_obj( + self._download_json(data_url, video_id), + ('config', {base64.b64decode}, {bytes.decode}, {json.loads}, {dict})) + + return { + 'id': video_id, + 'formats': list(self._yield_formats(media_config, video_id)), + **self._search_json_ld( + webpage.replace('/**/', ''), video_id), + } + + def _yield_formats(self, media_config, video_id): + for media_source in traverse_obj(media_config, ('media', 'source', lambda _, v: url_or_none(v['src']))): + media_url = media_source['src'] + media_type = mimetype2ext(media_source.get('type')) + + if media_type == 'm3u8': + yield from self._extract_m3u8_formats(media_url, video_id, fatal=False, m3u8_id='hls') + elif media_type == 'mp4': + bitrate = self._search_regex(r'_(\d+)k\.mp4', media_url, 'bitrate', default=None) + yield { + 'format_id': join_nonempty('http', bitrate), + 'url': media_url, + 'ext': 'mp4', + 'tbr': int_or_none(bitrate), + } + else: + yield { + 'url': media_url, + 'ext': media_type, + } diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index a20cf4b17d..367d5e5835 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -21,6 +21,7 @@ parse_filesize, parse_iso8601, parse_qs, + qualities, smuggle_url, str_or_none, traverse_obj, @@ -146,6 +147,8 @@ def _parse_config(self, config, video_id): }) # TODO: fix handling of 308 status code returned for live archive manifest requests + QUALITIES = ('low', 'medium', 'high') + quality = qualities(QUALITIES) sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): @@ -166,6 +169,11 @@ def _parse_config(self, config, video_id): m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id, note=f'Downloading {cdn_name} m3u8 information', fatal=False) + # m3u8 doesn't give audio bitrates; need to prioritize based on GROUP-ID + # See: https://github.com/yt-dlp/yt-dlp/issues/10854 + for f in fmts: + if mobj := re.search(rf'audio-({"|".join(QUALITIES)})', f['format_id']): + f['quality'] = quality(mobj.group(1)) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': @@ -234,13 +242,30 @@ def _parse_config(self, config, video_id): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _extract_original_format(self, url, video_id, unlisted_hash=None): + def _call_videos_api(self, 
video_id, jwt_token, unlisted_hash=None, **kwargs): + return self._download_json( + join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', + 'Accept': 'application/json', + }, query={ + 'fields': ','.join(( + 'config_url', 'created_time', 'description', 'download', 'license', + 'metadata.connections.comments.total', 'metadata.connections.likes.total', + 'release_time', 'stats.plays')), + }, **kwargs) + + def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + # Original/source formats are only available when logged in + if not self._get_cookies('https://vimeo.com/').get('vimeo'): + return + query = {'action': 'load_download_config'} if unlisted_hash: query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}, + url, video_id, 'Loading download config JSON', fatal=False, + query=query, headers={'X-Requested-With': 'XMLHttpRequest'}, expected_status=(403, 404)) or {} source_file = download_data.get('source_file') download_url = try_get(source_file, lambda x: x['download_url']) @@ -261,15 +286,13 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None): 'quality': 1, } - jwt_response = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} - if not jwt_response.get('jwt'): + jwt = jwt or traverse_obj(self._download_json( + 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) + if not jwt: return - headers = {'Authorization': 'jwt {}'.format(jwt_response['jwt']), 'Accept': 'application/json'} - original_response = self._download_json( - f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False, expected_status=(403, 404)) or {} - for download_data in original_response.get('download') or []: + original_response = api_data or self._call_videos_api( + video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue @@ -354,7 +377,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available', }, { - 'url': 'http://player.vimeo.com/video/54469442', + 'url': 'https://player.vimeo.com/video/54469442', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { @@ -370,6 +393,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/68375962', @@ -379,22 +403,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, + 'timestamp': 1371214555, 'upload_date': '20130614', + 'release_timestamp': 1371214555, + 'release_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, 'comment_count': int, 
'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/channels/keypeele/75629013', @@ -418,29 +443,38 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, }, 'params': {'format': 'http-1080p'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/76979871', 'note': 'Video with subtitles', 'info_dict': { 'id': '76979871', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', - 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', - 'timestamp': 1381846109, + 'description': str, # FIXME: Dynamic SEO spam description + 'timestamp': 1381860509, 'upload_date': '20131015', + 'release_timestamp': 1381860509, + 'release_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', - 'uploader': 'Vimeo Staff', + 'uploader': 'Vimeo', 'duration': 62, + 'comment_count': int, + 'like_count': int, + 'thumbnail': 'https://i.vimeocdn.com/video/452001751-8216e0571c251a09d7a8387550942d89f7f86f6398f8ed886e639b0dd50d3c90-d_1280', 'subtitles': { - 'de': [{'ext': 'vtt'}], - 'en': [{'ext': 'vtt'}], - 'es': [{'ext': 'vtt'}], - 'fr': [{'ext': 'vtt'}], + 'de': 'count:3', + 'en': 'count:3', + 'es': 'count:3', + 'fr': 'count:3', }, }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': [ + 'Ignoring subtitle tracks found in the HLS manifest', + 'Failed to parse XML: not well-formed', + ], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -456,11 +490,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 118, 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - # contains original format + # contains Original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + # 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -476,15 +511,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280', 'like_count': int, + 'tags': 'count:11', }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { - 'note': 'Contains original format not accessible in webpage', + 'note': 'Contains source format not accessible in webpage', 'url': 'https://vimeo.com/393756517', - 'md5': 'c464af248b592190a5ffbb5d33f382b0', + # 'md5': 'c464af248b592190a5ffbb5d33f382b0', 'info_dict': { 'id': '393756517', - 'ext': 'mov', + # 'ext': 'mov', + 'ext': 'mp4', 'timestamp': 1582642091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', @@ -495,6 +534,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280', 'uploader_url': 'https://vimeo.com/frameworkla', }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -511,16 +552,18 @@ 
class VimeoIE(VimeoBaseInfoExtractor): 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', - 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + 'description': str, # FIXME: Dynamic SEO spam description 'duration': 321, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280', 'like_count': int, + 'tags': ['[the shining', 'vimeohq', 'cv', 'vimeo tribute]'], }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # redirects to ondemand extractor and should be passed through it @@ -543,28 +586,23 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'this page is no longer available.', }, { - 'url': 'http://player.vimeo.com/video/68375962', + 'url': 'https://player.vimeo.com/video/68375962', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'info_dict': { 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', - 'timestamp': 1371200155, - 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', - 'view_count': int, - 'comment_count': int, - 'like_count': int, }, 'params': { 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', @@ -592,7 +630,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc", 'uploader': 'Philipp Hagemeister', 'uploader_id': 'user20132939', - 'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b', + 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, 'thumbnail': 'https://i.vimeocdn.com/video/default_1280', @@ -606,6 +644,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'format': 'best[protocol=https]', 'videopassword': 'youtube-dl', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # source file returns 403: Forbidden @@ -633,11 +672,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'release_date': '20160329', }, 'params': {'skip_download': True}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/138909882', 'info_dict': { 'id': '138909882', + # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', 'description': 'md5:5967e090768a831488f6e74b7821b3c1', @@ -645,11 +686,19 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Firework Champions', 'upload_date': '20150910', 'timestamp': 1441901895, + 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d_1280', + 'uploader_url': 'https://vimeo.com/fireworkchampions', + 'tags': 'count:6', + 'duration': 229, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, - 'format': 'Original', + # 'format': 'source', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/channels/staffpicks/143603739', @@ -670,8 +719,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/karimhd', 
'channel_url': 'https://vimeo.com/channels/staffpicks', + 'tags': 'count:6', }, 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires passing unlisted_hash(a52724358e) to load_download_config request @@ -701,6 +752,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308 @@ -735,6 +787,48 @@ class VimeoIE(VimeoBaseInfoExtractor): }, 'expected_warnings': ['Failed to parse XML: not well-formed'], }, + { + # vimeo.com URL with unlisted hash and Original format + 'url': 'https://vimeo.com/144579403/ec02229140', + # 'md5': '6b662c2884e0373183fbde2a0d15cb78', + 'info_dict': { + 'id': '144579403', + 'ext': 'mp4', + 'title': 'SALESMANSHIP', + 'description': 'md5:4338302f347a1ff8841b4a3aecaa09f0', + 'uploader': 'Off the Picture Pictures', + 'uploader_id': 'offthepicturepictures', + 'uploader_url': 'https://vimeo.com/offthepicturepictures', + 'duration': 669, + 'upload_date': '20151104', + 'timestamp': 1446607180, + 'release_date': '20151104', + 'release_timestamp': 1446607180, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1018638656-[\da-f]+-d_1280', + }, + # 'params': {'format': 'Original'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, + { + # player.vimeo.com URL with source format + 'url': 'https://player.vimeo.com/video/859028877', + # 'md5': '19ca3d2463441dee2d2f0671ac2916a2', + 'info_dict': { + 'id': '859028877', + 'ext': 'mp4', + 'title': 'Ariana Grande - Honeymoon Avenue (Live from London)', + 'uploader': 'Raja Virdi', + 'uploader_id': 'rajavirdi', + 'uploader_url': 'https://vimeo.com/rajavirdi', + 'duration': 309, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d_1280', + }, + # 'params': {'format': 'source'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { # user playlist alias -> https://vimeo.com/258705797 'url': 'https://vimeo.com/user26785108/newspiritualguide', @@ -768,16 +862,6 @@ def _verify_player_video_password(self, url, video_id, headers): raise ExtractorError('Wrong video password', expected=True) return checked - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None): - return self._download_json( - join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), - video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - def _extract_from_api(self, video_id, unlisted_hash=None): viewer = self._download_json( 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') @@ -798,6 +882,11 @@ def _extract_from_api(self, video_id, unlisted_hash=None): info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + if source_format: + info['formats'].append(source_format) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) info.update({ 'description': video.get('description'), @@ -899,7 +988,12 @@ def _real_extract(self, url): if config.get('view') == 4: 
config = self._verify_player_video_password( redirect_url, video_id, headers) - return self._parse_config(config, video_id) + info = self._parse_config(config, video_id) + source_format = self._extract_original_format( + f'https://vimeo.com/{video_id}', video_id, unlisted_hash) + if source_format: + info['formats'].append(source_format) + return info vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: @@ -1269,6 +1363,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): IE_DESC = 'Review pages on vimeo' _VALID_URL = r'https?://vimeo\.com/(?P<user>[^/?#]+)/review/(?P<id>\d+)/(?P<hash>[\da-f]{10})' _TESTS = [{ + 'url': 'https://vimeo.com/user170863801/review/996447483/a316d6ed8d', + 'info_dict': { + 'id': '996447483', + 'ext': 'mp4', + 'title': 'Rodeo day 1-_2', + 'uploader': 'BROADKAST', + 'uploader_id': 'user170863801', + 'uploader_url': 'https://vimeo.com/user170863801', + 'duration': 30, + 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML'], + }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', 'info_dict': { @@ -1282,6 +1390,7 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280', 'uploader_url': 'https://vimeo.com/user21297594', }, + 'skip': '404 Not Found', }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', @@ -1316,6 +1425,7 @@ def _real_extract(self, url): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) + viewer = {} if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( @@ -1327,8 +1437,8 @@ def _real_extract(self, url): config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id, - unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1))) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', + video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index 01e59352bf..f4ed96bf62 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -90,7 +90,7 @@ def _real_extract(self, url): formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') for key, value in video_data.items(): - mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) + mobj = re.match(r'subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) if not mobj: continue subtitles.setdefault(mobj.group('lang'), []).append({ diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index c94ca9db97..6f1a8b95d8 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -27,8 +27,9 @@ class WeverseBaseIE(InfoExtractor): _NETRC_MACHINE = 'weverse' - _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _ACCOUNT_API_BASE = 
'https://accountapi.weverse.io/web/api' _API_HEADERS = { + 'Accept': 'application/json', 'Referer': 'https://weverse.io/', 'WEV-device-Id': str(uuid.uuid4()), } @@ -39,14 +40,14 @@ def _perform_login(self, username, password): headers = { 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', - 'x-acc-app-version': '2.2.6', + 'x-acc-app-version': '3.3.6', 'x-acc-language': 'en', 'x-acc-service-id': 'weverse', 'x-acc-trace-id': str(uuid.uuid4()), 'x-clog-user-device-id': str(uuid.uuid4()), } valid_username = traverse_obj(self._download_json( - f'{self._ACCOUNT_API_BASE}/signup/email/status', None, note='Checking username', + f'{self._ACCOUNT_API_BASE}/v2/signup/email/status', None, note='Checking username', query={'email': username}, headers=headers, expected_status=(400, 404)), 'hasPassword') if not valid_username: raise ExtractorError('Invalid username provided', expected=True) @@ -54,8 +55,9 @@ def _perform_login(self, username, password): headers['content-type'] = 'application/json' try: auth = self._download_json( - f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + f'{self._ACCOUNT_API_BASE}/v3/auth/token/by-credentials', None, data=json.dumps({ 'email': username, + 'otpSessionId': 'BY_PASS', 'password': password, }, separators=(',', ':')).encode(), headers=headers, note='Logging in') except ExtractorError as e: @@ -78,8 +80,10 @@ def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: key = b'1b9cb6378d959b45714bec49971ade22e6e24e42' api_path = update_url_query(ep, { + # 'gcc': 'US', 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', 'language': 'en', + 'os': 'WEB', 'platform': 'WEB', 'wpf': 'pc', }) @@ -152,7 +156,7 @@ def _parse_post_meta(self, metadata): 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), 'uploader': ('author', 'profileName', {str}), 'uploader_id': ('author', 'memberId', {str}), - 'creator': ('community', 'communityName', {str}), + 'creators': ('community', 'communityName', {str}, all), 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), 'duration': ('extension', 'video', 'playTime', {float_or_none}), 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), @@ -196,7 +200,7 @@ class WeverseIE(WeverseBaseIE): 'channel': 'billlie', 'channel_id': '72', 'channel_url': 'https://weverse.io/billlie', - 'creator': 'Billlie', + 'creators': ['Billlie'], 'timestamp': 1666262062, 'upload_date': '20221020', 'release_timestamp': 1666262058, @@ -222,7 +226,7 @@ class WeverseIE(WeverseBaseIE): 'channel': 'lesserafim', 'channel_id': '47', 'channel_url': 'https://weverse.io/lesserafim', - 'creator': 'LE SSERAFIM', + 'creators': ['LE SSERAFIM'], 'timestamp': 1659353400, 'upload_date': '20220801', 'release_timestamp': 1659353400, @@ -286,7 +290,7 @@ def _real_extract(self, url): elif live_status == 'is_live': video_info = self._call_api( - f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + f'/video/v1.2/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', video_id, note='Downloading live JSON') playback = self._parse_json(video_info['lipPlayback'], video_id) m3u8_url = traverse_obj(playback, ( @@ -302,7 +306,7 @@ def _real_extract(self, url): else: infra_video_id = post['extension']['video']['infraVideoId'] in_key = self._call_api( - f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id, + 
f'/video/v1.1/vod/{api_video_id}/inKey?preview=false', video_id, data=b'{}', note='Downloading VOD API key')['inKey'] video_info = self._download_json( @@ -347,7 +351,6 @@ class WeverseMediaIE(WeverseBaseIE): _VALID_URL = r'https?://(?:www\.|m\.)?weverse\.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)' _TESTS = [{ 'url': 'https://weverse.io/billlie/media/4-116372884', - 'md5': '8efc9cfd61b2f25209eb1a5326314d28', 'info_dict': { 'id': 'e-C9wLSQs6o', 'ext': 'mp4', @@ -358,8 +361,9 @@ class WeverseMediaIE(WeverseBaseIE): 'channel_url': 'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg', 'uploader': 'Billlie', 'uploader_id': '@Billlie', - 'uploader_url': 'http://www.youtube.com/@Billlie', + 'uploader_url': 'https://www.youtube.com/@Billlie', 'upload_date': '20230403', + 'timestamp': 1680533992, 'duration': 211, 'age_limit': 0, 'playable_in_embed': True, @@ -372,6 +376,8 @@ class WeverseMediaIE(WeverseBaseIE): 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg', 'categories': ['Entertainment'], 'tags': 'count:7', + 'channel_is_verified': True, + 'heatmap': 'count:100', }, }, { 'url': 'https://weverse.io/billlie/media/3-102914520', @@ -386,7 +392,7 @@ class WeverseMediaIE(WeverseBaseIE): 'channel': 'billlie', 'channel_id': '72', 'channel_url': 'https://weverse.io/billlie', - 'creator': 'Billlie', + 'creators': ['Billlie'], 'timestamp': 1662174000, 'upload_date': '20220903', 'release_timestamp': 1662174000, @@ -432,7 +438,7 @@ class WeverseMomentIE(WeverseBaseIE): 'uploader_id': '66a07e164b56a696ee71c99315ffe27b', 'channel': 'secretnumber', 'channel_id': '56', - 'creator': 'SECRET NUMBER', + 'creators': ['SECRET NUMBER'], 'duration': 10, 'upload_date': '20230405', 'timestamp': 1680653968, @@ -441,7 +447,6 @@ class WeverseMomentIE(WeverseBaseIE): 'comment_count': int, 'availability': 'needs_auth', }, - 'skip': 'Moment has expired', }] def _real_extract(self, url): @@ -571,7 +576,7 @@ class WeverseLiveIE(WeverseBaseIE): 'channel': 'purplekiss', 'channel_id': '35', 'channel_url': 'https://weverse.io/purplekiss', - 'creator': 'PURPLE KISS', + 'creators': ['PURPLE KISS'], 'timestamp': 1680780892, 'upload_date': '20230406', 'release_timestamp': 1680780883, @@ -584,6 +589,31 @@ class WeverseLiveIE(WeverseBaseIE): 'live_status': 'is_live', }, 'skip': 'Livestream has ended', + }, { + 'url': 'https://weverse.io/lesserafim', + 'info_dict': { + 'id': '4-181521628', + 'ext': 'mp4', + 'title': r're:심심해서요', + 'description': '', + 'uploader': '채채🤎', + 'uploader_id': 'd49b8b06f3cc1d92d655b25ab27ac2e7', + 'channel': 'lesserafim', + 'channel_id': '47', + 'creators': ['LE SSERAFIM'], + 'channel_url': 'https://weverse.io/lesserafim', + 'timestamp': 1728570273, + 'upload_date': '20241010', + 'release_timestamp': 1728570264, + 'release_date': '20241010', + 'thumbnail': r're:https://phinf\.wevpstatic\.net/.+\.png', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', }, { 'url': 'https://weverse.io/billlie/', 'only_matching': True,
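# Context for the Weverse 'creator' -> 'creators' changes above: a trailing
# `all` in a traverse_obj path collects the matched value(s) into a list, which
# is what the plural field expects (a sketch with hypothetical post data):
#   post = {'community': {'communityName': 'Billlie'}}
#   traverse_obj(post, ('community', 'communityName', {str}, all))  ->  ['Billlie']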
diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index fb2a8648fd..df7ecb3cdc 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, parse_qs, @@ -25,16 +26,25 @@ class WistiaBaseIE(InfoExtractor): def _download_embed_config(self, config_type, config_id, referer): base_url = self._EMBED_BASE_URL + f'{config_type}/{config_id}' + video_password = self.get_param('videopassword') embed_config = self._download_json( base_url + '.json', config_id, headers={ 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. - }) + }, query=filter_dict({'password': video_password})) error = traverse_obj(embed_config, 'error') if error: raise ExtractorError( f'Error while getting the playlist: {error}', expected=True) + if traverse_obj(embed_config, ( + 'media', ('embed_options', 'embedOptions'), 'plugin', + 'passwordProtectedVideo', 'on', any)) == 'true': + if video_password: + raise ExtractorError('Invalid video password', expected=True) + raise ExtractorError( + 'This content is password-protected. Use the --video-password option', expected=True) + return embed_config def _get_real_ext(self, url): diff --git a/yt_dlp/extractor/ximalaya.py b/yt_dlp/extractor/ximalaya.py index e900a4ad9f..02bf6a7beb 100644 --- a/yt_dlp/extractor/ximalaya.py +++ b/yt_dlp/extractor/ximalaya.py @@ -1,7 +1,17 @@ +import base64 import math +import time from .common import InfoExtractor -from ..utils import InAdvancePagedList, str_or_none, traverse_obj, try_call +from .videa import VideaIE +from ..utils import ( + InAdvancePagedList, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, +) class XimalayaBaseIE(InfoExtractor): @@ -11,7 +21,7 @@ class XimalayaIE(XimalayaBaseIE): IE_NAME = 'ximalaya' IE_DESC = '喜马拉雅FM' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(:?(?P<uid>\d+)/)?sound/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?:(?P<uid>\d+)/)?sound/(?P<id>[0-9]+)' _TESTS = [ { 'url': 'http://www.ximalaya.com/sound/47740352/', @@ -71,23 +81,92 @@ class XimalayaIE(XimalayaBaseIE): 'like_count': int, }, }, + { + # VIP-restricted audio + 'url': 'https://www.ximalaya.com/sound/562111701', + 'only_matching': True, + }, ] + @staticmethod + def _decrypt_filename(file_id, seed): + cgstr = '' + key = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890' + for _ in key: + seed = float(int(211 * seed + 30031) % 65536) + r = int(seed / 65536 * len(key)) + cgstr += key[r] + key = key.replace(key[r], '') + parts = file_id.split('*') + filename = ''.join(cgstr[int(part)] for part in parts if part.isdecimal()) + if not filename.startswith('/'): + filename = '/' + filename + return filename + + @staticmethod + def _decrypt_url_params(encrypted_params): + params = VideaIE.rc4( + base64.b64decode(encrypted_params), 'xkt3a41psizxrh9l').split('-') + # sign, token, timestamp + return params[1], params[2], params[3] +
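+ # Rough shape of how the two helpers above are used further down in
+ # _real_extract (values here are hypothetical; the real fileId/seed/ep come
+ # from the mpay.ximalaya.com response):
+ #   filename = self._decrypt_filename('3*1*4', 1234.0)
+ #   sign, token, timestamp = self._decrypt_url_params('bG9yZW0=')
+ # _decrypt_filename steps a 16-bit LCG (seed -> (211 * seed + 30031) % 65536)
+ # to shuffle the charset without replacement, then picks the characters at the
+ # '*'-separated indices in file_id. _decrypt_url_params RC4-decrypts the
+ # base64 'ep' blob with a fixed key and splits the plaintext on '-', using
+ # elements 1-3 as sign, token and timestamp.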
def _real_extract(self, url): scheme = 'https' if url.startswith('https') else 'http' audio_id = self._match_id(url) - audio_info_file = f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json' audio_info = self._download_json( - audio_info_file, audio_id, - f'Downloading info json {audio_info_file}', 'Unable to download info file') + f'{scheme}://m.ximalaya.com/tracks/{audio_id}.json', audio_id, + 'Downloading info json', 'Unable to download info file') - formats = [{ + formats = [] + # NOTE: VIP-restricted audio + if audio_info.get('is_paid'): + ts = int(time.time()) + vip_info = self._download_json( + f'{scheme}://mpay.ximalaya.com/mobile/track/pay/{audio_id}/{ts}', + audio_id, 'Downloading VIP info json', 'Unable to download VIP info file', + query={'device': 'pc', 'isBackend': 'true', '_': ts}) + filename = self._decrypt_filename(vip_info['fileId'], vip_info['seed']) + sign, token, timestamp = self._decrypt_url_params(vip_info['ep']) + vip_url = update_url_query( + f'{vip_info["domain"]}/download/{vip_info["apiVersion"]}{filename}', { + 'sign': sign, + 'token': token, + 'timestamp': timestamp, + 'buy_key': vip_info['buyKey'], + 'duration': vip_info['duration'], + }) + fmt = { + 'format_id': 'vip', + 'url': vip_url, + 'vcodec': 'none', + } + if '_preview_' in vip_url: + self.report_warning( + f'This track requires a VIP account. Using a sample instead. {self._login_hint()}') + fmt.update({ + 'format_note': 'Sample', + 'preference': -10, + **traverse_obj(vip_info, { + 'filesize': ('sampleLength', {int_or_none}), + 'duration': ('sampleDuration', {int_or_none}), + }), + }) + else: + fmt.update(traverse_obj(vip_info, { + 'filesize': ('totalLength', {int_or_none}), + 'duration': ('duration', {int_or_none}), + })) + + fmt['abr'] = try_call(lambda: fmt['filesize'] * 8 / fmt['duration'] / 1024) + formats.append(fmt) + + formats.extend([{ 'format_id': f'{bps}k', 'url': audio_info[k], 'abr': bps, 'vcodec': 'none', - } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] + } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)]) thumbnails = [] for k in audio_info: diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 10849916b8..23ed9270da 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -3,16 +3,13 @@ int_or_none, str_or_none, try_get, - update_url_query, url_or_none, ) class XinpianchangIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.xinpianchang\.com/(?P<id>[^/]+?)(?:\D|$)' - IE_NAME = 'xinpianchang' - IE_DESC = 'xinpianchang.com' + _VALID_URL = r'https?://(www\.)?xinpianchang\.com/(?P<id>a\d+)' + IE_DESC = '新片场' _TESTS = [{ 'url': 'https://www.xinpianchang.com/a11766551', 'info_dict': { @@ -49,11 +46,11 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - domain = self.find_value_with_regex(var='requireNewDomain', webpage=webpage) - vid = self.find_value_with_regex(var='vid', webpage=webpage) - app_key = self.find_value_with_regex(var='modeServerAppKey', webpage=webpage) - api = update_url_query(f'{domain}/mod/api/v2/media/{vid}', {'appKey': app_key}) - data = self._download_json(api, video_id=video_id)['data'] + video_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] + + data = self._download_json( + f'https://mod-api.xinpianchang.com/mod/api/v2/media/{video_data["vid"]}', video_id, + query={'appKey': video_data['appKey']})['data'] formats, subtitles = [], {} for k, v in data.get('resource').items(): if k in ('dash', 'hls'): @@ -72,6 +69,10 @@ def _real_extract(self, url): 'width': int_or_none(prog.get('width')), 'height': int_or_none(prog.get('height')), 'ext': 'mp4', + 'http_headers': { + # NB: Server returns 403 without the Range header + 'Range': 'bytes=0-', + }, } for prog in v if prog.get('url') or []]) return { @@ -87,6 +88,3 @@ def _real_extract(self, url): 'formats': formats, 'subtitles': subtitles, } - - def find_value_with_regex(self, var, webpage): - return self._search_regex(rf'var\s{var}\s=\s\"(?P<value>[^\"]+)\"', webpage, name=var)
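# For context on the xinpianchang rewrite above: _search_nextjs_data grabs the
# JSON state that Next.js embeds in the page, roughly equivalent to this
# simplified sketch (not yt-dlp's actual implementation):
#   mobj = re.search(r'<script[^>]+id="__NEXT_DATA__"[^>]*>\s*({.+?})\s*</script>', webpage, re.DOTALL)
#   video_data = json.loads(mobj.group(1))['props']['pageProps']['detail']['video']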
diff --git a/yt_dlp/extractor/yle_areena.py b/yt_dlp/extractor/yle_areena.py index ef9e96804c..c0a218e2fc 100644 --- a/yt_dlp/extractor/yle_areena.py +++ b/yt_dlp/extractor/yle_areena.py @@ -10,7 +10,7 @@ class YleAreenaIE(InfoExtractor): - _VALID_URL = r'https?://areena\.yle\.fi/(?P<id>[\d-]+)' + _VALID_URL = r'https?://areena\.yle\.fi/(?P<podcast>podcastit/)?(?P<id>[\d-]+)' _GEO_COUNTRIES = ['FI'] _TESTS = [ { @@ -77,7 +77,7 @@ class YleAreenaIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, is_podcast = self._match_valid_url(url).group('id', 'podcast') info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) video_data = self._download_json( f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', @@ -103,8 +103,11 @@ def _real_extract(self, url): 'name': sub.get('kind'), }) - kaltura_id = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id'), expected_type=str) - if kaltura_id: + if is_podcast: + info_dict = { + 'url': video_data['data']['ongoing_ondemand']['media_url'], + } + elif kaltura_id := traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id', {str})): info_dict = { '_type': 'url_transparent', 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}), @@ -114,13 +117,11 @@ def _real_extract(self, url): formats, subs = self._extract_m3u8_formats_and_subtitles( video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls') self._merge_subtitles(subs, target=subtitles) - info_dict = { - 'id': video_id, - 'formats': formats, - } + info_dict = {'formats': formats} return { **info_dict, + 'id': video_id, 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) or episode or info.get('title')), 'description': description, diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 2501398ba1..6acc42fc0a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -69,6 +69,8 @@ ) STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' +STREAMING_DATA_PO_TOKEN = '__yt_dlp_po_token' + # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -79,6 +81,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats 'web_safari': { @@ -90,6 +93,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, }, 'web_embedded': { 'INNERTUBE_CONTEXT': { @@ -132,6 +136,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_music': { 'INNERTUBE_CONTEXT': { @@ -146,6 +151,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, 'android_creator': { 'INNERTUBE_CONTEXT': { @@ -160,6 +166,7 @@ }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, }, # YouTube Kids videos aren't returned on this client for some reason 'android_vr': { @@ -323,6 +330,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('REQUIRE_PO_TOKEN', False) ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') @@ -688,31 +696,46 @@ def _extract_identity_token(self, ytcfg=None, webpage=None): r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, 'identity token', default=None, fatal=False) - @staticmethod - def _extract_account_syncid(*args): + def _data_sync_id_to_delegated_session_id(self, data_sync_id): + if not 
data_sync_id: + return + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. We only want the channel_syncid + channel_syncid, _, user_syncid = data_sync_id.partition('||') + if user_syncid: + return channel_syncid + + def _extract_account_syncid(self, *args): """ - Extract syncId required to download private playlists of secondary channels + Extract current session ID required to download private playlists of secondary channels @params response and/or ytcfg """ - for data in args: - # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) - if delegated_sid: - return delegated_sid - sync_ids = (try_get( - data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), str) or '').split('||') - if len(sync_ids) >= 2 and sync_ids[1]: - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. We only want the channel_syncid - return sync_ids[0] + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid - @staticmethod - def _extract_visitor_data(*args): + data_sync_id = self._extract_data_sync_id(*args) + return self._data_sync_id_to_delegated_session_id(data_sync_id) + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. + In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): """ Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]: + return visitor_data return get_first( args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @@ -1334,12 +1357,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') - _POTOKEN_EXPERIMENTS = ('51217476', '51217102') - _BROKEN_CLIENTS = { - short_client_name(client): client - for client in ('android', 'android_creator', 'android_music') - } - _DEFAULT_CLIENTS = ('ios', 'web_creator') + _DEFAULT_CLIENTS = ('ios', 'mweb') _GEO_BYPASS = False @@ -3701,6 +3719,54 @@ def _generate_player_context(cls, sts=None): **cls._get_checkok_params(), } + def _get_config_po_token(self, client): + po_token_strs = self._configuration_arg('po_token', [], ie_key=YoutubeIE, casesense=True) + for token_str in po_token_strs: + po_token_client, sep, po_token = token_str.partition('+') + if not sep: + self.report_warning( + f'Invalid po_token configuration format. 
Expected "client+po_token", got "{token_str}"', only_once=True) + continue + if po_token_client == client: + return po_token + + def fetch_po_token(self, client='web', visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + # PO Token is bound to visitor_data / Visitor ID when logged out. Must have visitor_data for it to function. + if not visitor_data and not self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Visitor Data. ' + f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"') + return + + config_po_token = self._get_config_po_token(client) + if config_po_token: + # PO token is bound to data_sync_id / account Session ID when logged in. However, for the config po_token, + # if using first channel in an account then we don't need the data_sync_id anymore... + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Got a PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + + return config_po_token + + # Require PO Token if logged in for external fetching + if not data_sync_id and self.is_authenticated and player_url: + self.report_warning( + f'Unable to fetch PO Token for {client} client: Missing required Data Sync ID for account. ' + f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + return + + return self._fetch_po_token( + client=client, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + player_url=player_url, + **kwargs, + ) + + def _fetch_po_token(self, client, visitor_data=None, data_sync_id=None, player_url=None, **kwargs): + """External PO Token fetch stub""" + @staticmethod def _is_agegated(player_response): if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')): @@ -3717,13 +3783,17 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): - - session_index = self._extract_session_index(player_ytcfg, master_ytcfg) - syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) - sts = self._extract_signature_timestamp(video_id, player_url, master_ytcfg, fatal=False) if player_url else None + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, visitor_data, data_sync_id, po_token): headers = self.generate_api_headers( - ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) + ytcfg=player_ytcfg, + default_client=client, + visitor_data=visitor_data, + session_index=self._extract_session_index(master_ytcfg, player_ytcfg), + account_syncid=( + self._data_sync_id_to_delegated_session_id(data_sync_id) + or self._extract_account_syncid(master_ytcfg, initial_pr, player_ytcfg) + ), + ) yt_query = { 'videoId': video_id, @@ -3734,6 +3804,10 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: yt_query['params'] = player_params + if po_token: + yt_query['serviceIntegrityDimensions'] = {'poToken': po_token} + + sts = self._extract_signature_timestamp(video_id, player_url, 
master_ytcfg, fatal=False) if player_url else None yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -3744,7 +3818,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - broken_clients = [] excluded_clients = [] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), @@ -3758,12 +3831,8 @@ def _get_requested_clients(self, url, smuggled_data): excluded_clients.append(client[1:]) elif client not in allowed_clients: self.report_warning(f'Skipping unsupported client "{client}"') - elif client in self._BROKEN_CLIENTS.values(): - broken_clients.append(client) else: requested_clients.append(client) - # Force deprioritization of _BROKEN_CLIENTS for format de-duplication - requested_clients.extend(broken_clients) if not requested_clients: requested_clients.extend(self._DEFAULT_CLIENTS) for excluded_client in excluded_clients: @@ -3788,19 +3857,14 @@ def _invalid_player_response(self, pr, video_id): return pr_id def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): - initial_pr = ignore_initial_response = None + initial_pr = None if webpage: - if 'web' in clients: - experiments = traverse_obj(master_ytcfg, ( - 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentIds', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - self.report_warning( - 'Webpage contains broken formats (poToken experiment detected). Ignoring initial player response') - ignore_initial_response = True initial_pr = self._search_json( self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) prs = [] + deprioritized_prs = [] + if initial_pr and not self._invalid_player_response(initial_pr, video_id): # Android player_response does not have microFormats which are needed for # extraction of some data. 
So we return the initial_pr with formats @@ -3822,14 +3886,13 @@ def append_client(*client_names): return tried_iframe_fallback = False - player_url = None + player_url = visitor_data = data_sync_id = None skipped_clients = {} while clients: + deprioritize_pr = False client, base_client, variant = _split_innertube_client(clients.pop()) - player_ytcfg = {} - if client == 'web': - player_ytcfg = self._get_default_ytcfg() if ignore_initial_response else master_ytcfg - elif 'configs' not in self._configuration_arg('player_skip'): + player_ytcfg = master_ytcfg if client == 'web' else {} + if 'configs' not in self._configuration_arg('player_skip') and client != 'web': player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) @@ -3842,34 +3905,53 @@ def append_client(*client_names): player_url = self._download_player_url(video_id) tried_iframe_fallback = True - pr = initial_pr if client == 'web' and not ignore_initial_response else None - for retry in self.RetryManager(fatal=False): - try: - pr = pr or self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, - player_url if require_js_player else None, initial_pr, smuggled_data) - except ExtractorError as e: - self.report_warning(e) - break - experiments = traverse_obj(pr, ( - 'responseContext', 'serviceTrackingParams', lambda _, v: v['service'] == 'GFEEDBACK', - 'params', lambda _, v: v['key'] == 'e', 'value', {lambda x: x.split(',')}, ...)) - if all(x in experiments for x in self._POTOKEN_EXPERIMENTS): - pr = None - retry.error = ExtractorError('API returned broken formats (poToken experiment detected)', expected=True) - if not pr: + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) + data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) + po_token = self.fetch_po_token( + client=client, visitor_data=visitor_data, + data_sync_id=data_sync_id if self.is_authenticated else None, + player_url=player_url if require_js_player else None, + ) + + require_po_token = self._get_default_ytcfg(client).get('REQUIRE_PO_TOKEN') + if not po_token and require_po_token: + self.report_warning( + f'No PO Token provided for {client} client, ' + f'which is required for working {client} formats. 
' + f'You can manually pass a PO Token for this client with ' + f'--extractor-args "youtube:po_token={client}+XXX"', + only_once=True) + deprioritize_pr = True + + pr = initial_pr if client == 'web' else None + try: + pr = pr or self._extract_player_response( + client, video_id, + master_ytcfg=player_ytcfg or master_ytcfg, + player_ytcfg=player_ytcfg, + player_url=player_url, + initial_pr=initial_pr, + visitor_data=visitor_data, + data_sync_id=data_sync_id, + po_token=po_token) + except ExtractorError as e: + self.report_warning(e) continue if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: # Save client name for introspection later - name = short_client_name(client) sd = traverse_obj(pr, ('streamingData', {dict})) or {} - sd[STREAMING_DATA_CLIENT_NAME] = name + sd[STREAMING_DATA_CLIENT_NAME] = client + sd[STREAMING_DATA_PO_TOKEN] = po_token for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): - f[STREAMING_DATA_CLIENT_NAME] = name - prs.append(pr) + f[STREAMING_DATA_CLIENT_NAME] = client + f[STREAMING_DATA_PO_TOKEN] = po_token + if deprioritize_pr: + deprioritized_prs.append(pr) + else: + prs.append(pr) # tv_embedded can work around age-gate and age-verification IF the video is embeddable if self._is_agegated(pr) and variant != 'tv_embedded': @@ -3893,6 +3975,8 @@ def append_client(*client_names): # _producer, _testsuite, & _vr variants can also work around age-verification append_client('web_creator', 'mediaconnect') + prs.extend(deprioritized_prs) + if skipped_clients: self.report_warning( f'Skipping player responses from {"/".join(skipped_clients)} clients ' @@ -4026,14 +4110,18 @@ def build_fragments(f): self.report_warning( f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) - client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) - # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds - # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 - is_broken = client_name in self._BROKEN_CLIENTS + client_name = fmt[STREAMING_DATA_CLIENT_NAME] + po_token = fmt.get(STREAMING_DATA_PO_TOKEN) + + if po_token: + fmt_url = update_url_query(fmt_url, {'pot': po_token}) + + # Clients that require PO Token return videoplayback URLs that may return 403 + is_broken = (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) if is_broken: self.report_warning( - f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' - 'and may yield HTTP Error 403. They will be deprioritized', only_once=True) + f'{video_id}: {client_name} client formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 @@ -4047,7 +4135,7 @@ def build_fragments(f): try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), is_damaged and 'DAMAGED', is_broken and 'BROKEN', - (self.get_param('verbose') or all_formats) and client_name, + (self.get_param('verbose') or all_formats) and short_client_name(client_name), delim=', '), # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), @@ -4109,12 +4197,24 @@ def build_fragments(f): elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') - def process_manifest_format(f, proto, client_name, itag): + def process_manifest_format(f, proto, client_name, itag, po_token): key = (proto, f.get('language')) if not all_formats and key in itags[itag]: return False itags[itag].add(key) + if f.get('source_preference') is None: + f['source_preference'] = -1 + + # Clients that require PO Token return videoplayback URLs that may return 403 + # hls does not currently require PO Token + if (not po_token and self._get_default_ytcfg(client_name).get('REQUIRE_PO_TOKEN')) and proto != 'hls': + self.report_warning( + f'{video_id}: {client_name} client {proto} formats require a PO Token which was not provided. ' + 'They will be deprioritized as they may yield HTTP Error 403', only_once=True) + f['format_note'] = join_nonempty(f.get('format_note'), 'BROKEN', delim=' ') + f['source_preference'] -= 20 + if itag and all_formats: f['format_id'] = f'{itag}-{proto}' elif any(p != proto for p, _ in itags[itag]): @@ -4126,9 +4226,6 @@ def process_manifest_format(f, proto, client_name, itag): f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') f['language_preference'] = PREFERRED_LANG_VALUE - if f.get('source_preference') is None: - f['source_preference'] = -1 - if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') f['source_preference'] += 100 @@ -4137,7 +4234,8 @@ def process_manifest_format(f, proto, client_name, itag): if f['quality'] == -1 and f.get('height'): f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) if self.get_param('verbose') or all_formats: - f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') + f['format_note'] = join_nonempty( + f.get('format_note'), short_client_name(client_name), delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] @@ -4148,24 +4246,28 @@ def process_manifest_format(f, proto, client_name, itag): subtitles = {} for sd in streaming_data: - client_name = sd.get(STREAMING_DATA_CLIENT_NAME) - + client_name = sd[STREAMING_DATA_CLIENT_NAME] + po_token = sd.get(STREAMING_DATA_PO_TOKEN) hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: + if po_token: + hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None)): + r'/itag/(\d+)', f['url'], 'itag', default=None), po_token): yield f dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: + if po_token: + dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: - if process_manifest_format(f, 'dash', client_name, f['format_id']): + if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): 
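The warning text in the youtube.py hunks above points users at `--extractor-args "youtube:po_token=..."`. As a hedged sketch (not part of the diff), this is the equivalent through the Python API; `'web+XXX'` is a placeholder, not a working token:

```python
import yt_dlp

ydl_opts = {
    'extractor_args': {
        # CLI equivalent: --extractor-args "youtube:po_token=web+XXX"
        'youtube': {'po_token': ['web+XXX']},  # placeholder token value
    },
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # With a token supplied, web-client formats should no longer be
    # deprioritized by the REQUIRE_PO_TOKEN branch above
    info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
```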
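For illustration only, this is roughly how the two PO-Token code paths above rewrite URLs; the token and googlevideo URLs are made-up values:

```python
from yt_dlp.utils import update_url_query

po_token = 'EXAMPLE_POT'  # made-up token
fmt_url = 'https://rr1---sn-xyz.googlevideo.com/videoplayback?itag=18'

# Format URLs receive the token as a `pot` query parameter
print(update_url_query(fmt_url, {'pot': po_token}))
# -> .../videoplayback?itag=18&pot=EXAMPLE_POT

# Manifest URLs receive it as a trailing `/pot/<token>` path segment instead
hls_url = 'https://manifest.googlevideo.com/api/manifest/hls_variant/id/xyz/'
print(hls_url.rstrip('/') + f'/pot/{po_token}')
# -> .../hls_variant/id/xyz/pot/EXAMPLE_POT
```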
f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if needs_live_processing: @@ -4987,7 +5089,7 @@ def _playlist_entries(self, video_list_renderer): def _rich_entries(self, rich_grid_renderer): renderer = traverse_obj( rich_grid_renderer, - ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {} + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) @@ -4999,6 +5101,21 @@ def _rich_entries(self, rich_grid_renderer): ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=self._get_text(renderer, 'title')) return + # shortsLockupViewModel extraction + entity_id = renderer.get('entityId') + if entity_id: + video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) + if not video_id: + return + yield self.url_result( + f'https://www.youtube.com/shorts/{video_id}', + ie=YoutubeIE, video_id=video_id, + **traverse_obj(renderer, ('overlayMetadata', { + 'title': ('primaryText', 'content', {str}), + 'view_count': ('secondaryText', 'content', {parse_count}), + })), + thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) + return def _video_entry(self, video_renderer): video_id = video_renderer.get('videoId') @@ -7538,6 +7655,8 @@ def _real_extract(self, url): 'id': clip_id, 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, + '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility + 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang'), } diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index e8a67b7347..0643348e7e 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -31,9 +31,9 @@ curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3])) -if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 8, 0)): +if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)): curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' - raise ImportError('Only curl_cffi versions 0.5.10, 0.7.X are supported') + raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported') import curl_cffi.requests from curl_cffi.const import CurlECode, CurlOpt diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index fe3354ea29..b86d3606d8 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -10,7 +10,7 @@ import urllib.parse import urllib.request -from .exceptions import RequestError, UnsupportedRequest +from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket from ..utils import format_field, traverse_obj @@ -206,7 +206,7 @@ def wrap_request_errors(func): def wrapper(self, *args, **kwargs): try: return func(self, *args, **kwargs) - except UnsupportedRequest as e: + except RequestError as e: if e.handler is None: e.handler = self raise diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index 21b765b91d..ec55567dae 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -33,8 +33,8 @@ import websockets.version websockets_version = 
tuple(map(int_or_none, websockets.version.version.split('.'))) -if websockets_version < (12, 0): - raise ImportError('Only websockets>=12.0 is supported') +if websockets_version < (13, 0): + raise ImportError('Only websockets>=13.0 is supported') import websockets.sync.client from websockets.uri import parse_uri diff --git a/yt_dlp/options.py b/yt_dlp/options.py index ffe2463fe2..9980b7fc3f 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -647,16 +647,16 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, ' - 'the filter matches if at least one of the conditions is met. E.g. --match-filter ' - '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' + 'the filter matches if at least one of the conditions is met. E.g. --match-filters ' + '!is_live --match-filters "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' 'that contains the phrase "cats & dogs" (caseless). ' - 'Use "--match-filter -" to interactively ask whether to download each video')) + 'Use "--match-filters -" to interactively ask whether to download each video')) selection.add_option( '--no-match-filters', dest='match_filter', action='store_const', const=None, - help='Do not use any --match-filter (default)') + help='Do not use any --match-filters (default)') selection.add_option( '--break-match-filters', metavar='FILTER', dest='breaking_match_filter', action='append', @@ -704,7 +704,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Alters --max-downloads, --break-on-existing, --break-match-filter, and autonumber to reset per input URL') + help='Alters --max-downloads, --break-on-existing, --break-match-filters, and autonumber to reset per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', @@ -1725,15 +1725,17 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '--convert-subs', '--convert-sub', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, help=( - 'Convert the subtitles to another format (currently supported: {}) ' - '(Alias: --convert-subtitles)'.format(', '.join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))))) + 'Convert the subtitles to another format ' + f'(currently supported: {", ".join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))}). ' + 'Use "--convert-subs none" to disable conversion (default) (Alias: --convert-subtitles)')) postproc.add_option( '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' f'(currently supported: {", ".join(sorted(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))}). ' - 'You can specify multiple rules using similar syntax as --remux-video')) + 'You can specify multiple rules using similar syntax as "--remux-video". 
' + 'Use "--convert-thumbnails none" to disable conversion (default)')) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 3cc879fd7e..d777d14e71 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -5,6 +5,7 @@ import importlib.util import inspect import itertools +import os import pkgutil import sys import traceback @@ -137,6 +138,8 @@ def load_module(module, module_name, suffix): def load_plugins(name, suffix): classes = {} + if os.environ.get('YTDLP_NO_PLUGINS'): + return classes for finder, module_name, _ in iter_modules(name): if any(x.startswith('_') for x in module_name.split('.')): diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 6cf9ab62ea..b3fc8b54a8 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -33,7 +33,7 @@ class SponsorBlockPP(FFmpegPostProcessor): def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): FFmpegPostProcessor.__init__(self, downloader) self._categories = tuple(categories or self.CATEGORIES.keys()) - self._API_URL = api if re.match('^https?://', api) else 'https://' + api + self._API_URL = api if re.match('https?://', api) else 'https://' + api def run(self, info): extractor = info['extractor_key'] diff --git a/yt_dlp/postprocessor/xattrpp.py b/yt_dlp/postprocessor/xattrpp.py index f822eff41c..166aabaf92 100644 --- a/yt_dlp/postprocessor/xattrpp.py +++ b/yt_dlp/postprocessor/xattrpp.py @@ -26,38 +26,40 @@ class XAttrMetadataPP(PostProcessor): XATTR_MAPPING = { 'user.xdg.referrer.url': 'webpage_url', - # 'user.xdg.comment': 'description', 'user.dublincore.title': 'title', 'user.dublincore.date': 'upload_date', - 'user.dublincore.description': 'description', 'user.dublincore.contributor': 'uploader', 'user.dublincore.format': 'format', + # We do this last because it may get us close to the xattr limits + # (e.g., 4kB on ext4), and we don't want to have the other ones fail + 'user.dublincore.description': 'description', + # 'user.xdg.comment': 'description', } def run(self, info): mtime = os.stat(info['filepath']).st_mtime self.to_screen('Writing metadata to file\'s xattrs') - try: - for xattrname, infoname in self.XATTR_MAPPING.items(): + for xattrname, infoname in self.XATTR_MAPPING.items(): + try: value = info.get(infoname) if value: if infoname == 'upload_date': value = hyphenate_date(value) write_xattr(info['filepath'], xattrname, value.encode()) - except XAttrUnavailableError as e: - raise PostProcessingError(str(e)) - except XAttrMetadataError as e: - if e.reason == 'NO_SPACE': - self.report_warning( - 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' - 'Some extended attributes are not written') - elif e.reason == 'VALUE_TOO_LONG': - self.report_warning('Unable to write extended attributes due to too long values.') - else: - tip = ('You need to use NTFS' if compat_os_name == 'nt' - else 'You may have to enable them in your "/etc/fstab"') - raise PostProcessingError(f'This filesystem doesn\'t support extended attributes. {tip}') + except XAttrUnavailableError as e: + raise PostProcessingError(str(e)) + except XAttrMetadataError as e: + if e.reason == 'NO_SPACE': + self.report_warning( + 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. 
' + f'Extended attribute "{xattrname}" was not written.') + elif e.reason == 'VALUE_TOO_LONG': + self.report_warning(f'Unable to write extended attribute "{xattrname}" due to too long values.') + else: + tip = ('You need to use NTFS' if compat_os_name == 'nt' + else 'You may have to enable them in your "/etc/fstab"') + raise PostProcessingError(f'This filesystem doesn\'t support extended attributes. {tip}') self.try_utime(info['filepath'], mtime, mtime) return [], info diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 72ae290844..4cf3bdc320 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -135,20 +135,42 @@ def _get_binary_name(): def _get_system_deprecation(): - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 8) + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 8), (3, 9) if sys.version_info > MIN_RECOMMENDED: return None major, minor = sys.version_info[:2] - if sys.version_info < MIN_SUPPORTED: - msg = f'Python version {major}.{minor} is no longer supported' - else: - msg = (f'Support for Python version {major}.{minor} has been deprecated. ' - '\nYou may stop receiving updates on this version at any time') + PYTHON_MSG = f'Please update to Python {".".join(map(str, MIN_RECOMMENDED))} or above' - major, minor = MIN_RECOMMENDED - return f'{msg}! Please update to Python {major}.{minor} or above' + if sys.version_info < MIN_SUPPORTED: + return f'Python version {major}.{minor} is no longer supported! {PYTHON_MSG}' + + EXE_MSG_TMPL = ('Support for {} has been deprecated. ' + 'See https://github.com/yt-dlp/yt-dlp/{} for details.\n{}') + STOP_MSG = 'You may stop receiving updates on this version at any time!' + variant = detect_variant() + + # Temporary until Windows builds use 3.9, which will drop support for Win7 and 2008ServerR2 + if variant in ('win_exe', 'win_x86_exe', 'py2exe'): + platform_name = platform.platform() + if any(platform_name.startswith(f'Windows-{name}') for name in ('7', '2008ServerR2')): + return EXE_MSG_TMPL.format('Windows 7/Server 2008 R2', 'issues/10086', STOP_MSG) + elif variant == 'py2exe': + return EXE_MSG_TMPL.format( + 'py2exe builds (yt-dlp_min.exe)', 'issues/10087', + 'In a future update you will be migrated to the PyInstaller-bundled executable. ' + 'This will be done automatically; no action is required on your part') + return None + + # Temporary until aarch64/armv7l build flow is bumped to Ubuntu 20.04 and Python 3.9 + elif variant in ('linux_aarch64_exe', 'linux_armv7l_exe'): + libc_ver = version_tuple(os.confstr('CS_GNU_LIBC_VERSION').partition(' ')[2]) + if libc_ver < (2, 31): + return EXE_MSG_TMPL.format('system glibc version < 2.31', 'pull/8638', STOP_MSG) + return None + + return f'Support for Python version {major}.{minor} has been deprecated. 
{PYTHON_MSG}'
 
 
 def _sha256_file(path):
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 0d3e707c58..27ebfefbcb 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -664,31 +664,51 @@ def replace_insane(char):
     return result
 
 
+def _sanitize_path_parts(parts):
+    sanitized_parts = []
+    for part in parts:
+        if not part or part == '.':
+            continue
+        elif part == '..':
+            if sanitized_parts and sanitized_parts[-1] != '..':
+                sanitized_parts.pop()
+            else:
+                sanitized_parts.append('..')
+            continue
+        # Replace invalid segments with `#`
+        # - trailing dots and spaces (`asdf...` => `asdf..#`)
+        # - invalid chars (`<>` => `##`)
+        sanitized_part = re.sub(r'[/<>:"\|\\?\*]|[\s.]$', '#', part)
+        sanitized_parts.append(sanitized_part)
+
+    return sanitized_parts
+
+
 def sanitize_path(s, force=False):
     """Sanitizes and normalizes path on Windows"""
-    # XXX: this handles drive relative paths (c:sth) incorrectly
-    if sys.platform == 'win32':
-        force = False
-        drive_or_unc, _ = os.path.splitdrive(s)
-    elif force:
-        drive_or_unc = ''
-    else:
-        return s
+    if sys.platform != 'win32':
+        if not force:
+            return s
+        root = '/' if s.startswith('/') else ''
+        return root + '/'.join(_sanitize_path_parts(s.split('/')))
 
-    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
-    if drive_or_unc:
-        norm_path.pop(0)
-    sanitized_path = [
-        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
-        for path_part in norm_path]
-    if drive_or_unc:
-        sanitized_path.insert(0, drive_or_unc + os.path.sep)
-    elif force and s and s[0] == os.path.sep:
-        sanitized_path.insert(0, os.path.sep)
-    # TODO: Fix behavioral differences <3.12
-    # The workaround using `normpath` only superficially passes tests
-    # Ref: https://github.com/python/cpython/pull/100351
-    return os.path.normpath(os.path.join(*sanitized_path))
+    normed = s.replace('/', '\\')
+
+    if normed.startswith('\\\\'):
+        # UNC path (`\\SERVER\SHARE`) or device path (`\\.`, `\\?`)
+        parts = normed.split('\\')
+        root = '\\'.join(parts[:4]) + '\\'
+        parts = parts[4:]
+    elif normed[1:2] == ':':
+        # absolute path or drive relative path
+        offset = 3 if normed[2:3] == '\\' else 2
+        root = normed[:offset]
+        parts = normed[offset:].split('\\')
+    else:
+        # relative/drive root relative path
+        root = '\\' if normed[:1] == '\\' else ''
+        parts = normed.split('\\')
+
+    return root + '\\'.join(_sanitize_path_parts(parts))
 
 
 def sanitize_url(url, *, scheme='http'):
@@ -804,14 +824,18 @@ class Popen(subprocess.Popen):
     _startupinfo = None
 
     @staticmethod
-    def _fix_pyinstaller_ld_path(env):
-        """Restore LD_LIBRARY_PATH when using PyInstaller
-            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
-                 https://github.com/yt-dlp/yt-dlp/issues/4573
-        """
+    def _fix_pyinstaller_issues(env):
         if not hasattr(sys, '_MEIPASS'):
             return
 
+        # Force spawning independent subprocesses for exes bundled with PyInstaller>=6.10
+        # Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes
+        #      https://github.com/yt-dlp/yt-dlp/issues/11259
+        env['PYINSTALLER_RESET_ENVIRONMENT'] = '1'
+
+        # Restore LD_LIBRARY_PATH when using PyInstaller
+        # Ref: https://pyinstaller.org/en/v6.10.0/runtime-information.html#ld-library-path-libpath-considerations
+        #      https://github.com/yt-dlp/yt-dlp/issues/4573
         def _fix(key):
             orig = env.get(f'{key}_ORIG')
             if orig is None:
@@ -825,7 +849,7 @@ def _fix(key):
 
     def __init__(self, args, *remaining, env=None, text=False, shell=False,
**kwargs): if env is None: env = os.environ.copy() - self._fix_pyinstaller_ld_path(env) + self._fix_pyinstaller_issues(env) self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines') if text is True: @@ -1954,7 +1978,7 @@ def urljoin(base, path): path = path.decode() if not isinstance(path, str) or not path: return None - if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): + if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode() @@ -1964,11 +1988,30 @@ def urljoin(base, path): return urllib.parse.urljoin(base, path) -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): +def partial_application(func): + sig = inspect.signature(func) + + @functools.wraps(func) + def wrapped(*args, **kwargs): + try: + sig.bind(*args, **kwargs) + except TypeError: + return functools.partial(func, *args, **kwargs) + else: + return func(*args, **kwargs) + + return wrapped + + +@partial_application +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr and v is not None: v = getattr(v, get_attr, None) + if invscale == 1 and scale < 1: + invscale = int(1 / scale) + scale = 1 try: - return int(v) * invscale // scale + return (int(v) if base is None else int(v, base=base)) * invscale // scale except (ValueError, TypeError, OverflowError): return default @@ -1986,9 +2029,13 @@ def str_to_int(int_str): return int_or_none(int_str) +@partial_application def float_or_none(v, scale=1, invscale=1, default=None): if v is None: return default + if invscale == 1 and scale < 1: + invscale = int(1 / scale) + scale = 1 try: return float(v) * invscale / scale except (ValueError, TypeError): @@ -2007,7 +2054,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): @@ -2919,6 +2966,7 @@ def mimetype2ext(mt, default=NO_DEFAULT): 'audio/webm': 'webm', 'audio/x-matroska': 'mka', 'audio/x-mpegurl': 'm3u', + 'aacp': 'aac', 'midi': 'mid', 'ogg': 'ogg', 'wav': 'wav', @@ -3112,7 +3160,7 @@ def is_html(first_bytes): while first_bytes.startswith(bom): encoding, first_bytes = enc, first_bytes[len(bom):] - return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) + return re.match(r'\s*<', first_bytes.decode(encoding, 'replace')) def determine_protocol(info_dict): @@ -5280,7 +5328,7 @@ class FormatSorter: settings = { 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'order': ['av0?1', 'vp0?9.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 96eb2eddf5..b918487f98 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -1,18 +1,35 @@ +from __future__ import annotations + +import collections import collections.abc import contextlib +import functools 
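A few worked examples for the rewritten `sanitize_path` in the _utils.py hunk above. On a non-Windows host, `force=True` opts in to the same sanitization rules; inputs here are illustrative:

```python
from yt_dlp.utils import sanitize_path

print(sanitize_path('a/./b/../c', force=True))           # 'a/c': '.' is dropped, 'b/..' collapses
print(sanitize_path('../still/relative', force=True))    # '../still/relative': a leading '..' survives
print(sanitize_path('trailing.dots.../x', force=True))   # 'trailing.dots..#/x': the final dot is `#`-patched
```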
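The new `partial_application` decorator above makes `int_or_none` and `float_or_none` usable as deferred one-argument callables, which is convenient inside traversal paths. A small sketch of the behaviour that follows from the code above:

```python
from yt_dlp.utils import float_or_none, int_or_none

to_ms = int_or_none(scale=0.001)   # no value given -> sig.bind fails -> functools.partial
print(to_ms(5))                    # 5000: scale < 1 is folded into invscale = 1000
print(int_or_none('ff', base=16))  # 255: the new `base` parameter
print(float_or_none('0.5'))        # 0.5: direct calls work as before
print(int_or_none('oops'))         # None on unparsable input, as before
```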
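The plugins.py guard above adds an environment kill-switch for plugin discovery. Any non-empty value works; a minimal sketch, assuming it is set before plugins are first loaded:

```python
import os

os.environ['YTDLP_NO_PLUGINS'] = '1'  # any non-empty string disables discovery

import yt_dlp  # any later load_plugins() call now returns no classes
```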
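On the API side, the renamed `--match-filters` option from the options.py hunk above corresponds to `match_filter_func`, which accepts multiple OR-ed filters. A sketch mirroring the help-text example:

```python
import yt_dlp
from yt_dlp.utils import match_filter_func

ydl_opts = {
    # --match-filters !is_live --match-filters "like_count>?100 & description~='(?i)\bcats \& dogs\b'"
    'match_filter': match_filter_func([
        '!is_live',
        r"like_count>?100 & description~='(?i)\bcats \& dogs\b'",
    ]),
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    pass  # videos failing every filter are skipped during extraction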
import http.cookies import inspect import itertools import re +import typing import xml.etree.ElementTree from ._utils import ( IDENTITY, NO_DEFAULT, + ExtractorError, LazyList, deprecation_warning, + get_elements_html_by_class, + get_elements_html_by_attribute, + get_elements_by_attribute, + get_element_html_by_attribute, + get_element_by_attribute, + get_element_html_by_id, + get_element_by_id, + get_element_html_by_class, + get_elements_by_class, + get_element_text_and_html_by_tag, is_iterable_like, try_call, + url_or_none, variadic, ) @@ -54,6 +71,7 @@ def traverse_obj( Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. - `any`-builtin: Take the first matching object and return it, resetting branching. - `all`-builtin: Take all matching objects and return them as a list, resetting branching. + - `filter`-builtin: Return the value if it is truthy, `None` otherwise. `tuple`, `list`, and `dict` all support nested paths and branches. @@ -247,6 +265,10 @@ def apply_path(start_obj, path, test_type): objs = (list(filtered_objs),) continue + if key is filter: + objs = filter(None, objs) + continue + if __debug__ and callable(key): # Verify function signature inspect.signature(key).bind(None, None) @@ -277,13 +299,143 @@ def _traverse_obj(obj, path, allow_empty, test_type): return results[0] if results else {} if allow_empty and is_dict else None for index, path in enumerate(paths, 1): - result = _traverse_obj(obj, path, index == len(paths), True) - if result is not None: - return result + is_last = index == len(paths) + try: + result = _traverse_obj(obj, path, is_last, True) + if result is not None: + return result + except _RequiredError as e: + if is_last: + # Reraise to get cleaner stack trace + raise ExtractorError(e.orig_msg, expected=e.expected) from None return None if default is NO_DEFAULT else default +def value(value, /): + return lambda _: value + + +def require(name, /, *, expected=False): + def func(value): + if value is None: + raise _RequiredError(f'Unable to extract {name}', expected=expected) + + return value + + return func + + +class _RequiredError(ExtractorError): + pass + + +@typing.overload +def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ... + + +@typing.overload +def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ... + + +def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): + """ + Convert subtitles from a traversal into a subtitle dict. + The path should have an `all` immediately before this function. + + Arguments: + `ext` The default value for `ext` in the subtitle dict + + In the dict you can set the following additional items: + `id` The subtitle id to sort the dict into + `quality` The sort order for each subtitle + """ + if subs is None: + return functools.partial(subs_list_to_dict, ext=ext) + + result = collections.defaultdict(list) + + for sub in subs: + if not url_or_none(sub.get('url')) and not sub.get('data'): + continue + sub_id = sub.pop('id', None) + if sub_id is None: + continue + if ext is not None and not sub.get('ext'): + sub['ext'] = ext + result[sub_id].append(sub) + result = dict(result) + + for subs in result.values(): + subs.sort(key=lambda x: x.pop('quality', 0) or 0) + + return result + + +@typing.overload +def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ... + + +@typing.overload +def find_element(*, cls: str, html=False): ... 
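A compact sketch of the traversal additions above: the `filter` builtin as a path key drops falsy branch results, and `require` raises an `ExtractorError` instead of yielding `None`. The `data` dict is a mock:

```python
from yt_dlp.utils import ExtractorError
from yt_dlp.utils.traversal import require, traverse_obj

data = {'entries': [None, {'id': 'a'}, 0, {'id': 'b'}]}

# `filter` drops the falsy items from the branch
print(traverse_obj(data, ('entries', ..., filter)))  # [{'id': 'a'}, {'id': 'b'}]

# `require` turns a missing value into an ExtractorError
try:
    traverse_obj(data, ('entries', 0, 'id', {require('entry id')}))
except ExtractorError as e:
    print(e.orig_msg)  # 'Unable to extract entry id'
```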
+
+
+@typing.overload
+def find_element(*, id: str, tag: str | None = None, html=False): ...
+
+
+@typing.overload
+def find_element(*, tag: str, html=False): ...
+
+
+def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False):
+    # deliberately using `id=` and `cls=` for ease of readability
+    assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required'
+    ANY_TAG = r'[\w:.-]+'
+
+    if attr and value:
+        assert not cls, 'Cannot match both attr and cls'
+        assert not id, 'Cannot match both attr and id'
+        func = get_element_html_by_attribute if html else get_element_by_attribute
+        return functools.partial(func, attr, value, tag=tag or ANY_TAG)
+
+    elif cls:
+        assert not id, 'Cannot match both cls and id'
+        assert tag is None, 'Cannot match both cls and tag'
+        func = get_element_html_by_class if html else get_elements_by_class
+        return functools.partial(func, cls)
+
+    elif id:
+        func = get_element_html_by_id if html else get_element_by_id
+        return functools.partial(func, id, tag=tag or ANY_TAG)
+
+    index = int(bool(html))
+    return lambda html: get_element_text_and_html_by_tag(tag, html)[index]
+
+
+@typing.overload
+def find_elements(*, cls: str, html=False): ...
+
+
+@typing.overload
+def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ...
+
+
+def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False):
+    # deliberately using `cls=` for ease of readability
+    assert cls or (attr and value), 'One of cls or (attr AND value) is required'
+
+    if attr and value:
+        assert not cls, 'Cannot match both attr and cls'
+        func = get_elements_html_by_attribute if html else get_elements_by_attribute
+        return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+')
+
+    assert not tag, 'Cannot match both cls and tag'
+    func = get_elements_html_by_class if html else get_elements_by_class
+    return functools.partial(func, cls)
+
+
 def get_first(obj, *paths, **kwargs):
     return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False)
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 6633a11b91..2ad18dd196 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py
 
-__version__ = '2024.08.06'
+__version__ = '2024.10.07'
 
-RELEASE_GIT_HEAD = '4d9231208332d4c32364b8cd814bff8b20232cae'
+RELEASE_GIT_HEAD = '1a176d874e6772cd898ce507379ea388e96ee3f7'
 
 VARIANT = None
 
@@ -12,4 +12,4 @@
 
 ORIGIN = 'yt-dlp/yt-dlp'
 
-_pkg_version = '2024.08.06'
+_pkg_version = '2024.10.07'
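A hedged usage sketch for `subs_list_to_dict` as defined in the traversal.py hunk above; it is meant to sit at the end of a traversal that ends in `all`. The input list is a mock:

```python
from yt_dlp.utils.traversal import subs_list_to_dict, traverse_obj

raw_subs = [
    {'id': 'en', 'url': 'https://example.com/en.srv3', 'quality': 0},
    {'id': 'en', 'url': 'https://example.com/en.vtt', 'quality': 1},
    {'id': 'de', 'url': 'https://example.com/de.vtt'},
    {'id': 'xx', 'url': 'not a url'},  # dropped: neither a valid url nor data
]

subtitles = traverse_obj(raw_subs, (..., {dict}, all, {subs_list_to_dict(ext='vtt')}))
print(subtitles)
# roughly: {'en': [<en.srv3 entry>, <en.vtt entry>], 'de': [<de.vtt entry>]},
# sorted by the popped 'quality' key, with 'ext' defaulted to 'vtt'
```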
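And a usage sketch for the `find_element`/`find_elements` helpers above (with the `ANY_TAG` fix applied), composed inside a traversal; the HTML snippet is made up:

```python
from yt_dlp.utils.traversal import find_element, find_elements, traverse_obj

html = '''
<div id="player" data-video-id="abc123">player box</div>
<p class="description">First paragraph</p>
<p class="description">Second paragraph</p>
'''

# Single element by id, returning the full tag html
print(traverse_obj(html, {find_element(id='player', html=True)}))
# <div id="player" data-video-id="abc123">player box</div>

# All elements of a class, returning their text content
print(traverse_obj(html, {find_elements(cls='description')}))
# ['First paragraph', 'Second paragraph']
```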