diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md deleted file mode 100644 index f6c01ce7af..0000000000 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -name: Broken site support -about: Report broken or misfunctioning site -title: "[Broken]" -labels: Broken -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running yt-dlp version **2021.08.10** -- [ ] I've checked that all provided URLs are alive and playable in a browser -- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped -- [ ] I've searched the bugtracker for similar issues including closed ones - - -## Verbose log - - - -``` -PASTE VERBOSE LOG HERE - -``` - - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml new file mode 100644 index 0000000000..9fbea7afb2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -0,0 +1,63 @@ +name: Broken site support +description: Report broken or misfunctioning site +labels: [triage, site-bug] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a broken site + required: true + - label: I've verified that I'm running yt-dlp version **2021.12.27**. 
([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required + - type: input + id: region + attributes: + label: Region + description: "Enter the region the site is accessible from" + placeholder: "India" + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your issue in an arbitrary form. + Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output of yt-dlp **that clearly demonstrates the problem**. + Add the `-Uv` flag to your command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. 
+ It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.27 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.27) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md deleted file mode 100644 index a3cf7f77fc..0000000000 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -name: Site support request -about: Request support for a new site -title: "[Site Request]" -labels: Request -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running yt-dlp version **2021.08.10** -- [ ] I've checked that all provided URLs are alive and playable in a browser -- [ ] I've checked that none of provided URLs violate any copyrights -- [ ] The provided URLs do not contain any DRM to the best of my knowledge -- [ ] I've searched the bugtracker for similar site support requests including closed ones - - -## Example URLs - - - -- Single video: https://www.youtube.com/watch?v=BaW_jenozKc -- Single video: https://youtu.be/BaW_jenozKc -- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml new file mode 100644 index 0000000000..3637941dd5 --- /dev/null +++ 
b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -0,0 +1,74 @@ +name: Site support request +description: Request support for a new site +labels: [triage, site-request] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a new site support request + required: true + - label: I've verified that I'm running yt-dlp version **2021.12.27**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. 
DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and am willing to share it if required + - type: input + id: region + attributes: + label: Region + description: "Enter the region the site is accessible from" + placeholder: "India" + - type: textarea + id: example-urls + attributes: + label: Example URLs + description: | + Provide all kinds of example URLs for which support should be added + placeholder: | + - Single video: https://www.youtube.com/watch?v=BaW_jenozKc + - Single video: https://youtu.be/BaW_jenozKc + - Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + validations: + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide any additional information + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output **using one of the example URLs provided above**. + Add the `-Uv` flag to your command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. 
+ It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.27 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.27) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md deleted file mode 100644 index c4f2617961..0000000000 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: Site feature request -about: Request a new functionality for a site -title: "[Site Request]" -labels: Request -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running yt-dlp version **2021.08.10** -- [ ] I've searched the bugtracker for similar site feature requests including closed ones - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml new file mode 100644 index 0000000000..39245e4769 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -0,0 +1,72 @@ +name: Site feature request +description: Request a new functionality for a supported site +labels: [triage, site-enhancement] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a site feature request + required: 
true + - label: I've verified that I'm running yt-dlp version **2021.12.27**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required + - type: input + id: region + attributes: + label: Region + description: "Enter the region the site is accessible from" + placeholder: "India" + - type: textarea + id: example-urls + attributes: + label: Example URLs + description: | + Example URLs that can be used to demonstrate the requested feature + value: | + https://www.youtube.com/watch?v=BaW_jenozKc + validations: + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your site feature request in an arbitrary form. + Please make sure the description is worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). + Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output of yt-dlp that demonstrates the need for the enhancement. 
+ Add the `-Uv` flag to your command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.27 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.27) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md deleted file mode 100644 index cf2763b2ec..0000000000 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -name: Bug report -about: Report a bug unrelated to any particular site or extractor -title: '' -labels: '' -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a bug unrelated to a specific site -- [ ] I've verified that I'm running yt-dlp version **2021.08.10** -- [ ] I've checked that all provided URLs are alive and playable in a browser -- [ ] The provided URLs do not contain any DRM to the best of my knowledge -- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped -- [ ] I've searched the bugtracker for similar bug reports including closed ones -- [ ] I've read bugs section in FAQ - - -## Verbose log - - - -``` -PASTE VERBOSE LOG HERE - -``` - - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml new file mode 100644 index 0000000000..002859185e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -0,0 +1,57 
@@ +name: Bug report +description: Report a bug unrelated to any particular site or extractor +labels: [triage, bug] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a bug unrelated to a specific site + required: true + - label: I've verified that I'm running yt-dlp version **2021.12.27**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your issue in an arbitrary form. + Please make sure the description is worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). 
+ Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output of yt-dlp **that clearly demonstrates the problem**. + Add the `-Uv` flag to **your** command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.27 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.27) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md deleted file mode 100644 index 77bf4b29df..0000000000 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: Feature request -about: Request a new functionality unrelated to any particular site or extractor -title: "[Feature Request]" -labels: Request -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a feature request -- [ ] I've verified that I'm running yt-dlp version **2021.08.10** -- [ ] I've searched the bugtracker for similar feature requests including closed ones - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml new file mode 100644 index 0000000000..5b0a3918c6 --- /dev/null +++ 
b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -0,0 +1,30 @@ +name: Feature request +description: Request a new functionality unrelated to any particular site or extractor +labels: [triage, enhancement] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a feature request + required: true + - label: I've verified that I'm running yt-dlp version **2021.12.27**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your feature request in an arbitrary form. + Please make sure the description is worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). 
+ Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/6_question.md b/.github/ISSUE_TEMPLATE/6_question.md deleted file mode 100644 index dd2857c092..0000000000 --- a/.github/ISSUE_TEMPLATE/6_question.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: Ask question -about: Ask youtube-dl related question -title: "[Question]" -labels: question -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm asking a question -- [ ] I've looked through the README and FAQ for similar questions -- [ ] I've searched the bugtracker for similar questions including closed ones - - -## Question - - - -WRITE QUESTION HERE diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml new file mode 100644 index 0000000000..ef8ab68358 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -0,0 +1,52 @@ +name: Ask question +description: Ask yt-dlp related question +labels: [question] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm asking a question and **not** reporting a bug/feature request + required: true + - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions including closed ones + required: true + - type: textarea + id: question + attributes: + label: Question + description: | + Ask your question in an arbitrary form. 
+ Please make sure it's worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). + Provide any additional information and as much context and examples as possible. + If your question contains "isn't working" or "can you add", this is most likely the wrong template + placeholder: WRITE QUESTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + If your question involves a yt-dlp command, provide the complete verbose output of that command. + Add the `-Uv` flag to **your** command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.01 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.01) + + render: shell diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..3d168fc736 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: false +contact_links: + - name: Get help from the community on Discord + url: https://discord.gg/H5MNcFW63r + about: Join the yt-dlp Discord for community-powered support! 
diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md deleted file mode 100644 index 6da13a7b50..0000000000 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -name: Broken site support -about: Report broken or misfunctioning site -title: "[Broken]" -labels: Broken -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running yt-dlp version **%(version)s** -- [ ] I've checked that all provided URLs are alive and playable in a browser -- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped -- [ ] I've searched the bugtracker for similar issues including closed ones - - -## Verbose log - - - -``` -PASTE VERBOSE LOG HERE - -``` - - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml new file mode 100644 index 0000000000..ad6af55cfb --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -0,0 +1,63 @@ +name: Broken site support +description: Report broken or misfunctioning site +labels: [triage, site-bug] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a broken site + required: true + - label: I've verified that I'm running yt-dlp version **%(version)s**. 
([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required + - type: input + id: region + attributes: + label: Region + description: "Enter the region the site is accessible from" + placeholder: "India" + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your issue in an arbitrary form. + Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output of yt-dlp **that clearly demonstrates the problem**. + Add the `-Uv` flag to your command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. 
+ It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version %(version)s (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (%(version)s) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md deleted file mode 100644 index 79adb709c1..0000000000 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -name: Site support request -about: Request support for a new site -title: "[Site Request]" -labels: Request -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running yt-dlp version **%(version)s** -- [ ] I've checked that all provided URLs are alive and playable in a browser -- [ ] I've checked that none of provided URLs violate any copyrights -- [ ] The provided URLs do not contain any DRM to the best of my knowledge -- [ ] I've searched the bugtracker for similar site support requests including closed ones - - -## Example URLs - - - -- Single video: https://www.youtube.com/watch?v=BaW_jenozKc -- Single video: https://youtu.be/BaW_jenozKc -- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml new file mode 100644 index 0000000000..cc71fd3823 --- /dev/null +++ 
b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -0,0 +1,74 @@ +name: Site support request +description: Request support for a new site +labels: [triage, site-request] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a new site support request + required: true + - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. 
DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and am willing to share it if required + - type: input + id: region + attributes: + label: Region + description: "Enter the region the site is accessible from" + placeholder: "India" + - type: textarea + id: example-urls + attributes: + label: Example URLs + description: | + Provide all kinds of example URLs for which support should be added + placeholder: | + - Single video: https://www.youtube.com/watch?v=BaW_jenozKc + - Single video: https://youtu.be/BaW_jenozKc + - Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + validations: + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide any additional information + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output **using one of the example URLs provided above**. + Add the `-Uv` flag to your command line you run yt-dlp with (`yt-dlp -Uv `), copy the WHOLE output and insert it below. 
+ It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version %(version)s (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (%(version)s) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md deleted file mode 100644 index d74b6e279f..0000000000 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: Site feature request -about: Request a new functionality for a site -title: "[Site Request]" -labels: Request -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running yt-dlp version **%(version)s** -- [ ] I've searched the bugtracker for similar site feature requests including closed ones - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml new file mode 100644 index 0000000000..2107bbf72b --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -0,0 +1,72 @@ +name: Site feature request +description: Request a new functionality for a supported site +labels: [triage, site-enhancement] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a 
site feature request + required: true + - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required + - type: input + id: region + attributes: + label: Region + description: "Enter the region the site is accessible from" + placeholder: "India" + - type: textarea + id: example-urls + attributes: + label: Example URLs + description: | + Example URLs that can be used to demonstrate the requested feature + value: | + https://www.youtube.com/watch?v=BaW_jenozKc + validations: + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your site feature request in an arbitrary form. + Please make sure the description is worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). + Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output of yt-dlp that demonstrates the need for the enhancement. 
+ Add the `-Uv` flag to your command line you run yt-dlp with (`yt-dlp -Uv <your command line>`), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version %(version)s (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (%(version)s) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md deleted file mode 100644 index 13b577f862..0000000000 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -name: Bug report -about: Report a bug unrelated to any particular site or extractor -title: '' -labels: '' -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a bug unrelated to a specific site -- [ ] I've verified that I'm running yt-dlp version **%(version)s** -- [ ] I've checked that all provided URLs are alive and playable in a browser -- [ ] The provided URLs do not contain any DRM to the best of my knowledge -- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped -- [ ] I've searched the bugtracker for similar bug reports including closed ones -- [ ] I've read bugs section in FAQ - - -## Verbose log - - - -``` -PASTE VERBOSE LOG HERE - -``` - - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml new file mode 100644 index 0000000000..d06b072aa4 --- /dev/null +++ 
b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -0,0 +1,57 @@ +name: Bug report +description: Report a bug unrelated to any particular site or extractor +labels: [triage, bug] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a bug unrelated to a specific site + required: true + - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've checked that all provided URLs are alive and playable in a browser + required: true + - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/ytdl-org/youtube-dl#video-url-contains-an-ampersand-and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your issue in an arbitrary form. + Please make sure the description is worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). 
+ Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + Provide the complete verbose output of yt-dlp **that clearly demonstrates the problem**. + Add the `-Uv` flag to **your** command line you run yt-dlp with (`yt-dlp -Uv <your command line>`), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version %(version)s (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (%(version)s) + + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md deleted file mode 100644 index 4a0209db1b..0000000000 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: Feature request -about: Request a new functionality unrelated to any particular site or extractor -title: "[Feature Request]" -labels: Request -assignees: '' - ---- - - - - -## Checklist - - - -- [ ] I'm reporting a feature request -- [ ] I've verified that I'm running yt-dlp version **%(version)s** -- [ ] I've searched the bugtracker for similar feature requests including closed ones - - -## Description - - - -WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml new file mode 100644 index 
0000000000..6e8b2fd286 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -0,0 +1,30 @@ +name: Feature request +description: Request a new functionality unrelated to any particular site or extractor +labels: [triage, enhancement] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm reporting a feature request + required: true + - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - type: textarea + id: description + attributes: + label: Description + description: | + Provide an explanation of your site feature request in an arbitrary form. + Please make sure the description is worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). 
+ Provide any additional information, any suggested solutions, and as much context and examples as possible + placeholder: WRITE DESCRIPTION HERE + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml new file mode 100644 index 0000000000..ef8ab68358 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -0,0 +1,52 @@ +name: Ask question +description: Ask yt-dlp related question +labels: [question] +body: + - type: checkboxes + id: checklist + attributes: + label: Checklist + description: | + Carefully read and work through this check list in order to prevent the most common mistakes and misuse of yt-dlp: + options: + - label: I'm asking a question and **not** reporting a bug/feature request + required: true + - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) + required: true + - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + required: true + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions including closed ones + required: true + - type: textarea + id: question + attributes: + label: Question + description: | + Ask your question in an arbitrary form. + Please make sure it's worded well enough to be understood, see [is-the-description-of-the-issue-itself-sufficient](https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient). + Provide any additional information and as much context and examples as possible. + If your question contains "isn't working" or "can you add", this is most likely the wrong template + placeholder: WRITE QUESTION HERE + validations: + required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + If your question involves a yt-dlp command, provide the complete verbose output of that command. 
+ Add the `-Uv` flag to **your** command line you run yt-dlp with (`yt-dlp -Uv <your command line>`), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-Uv', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.01 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.01) + + render: shell diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index f711701cb6..684bf59e91 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,11 +7,11 @@ ## Please follow the guide below --- ### Before submitting a *pull request* make sure you have: -- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/ytdl-org/youtube-dl#youtube-dl-coding-conventions) sections +- [ ] At least skimmed through [contributing guidelines](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) including [yt-dlp coding conventions](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#yt-dlp-coding-conventions) - [ ] [Searched](https://github.com/yt-dlp/yt-dlp/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests - [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) -### In order to be accepted and merged into youtube-dl each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). 
Check one of the following options: +### In order to be accepted and merged into yt-dlp each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: - [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/) - [ ] I am not the original author of this code but it is in public domain or released under [Unlicense](http://unlicense.org/) (provide reliable evidence) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4c56a5180b..4a1c68f0db 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,35 +1,116 @@ name: Build - -on: - push: - branches: - - release +on: workflow_dispatch jobs: build_unix: runs-on: ubuntu-latest - outputs: + version_suffix: ${{ steps.version_suffix.outputs.version_suffix }} ytdlp_version: ${{ steps.bump_version.outputs.ytdlp_version }} upload_url: ${{ steps.create_release.outputs.upload_url }} - sha256_unix: ${{ steps.sha256_file.outputs.sha256_unix }} - sha512_unix: ${{ steps.sha512_file.outputs.sha512_unix }} + sha256_bin: ${{ steps.sha256_bin.outputs.sha256_bin }} + sha512_bin: ${{ steps.sha512_bin.outputs.sha512_bin }} + sha256_tar: ${{ steps.sha256_tar.outputs.sha256_tar }} + sha512_tar: ${{ steps.sha512_tar.outputs.sha512_tar }} steps: - uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v2 with: python-version: '3.8' - name: Install packages run: sudo apt-get -y install zip pandoc man + - name: Set version suffix + id: version_suffix + env: + PUSH_VERSION_COMMIT: ${{ secrets.PUSH_VERSION_COMMIT }} + if: "env.PUSH_VERSION_COMMIT == ''" + run: echo ::set-output name=version_suffix::$(date -u +"%H%M%S") - name: Bump version id: bump_version - run: python devscripts/update-version.py - - name: Print version - run: echo "${{ steps.bump_version.outputs.ytdlp_version }}" + run: | + python devscripts/update-version.py ${{ 
steps.version_suffix.outputs.version_suffix }} + make issuetemplates + - name: Push to release + id: push_release + run: | + git config --global user.name github-actions + git config --global user.email github-actions@example.com + git add -u + git commit -m "[version] update" -m "Created by: ${{ github.event.sender.login }}" -m ":ci skip all" + git push origin --force ${{ github.event.ref }}:release + echo ::set-output name=head_sha::$(git rev-parse HEAD) + - name: Update master + id: push_master + env: + PUSH_VERSION_COMMIT: ${{ secrets.PUSH_VERSION_COMMIT }} + if: "env.PUSH_VERSION_COMMIT != ''" + run: git push origin ${{ github.event.ref }} + - name: Get Changelog + id: get_changelog + run: | + changelog=$(cat Changelog.md | grep -oPz '(?s)(?<=### ${{ steps.bump_version.outputs.ytdlp_version }}\n{2}).+?(?=\n{2,3}###)') || true + echo "changelog<<EOF" >> $GITHUB_ENV + echo "$changelog" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Build lazy extractors + id: lazy_extractors + run: python devscripts/make_lazy_extractors.py - name: Run Make run: make all tar + - name: Get SHA2-256SUMS for yt-dlp + id: sha256_bin + run: echo "::set-output name=sha256_bin::$(sha256sum yt-dlp | awk '{print $1}')" + - name: Get SHA2-256SUMS for yt-dlp.tar.gz + id: sha256_tar + run: echo "::set-output name=sha256_tar::$(sha256sum yt-dlp.tar.gz | awk '{print $1}')" + - name: Get SHA2-512SUMS for yt-dlp + id: sha512_bin + run: echo "::set-output name=sha512_bin::$(sha512sum yt-dlp | awk '{print $1}')" + - name: Get SHA2-512SUMS for yt-dlp.tar.gz + id: sha512_tar + run: echo "::set-output name=sha512_tar::$(sha512sum yt-dlp.tar.gz | awk '{print $1}')" + + - name: Install dependencies for pypi + env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + if: "env.PYPI_TOKEN != ''" + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish on pypi + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + if: 
"env.TWINE_PASSWORD != ''" + run: | + rm -rf dist/* + python setup.py sdist bdist_wheel + twine upload dist/* + + - name: Install SSH private key + env: + BREW_TOKEN: ${{ secrets.BREW_TOKEN }} + if: "env.BREW_TOKEN != ''" + uses: webfactory/ssh-agent@v0.5.3 + with: + ssh-private-key: ${{ env.BREW_TOKEN }} + - name: Update Homebrew Formulae + env: + BREW_TOKEN: ${{ secrets.BREW_TOKEN }} + if: "env.BREW_TOKEN != ''" + run: | + git clone git@github.com:yt-dlp/homebrew-taps taps/ + python3 devscripts/update-formulae.py taps/Formula/yt-dlp.rb "${{ steps.bump_version.outputs.ytdlp_version }}" + git -C taps/ config user.name github-actions + git -C taps/ config user.email github-actions@example.com + git -C taps/ commit -am 'yt-dlp: ${{ steps.bump_version.outputs.ytdlp_version }}' + git -C taps/ push + - name: Create Release id: create_release uses: actions/create-release@v1 @@ -38,9 +119,14 @@ jobs: with: tag_name: ${{ steps.bump_version.outputs.ytdlp_version }} release_name: yt-dlp ${{ steps.bump_version.outputs.ytdlp_version }} + commitish: ${{ steps.push_release.outputs.head_sha }} body: | - Changelog: - PLACEHOLDER + #### [A description of the various files](https://github.com/yt-dlp/yt-dlp#release-files) is in the README + + --- + + ### Changelog: + ${{ env.changelog }} draft: false prerelease: false - name: Upload yt-dlp Unix binary @@ -62,36 +148,82 @@ jobs: asset_path: ./yt-dlp.tar.gz asset_name: yt-dlp.tar.gz asset_content_type: application/gzip - - name: Get SHA2-256SUMS for yt-dlp - id: sha256_file - run: echo "::set-output name=sha256_unix::$(sha256sum yt-dlp | awk '{print $1}')" - - name: Get SHA2-512SUMS for yt-dlp - id: sha512_file - run: echo "::set-output name=sha512_unix::$(sha512sum yt-dlp | awk '{print $1}')" - - name: Install dependencies for pypi - env: - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - if: "env.PYPI_TOKEN != ''" + + build_macos: + runs-on: macos-11 + needs: build_unix + outputs: + sha256_macos: ${{ steps.sha256_macos.outputs.sha256_macos 
}} + sha512_macos: ${{ steps.sha512_macos.outputs.sha512_macos }} + sha256_macos_zip: ${{ steps.sha256_macos_zip.outputs.sha256_macos_zip }} + sha512_macos_zip: ${{ steps.sha512_macos_zip.outputs.sha512_macos_zip }} + + steps: + - uses: actions/checkout@v2 + # In order to create a universal2 application, the version of python3 in /usr/bin has to be used + # Pyinstaller is pinned to 4.5.1 because the builds are failing in 4.6, 4.7 + - name: Install Requirements run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish on pypi + brew install coreutils + /usr/bin/python3 -m pip install -U --user pip Pyinstaller==4.5.1 mutagen pycryptodomex websockets + - name: Bump version + id: bump_version + run: /usr/bin/python3 devscripts/update-version.py + - name: Build lazy extractors + id: lazy_extractors + run: /usr/bin/python3 devscripts/make_lazy_extractors.py + - name: Run PyInstaller Script + run: /usr/bin/python3 pyinst.py --target-architecture universal2 --onefile + - name: Upload yt-dlp MacOS binary + id: upload-release-macos + uses: actions/upload-release-asset@v1 env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - if: "env.TWINE_PASSWORD != ''" - run: | - rm -rf dist/* - python setup.py sdist bdist_wheel - twine upload dist/* + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.build_unix.outputs.upload_url }} + asset_path: ./dist/yt-dlp_macos + asset_name: yt-dlp_macos + asset_content_type: application/octet-stream + - name: Get SHA2-256SUMS for yt-dlp_macos + id: sha256_macos + run: echo "::set-output name=sha256_macos::$(sha256sum dist/yt-dlp_macos | awk '{print $1}')" + - name: Get SHA2-512SUMS for yt-dlp_macos + id: sha512_macos + run: echo "::set-output name=sha512_macos::$(sha512sum dist/yt-dlp_macos | awk '{print $1}')" + + - name: Run PyInstaller Script with --onedir + run: /usr/bin/python3 pyinst.py --target-architecture universal2 --onedir + - uses: 
papeloto/action-zip@v1 + with: + files: ./dist/yt-dlp_macos + dest: ./dist/yt-dlp_macos.zip + - name: Upload yt-dlp MacOS onedir + id: upload-release-macos-zip + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.build_unix.outputs.upload_url }} + asset_path: ./dist/yt-dlp_macos.zip + asset_name: yt-dlp_macos.zip + asset_content_type: application/zip + - name: Get SHA2-256SUMS for yt-dlp_macos.zip + id: sha256_macos_zip + run: echo "::set-output name=sha256_macos_zip::$(sha256sum dist/yt-dlp_macos.zip | awk '{print $1}')" + - name: Get SHA2-512SUMS for yt-dlp_macos.zip + id: sha512_macos_zip + run: echo "::set-output name=sha512_macos_zip::$(sha512sum dist/yt-dlp_macos.zip | awk '{print $1}')" build_windows: runs-on: windows-latest needs: build_unix - outputs: - sha256_windows: ${{ steps.sha256_file_win.outputs.sha256_windows }} - sha512_windows: ${{ steps.sha512_file_win.outputs.sha512_windows }} + sha256_win: ${{ steps.sha256_win.outputs.sha256_win }} + sha512_win: ${{ steps.sha512_win.outputs.sha512_win }} + sha256_py2exe: ${{ steps.sha256_py2exe.outputs.sha256_py2exe }} + sha512_py2exe: ${{ steps.sha512_py2exe.outputs.sha512_py2exe }} + sha256_win_zip: ${{ steps.sha256_win_zip.outputs.sha256_win_zip }} + sha512_win_zip: ${{ steps.sha512_win_zip.outputs.sha512_win_zip }} steps: - uses: actions/checkout@v2 @@ -100,18 +232,21 @@ jobs: uses: actions/setup-python@v2 with: python-version: '3.8' - - name: Upgrade pip and enable wheel support - run: python -m pip install --upgrade pip setuptools wheel - name: Install Requirements # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds - run: pip install "https://yt-dlp.github.io/pyinstaller-builds/x86_64/pyinstaller-4.5.1-py3-none-any.whl" mutagen pycryptodome websockets + run: | + python -m pip install --upgrade pip setuptools wheel py2exe + pip install 
"https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-4.5.1-py3-none-any.whl" mutagen pycryptodomex websockets - name: Bump version id: bump_version - run: python devscripts/update-version.py - - name: Print version - run: echo "${{ steps.bump_version.outputs.ytdlp_version }}" + env: + version_suffix: ${{ needs.build_unix.outputs.version_suffix }} + run: python devscripts/update-version.py ${{ env.version_suffix }} + - name: Build lazy extractors + id: lazy_extractors + run: python devscripts/make_lazy_extractors.py - name: Run PyInstaller Script - run: python pyinst.py 64 + run: python pyinst.py - name: Upload yt-dlp.exe Windows binary id: upload-release-windows uses: actions/upload-release-asset@v1 @@ -123,19 +258,61 @@ jobs: asset_name: yt-dlp.exe asset_content_type: application/vnd.microsoft.portable-executable - name: Get SHA2-256SUMS for yt-dlp.exe - id: sha256_file_win - run: echo "::set-output name=sha256_windows::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA256).Hash.ToLower())" + id: sha256_win + run: echo "::set-output name=sha256_win::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA256).Hash.ToLower())" - name: Get SHA2-512SUMS for yt-dlp.exe - id: sha512_file_win - run: echo "::set-output name=sha512_windows::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA512).Hash.ToLower())" + id: sha512_win + run: echo "::set-output name=sha512_win::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA512).Hash.ToLower())" + + - name: Run PyInstaller Script with --onedir + run: python pyinst.py --onedir + - uses: papeloto/action-zip@v1 + with: + files: ./dist/yt-dlp + dest: ./dist/yt-dlp_win.zip + - name: Upload yt-dlp Windows onedir + id: upload-release-windows-zip + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.build_unix.outputs.upload_url }} + asset_path: ./dist/yt-dlp_win.zip + asset_name: yt-dlp_win.zip + asset_content_type: application/zip + - name: Get SHA2-256SUMS for yt-dlp_win.zip 
+ id: sha256_win_zip + run: echo "::set-output name=sha256_win_zip::$((Get-FileHash dist\yt-dlp_win.zip -Algorithm SHA256).Hash.ToLower())" + - name: Get SHA2-512SUMS for yt-dlp_win.zip + id: sha512_win_zip + run: echo "::set-output name=sha512_win_zip::$((Get-FileHash dist\yt-dlp_win.zip -Algorithm SHA512).Hash.ToLower())" + + - name: Run py2exe Script + run: python setup.py py2exe + - name: Upload yt-dlp_min.exe Windows binary + id: upload-release-windows-py2exe + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.build_unix.outputs.upload_url }} + asset_path: ./dist/yt-dlp.exe + asset_name: yt-dlp_min.exe + asset_content_type: application/vnd.microsoft.portable-executable + - name: Get SHA2-256SUMS for yt-dlp_min.exe + id: sha256_py2exe + run: echo "::set-output name=sha256_py2exe::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA256).Hash.ToLower())" + - name: Get SHA2-512SUMS for yt-dlp_min.exe + id: sha512_py2exe + run: echo "::set-output name=sha512_py2exe::$((Get-FileHash dist\yt-dlp.exe -Algorithm SHA512).Hash.ToLower())" build_windows32: runs-on: windows-latest - needs: [build_unix, build_windows] + needs: build_unix outputs: - sha256_windows32: ${{ steps.sha256_file_win32.outputs.sha256_windows32 }} - sha512_windows32: ${{ steps.sha512_file_win32.outputs.sha512_windows32 }} + sha256_win32: ${{ steps.sha256_win32.outputs.sha256_win32 }} + sha512_win32: ${{ steps.sha512_win32.outputs.sha512_win32 }} steps: - uses: actions/checkout@v2 @@ -145,17 +322,20 @@ jobs: with: python-version: '3.7' architecture: 'x86' - - name: Upgrade pip and enable wheel support - run: python -m pip install --upgrade pip setuptools wheel - name: Install Requirements - run: pip install "https://yt-dlp.github.io/pyinstaller-builds/i686/pyinstaller-4.5.1-py3-none-any.whl" mutagen pycryptodome websockets + run: | + python -m pip install --upgrade pip setuptools wheel + pip install 
"https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-4.5.1-py3-none-any.whl" mutagen pycryptodomex websockets - name: Bump version id: bump_version - run: python devscripts/update-version.py - - name: Print version - run: echo "${{ steps.bump_version.outputs.ytdlp_version }}" + env: + version_suffix: ${{ needs.build_unix.outputs.version_suffix }} + run: python devscripts/update-version.py ${{ env.version_suffix }} + - name: Build lazy extractors + id: lazy_extractors + run: python devscripts/make_lazy_extractors.py - name: Run PyInstaller Script for 32 Bit - run: python pyinst.py 32 + run: python pyinst.py - name: Upload Executable yt-dlp_x86.exe id: upload-release-windows32 uses: actions/upload-release-asset@v1 @@ -167,28 +347,36 @@ jobs: asset_name: yt-dlp_x86.exe asset_content_type: application/vnd.microsoft.portable-executable - name: Get SHA2-256SUMS for yt-dlp_x86.exe - id: sha256_file_win32 - run: echo "::set-output name=sha256_windows32::$((Get-FileHash dist\yt-dlp_x86.exe -Algorithm SHA256).Hash.ToLower())" + id: sha256_win32 + run: echo "::set-output name=sha256_win32::$((Get-FileHash dist\yt-dlp_x86.exe -Algorithm SHA256).Hash.ToLower())" - name: Get SHA2-512SUMS for yt-dlp_x86.exe - id: sha512_file_win32 - run: echo "::set-output name=sha512_windows32::$((Get-FileHash dist\yt-dlp_x86.exe -Algorithm SHA512).Hash.ToLower())" + id: sha512_win32 + run: echo "::set-output name=sha512_win32::$((Get-FileHash dist\yt-dlp_x86.exe -Algorithm SHA512).Hash.ToLower())" finish: runs-on: ubuntu-latest - needs: [build_unix, build_windows, build_windows32] + needs: [build_unix, build_windows, build_windows32, build_macos] steps: - name: Make SHA2-256SUMS file env: - SHA256_WINDOWS: ${{ needs.build_windows.outputs.sha256_windows }} - SHA256_WINDOWS32: ${{ needs.build_windows32.outputs.sha256_windows32 }} - SHA256_UNIX: ${{ needs.build_unix.outputs.sha256_unix }} - YTDLP_VERSION: ${{ needs.build_unix.outputs.ytdlp_version }} + SHA256_BIN: ${{ 
needs.build_unix.outputs.sha256_bin }} + SHA256_TAR: ${{ needs.build_unix.outputs.sha256_tar }} + SHA256_WIN: ${{ needs.build_windows.outputs.sha256_win }} + SHA256_PY2EXE: ${{ needs.build_windows.outputs.sha256_py2exe }} + SHA256_WIN_ZIP: ${{ needs.build_windows.outputs.sha256_win_zip }} + SHA256_WIN32: ${{ needs.build_windows32.outputs.sha256_win32 }} + SHA256_MACOS: ${{ needs.build_macos.outputs.sha256_macos }} + SHA256_MACOS_ZIP: ${{ needs.build_macos.outputs.sha256_macos_zip }} run: | - echo "version:${{ env.YTDLP_VERSION }}" >> SHA2-256SUMS - echo "yt-dlp.exe:${{ env.SHA256_WINDOWS }}" >> SHA2-256SUMS - echo "yt-dlp_x86.exe:${{ env.SHA256_WINDOWS32 }}" >> SHA2-256SUMS - echo "yt-dlp:${{ env.SHA256_UNIX }}" >> SHA2-256SUMS + echo "${{ env.SHA256_BIN }} yt-dlp" >> SHA2-256SUMS + echo "${{ env.SHA256_TAR }} yt-dlp.tar.gz" >> SHA2-256SUMS + echo "${{ env.SHA256_WIN }} yt-dlp.exe" >> SHA2-256SUMS + echo "${{ env.SHA256_PY2EXE }} yt-dlp_min.exe" >> SHA2-256SUMS + echo "${{ env.SHA256_WIN32 }} yt-dlp_x86.exe" >> SHA2-256SUMS + echo "${{ env.SHA256_WIN_ZIP }} yt-dlp_win.zip" >> SHA2-256SUMS + echo "${{ env.SHA256_MACOS }} yt-dlp_macos" >> SHA2-256SUMS + echo "${{ env.SHA256_MACOS_ZIP }} yt-dlp_macos.zip" >> SHA2-256SUMS - name: Upload 256SUMS file id: upload-sums uses: actions/upload-release-asset@v1 @@ -201,13 +389,23 @@ jobs: asset_content_type: text/plain - name: Make SHA2-512SUMS file env: - SHA512_WINDOWS: ${{ needs.build_windows.outputs.sha512_windows }} - SHA512_WINDOWS32: ${{ needs.build_windows32.outputs.sha512_windows32 }} - SHA512_UNIX: ${{ needs.build_unix.outputs.sha512_unix }} + SHA512_BIN: ${{ needs.build_unix.outputs.sha512_bin }} + SHA512_TAR: ${{ needs.build_unix.outputs.sha512_tar }} + SHA512_WIN: ${{ needs.build_windows.outputs.sha512_win }} + SHA512_PY2EXE: ${{ needs.build_windows.outputs.sha512_py2exe }} + SHA512_WIN_ZIP: ${{ needs.build_windows.outputs.sha512_win_zip }} + SHA512_WIN32: ${{ needs.build_windows32.outputs.sha512_win32 }} + 
SHA512_MACOS: ${{ needs.build_macos.outputs.sha512_macos }} + SHA512_MACOS_ZIP: ${{ needs.build_macos.outputs.sha512_macos_zip }} run: | - echo "${{ env.SHA512_WINDOWS }} yt-dlp.exe" >> SHA2-512SUMS - echo "${{ env.SHA512_WINDOWS32 }} yt-dlp_x86.exe" >> SHA2-512SUMS - echo "${{ env.SHA512_UNIX }} yt-dlp" >> SHA2-512SUMS + echo "${{ env.SHA512_BIN }} yt-dlp" >> SHA2-512SUMS + echo "${{ env.SHA512_TAR }} yt-dlp.tar.gz" >> SHA2-512SUMS + echo "${{ env.SHA512_WIN }} yt-dlp.exe" >> SHA2-512SUMS + echo "${{ env.SHA512_WIN_ZIP }} yt-dlp_win.zip" >> SHA2-512SUMS + echo "${{ env.SHA512_PY2EXE }} yt-dlp_min.exe" >> SHA2-512SUMS + echo "${{ env.SHA512_WIN32 }} yt-dlp_x86.exe" >> SHA2-512SUMS + echo "${{ env.SHA512_MACOS }} yt-dlp_macos" >> SHA2-512SUMS + echo "${{ env.SHA512_MACOS_ZIP }} yt-dlp_macos.zip" >> SHA2-512SUMS - name: Upload 512SUMS file id: upload-512sums uses: actions/upload-release-asset@v1 diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 500a504a4a..d8e14f4705 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -12,7 +12,7 @@ jobs: with: python-version: 3.9 - name: Install test requirements - run: pip install pytest pycryptodome + run: pip install pytest pycryptodomex - name: Run tests run: ./devscripts/run_tests.sh core flake8: @@ -28,6 +28,6 @@ jobs: - name: Install flake8 run: pip install flake8 - name: Make lazy extractors - run: python devscripts/make_lazy_extractors.py yt_dlp/extractor/lazy_extractors.py + run: python devscripts/make_lazy_extractors.py - name: Run flake8 run: flake8 . 
diff --git a/.gitignore b/.gitignore index 7ed34448a1..232096916c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,46 +1,56 @@ # Config *.conf -*.spec cookies -cookies.txt +*cookies.txt +.netrc # Downloaded -*.srt -*.ttml -*.sbv -*.vtt -*.flv -*.mp4 -*.m4a -*.m4v -*.mp3 -*.3gp -*.webm -*.wav -*.ape -*.mkv -*.swf -*.part -*.part-* -*.ytdl +*.annotations.xml +*.aria2 +*.description *.dump *.frag +*.frag.aria2 *.frag.urls -*.aria2 -*.swp -*.ogg -*.opus *.info.json *.live_chat.json -*.jpg +*.part* +*.unknown_video +*.ytdl +.cache/ + +*.3gp +*.ape +*.avi +*.desktop +*.flac +*.flv *.jpeg +*.jpg +*.m4a +*.m4v +*.mhtml +*.mkv +*.mov +*.mp3 +*.mp4 +*.ogg +*.opus *.png +*.sbv +*.srt +*.swf +*.swp +*.ttml +*.url +*.vtt +*.wav +*.webloc +*.webm *.webp -*.annotations.xml -*.description # Allow config/media files in testdata -!test/testdata/** +!test/** # Python *.pyc @@ -76,7 +86,6 @@ README.txt *.1 *.bash-completion *.fish -*.exe *.tar.gz *.zsh *.spec diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5faf97b102..3a390dd9ac 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,26 +1,60 @@ -**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. 
It should look similar to this: +# CONTRIBUTING TO YT-DLP + +- [OPENING AN ISSUE](#opening-an-issue) + - [Is the description of the issue itself sufficient?](#is-the-description-of-the-issue-itself-sufficient) + - [Are you using the latest version?](#are-you-using-the-latest-version) + - [Is the issue already documented?](#is-the-issue-already-documented) + - [Why are existing options not enough?](#why-are-existing-options-not-enough) + - [Have you read and understood the changes, between youtube-dl and yt-dlp](#have-you-read-and-understood-the-changes-between-youtube-dl-and-yt-dlp) + - [Is there enough context in your bug report?](#is-there-enough-context-in-your-bug-report) + - [Does the issue involve one problem, and one problem only?](#does-the-issue-involve-one-problem-and-one-problem-only) + - [Is anyone going to need the feature?](#is-anyone-going-to-need-the-feature) + - [Is your question about yt-dlp?](#is-your-question-about-yt-dlp) + - [Are you willing to share account details if needed?](#are-you-willing-to-share-account-details-if-needed) +- [DEVELOPER INSTRUCTIONS](#developer-instructions) + - [Adding new feature or making overarching changes](#adding-new-feature-or-making-overarching-changes) + - [Adding support for a new site](#adding-support-for-a-new-site) + - [yt-dlp coding conventions](#yt-dlp-coding-conventions) + - [Mandatory and optional metafields](#mandatory-and-optional-metafields) + - [Provide fallbacks](#provide-fallbacks) + - [Regular expressions](#regular-expressions) + - [Long lines policy](#long-lines-policy) + - [Inline values](#inline-values) + - [Collapse fallbacks](#collapse-fallbacks) + - [Trailing parentheses](#trailing-parentheses) + - [Use convenience conversion and parsing functions](#use-convenience-conversion-and-parsing-functions) +- [EMBEDDING YT-DLP](README.md#embedding-yt-dlp) + + + +# OPENING AN ISSUE + +Bugs and suggestions should be reported at: [yt-dlp/yt-dlp/issues](https://github.com/yt-dlp/yt-dlp/issues). 
Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in our [discord server](https://discord.gg/H5MNcFW63r). + +**Please include the full output of yt-dlp when run with `-Uv`**, i.e. **add** `-Uv` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` -$ youtube-dl -v -[debug] System config: [] -[debug] User config: [] -[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKc'] -[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2015.12.06 -[debug] Git HEAD: 135392e -[debug] Python version 2.6.6 - Windows-2003Server-5.2.3790-SP2 -[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +$ yt-dlp -Uv +[debug] Command-line config: ['-v', 'demo.com'] +[debug] Encodings: locale UTF-8, fs utf-8, out utf-8, pref UTF-8 +[debug] yt-dlp version 2021.09.25 (zip) +[debug] Python version 3.8.10 (CPython 64bit) - Linux-5.4.0-74-generic-x86_64-with-glibc2.29 +[debug] exe versions: ffmpeg 4.2.4, ffprobe 4.2.4 [debug] Proxy map: {} +Current Build Hash 25cc412d1d3c0725a1f2f5b7e4682f6fb40e6d15f7024e96f7afd572e9919535 +yt-dlp is up to date (2021.09.25) ... ``` **Do not post screenshots of verbose logs; only plain text is acceptable.** -The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. +The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore will be closed as `incomplete`. + +The templates provided for the Issues should be completed and **not removed**; this helps aid the resolution of the issue. 
Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): ### Is the description of the issue itself sufficient? -We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts. +We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious @@ -28,25 +62,31 @@ ### Is the description of the issue itself sufficient? - How it could be fixed - How your proposed solution would look like -If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. We often get frustrated by these issues, since the only possible way for us to move forward on them is to ask for clarification over and over. -For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. 
The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. +For bug reports, this means that your report should contain the **complete** output of yt-dlp when called with the `-Uv` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). +If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--write-pages` and upload the `.dump` files you get [somewhere](https://gist.github.com). **Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? -Before reporting any issue, type `youtube-dl -U`. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well. +Before reporting any issue, type `yt-dlp -U`. This should report that you're up-to-date. This goes for feature requests as well. ### Is the issue already documented? -Make sure that someone has not already opened the issue you're trying to open. 
Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/yt-dlp/yt-dlp/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2021.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. + +Additionally, it is also helpful to see if the issue has already been documented in the [youtube-dl issue tracker](https://github.com/ytdl-org/youtube-dl/issues). If similar issues have already been reported in youtube-dl (but not in our issue tracker), links to them can be included in your issue report here. ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](README.md#usage-and-options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. 
+ +### Have you read and understood the changes, between youtube-dl and yt-dlp + +There are many changes between youtube-dl and yt-dlp [(changes to default behavior)](README.md#differences-in-default-behavior), and some of the options available have a different behavior in yt-dlp, or have been removed altogether [(list of changes to options)](README.md#deprecated-options). Make sure you have read and understood the differences in the options and how this may impact your downloads before opening an issue. ### Is there enough context in your bug report? @@ -58,23 +98,40 @@ ### Does the issue involve one problem, and one problem only? Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones. -In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, White house podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. +In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, White house podcasts, and Google Plus pages in the same issue. 
Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of yt-dlp that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service. ### Is anyone going to need the feature? Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. -### Is your question about youtube-dl? +### Is your question about yt-dlp? + +Some bug reports are completely unrelated to yt-dlp and relate to a different, or even the reporter's own, application. Please make sure that you are actually using yt-dlp. If you are using a UI for yt-dlp, report the bug to the maintainer of the actual application providing the UI. In general, if you are unable to provide the verbose log, you should not be opening the issue here. + +If the issue is with `youtube-dl` (the upstream fork of yt-dlp) and not with yt-dlp, the issue should be raised in the youtube-dl project. + +### Are you willing to share account details if needed? + +The maintainers and potential contributors of the project often do not have an account for the website you are asking support for. So any developer interested in solving your issue may ask you for account details. It is your personal discretion whether you are willing to share the account in order for the developer to try and solve your issue. However, if you are unwilling or unable to provide details, they obviously cannot work on the issue and it cannot be solved unless some developer who both has an account and is willing/able to contribute decides to solve it. + +By sharing an account with anyone, you agree to bear all risks associated with it. The maintainers and yt-dlp can't be held responsible for any misuse of the credentials. 
+ +While these steps won't necessarily ensure that no misuse of the account takes place, these are still some good practices to follow. + +- Look for people with `Member` (maintainers of the project) or `Contributor` (people who have previously contributed code) tag on their messages. +- Change the password before sharing the account to something random (use [this](https://passwordsgenerator.net/) if you don't have a random password generator). +- Change the password after receiving the account back. + + -It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different, or even the reporter's own, application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](https://ytdl-org.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build yt-dlp and can [download the builds](https://github.com/yt-dlp/yt-dlp/releases) or get them via [the other installation methods](README.md#installation). -To run youtube-dl as a developer, you don't need to build anything either. Simply execute +To run yt-dlp as a developer, you don't need to build anything either. Simply execute - python -m youtube_dl + python -m yt_dlp To run the test, simply invoke your favorite test runner, or execute a test file directly; any of the following work: @@ -85,42 +142,42 @@ # DEVELOPER INSTRUCTIONS See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. 
-If you want to create a build of youtube-dl yourself, you'll need +If you want to create a build of yt-dlp yourself, you can follow the instructions [here](README.md#compile). -* python3 -* make (only GNU make is supported) -* pandoc -* zip -* pytest -### Adding support for a new site +## Adding new feature or making overarching changes -If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](README.md#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. +Before you start writing code for implementing a new feature, open an issue explaining your feature request and at least one use case. This allows the maintainers to decide whether such a feature is desired for the project in the first place, and will provide an avenue to discuss some implementation details. If you open a pull request for a new feature without discussing with us first, do not be surprised when we ask for large changes to the code, or even reject it outright. + +The same applies for changes to the documentation, code style, or overarching changes to the architecture. + + +## Adding support for a new site + +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](https://www.github.com/ytdl-org/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. yt-dlp does **not support** such sites thus pull requests adding support for them **will be rejected**. After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): -1. [Fork this repository](https://github.com/ytdl-org/youtube-dl/fork) -2. Check out the source code with: +1. 
[Fork this repository](https://github.com/yt-dlp/yt-dlp/fork) +1. Check out the source code with: - git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git + git clone git@github.com:YOUR_GITHUB_USERNAME/yt-dlp.git -3. Start a new git branch with +1. Start a new git branch with - cd youtube-dl + cd yt-dlp git checkout -b yourextractor -4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: +1. Start with this simple template and save it to `yt_dlp/extractor/yourextractor.py`: ```python # coding: utf-8 - from __future__ import unicode_literals - from .common import InfoExtractor - - + + class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { @@ -134,12 +191,12 @@ ### Adding support for a new site # * A regular expression; start the string with re: # * Any Python type (for example int or float) } - } + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - + # TODO more code goes here, for example ... title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title') @@ -148,45 +205,55 @@ ### Adding support for a new site 'title': title, 'description': self._og_search_description(webpage), 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - # TODO more properties (see youtube_dl/extractor/common.py) + # TODO more properties (see yt_dlp/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +1. Add an import in [`yt_dlp/extractor/extractors.py`](yt_dlp/extractor/extractors.py). +1. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. 
If you decide to add more than one test, the tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. You can also run all the tests in one go with `TestDownload.test_YourExtractor_all`. +1. Make sure you have at least one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. +1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L91-L426). Add tests and code for as many as you want. +1. Make sure your code follows [yt-dlp coding conventions](#yt-dlp-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): - $ flake8 youtube_dl/extractor/yourextractor.py + $ flake8 yt_dlp/extractor/yourextractor.py -9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython and PyPy for Python 3.6 and above. Backward compatibility is not required for even older versions of Python. +1. 
When the tests pass, [add](https://git-scm.com/docs/git-add) the new files, [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: - $ git add youtube_dl/extractor/extractors.py - $ git add youtube_dl/extractor/yourextractor.py - $ git commit -m '[yourextractor] Add new extractor' + $ git add yt_dlp/extractor/extractors.py + $ git add yt_dlp/extractor/yourextractor.py + $ git commit -m '[yourextractor] Add extractor' $ git push origin yourextractor -11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +1. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! -## youtube-dl coding conventions +**Tip:** To test extractors that require login information, create a file `test/local_parameters.json` and add `"usenetrc": true` or your username and password in it: +```json +{ + "username": "your user name", + "password": "your password" +} +``` + +## yt-dlp coding conventions This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code. -Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old youtube-dl versions working. 
Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with a fix incorporated, all the previous versions become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say, some non rolling release distros may never receive an update at all. +Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hosters out of your control and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize dependency on the source's layout and even to make the code foresee potential future changes and be ready for that. This is important because it will allow the extractor not to break on minor layout changes thus keeping old yt-dlp versions working. Even though this breakage issue may be easily fixed by a new version of yt-dlp, this could take some time, during which the extractor will remain broken. + ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L91-L426) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: - `id` (media identifier) - `title` (media title) - `url` (media download URL) or `formats` -In fact only the last option is technically mandatory (i.e. 
if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. +The aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. While, in fact, only `id` is technically mandatory, due to compatibility reasons, yt-dlp also treats `title` as mandatory. The extractor is allowed to return the info dict without url or formats in some special cases if it allows the user to extract useful information with `--ignore-no-formats-error` - e.g. when the video is a live stream that has not started yet. -[Any field](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](yt_dlp/extractor/common.py#L219-L426) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example @@ -200,8 +267,10 @@ #### Example ```python { - ... "summary": "some fancy summary text", + "user": { + "name": "uploader name" + }, ... 
} ``` @@ -220,6 +289,30 @@ #### Example The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some later time but with the former approach extraction will just go ahead with `description` set to `None` which is perfectly fine (remember `None` is equivalent to the absence of data). + +If the data is nested, do not use `.get` chains, but instead make use of the utility functions `try_get` or `traverse_obj`. + +Considering the above `meta` again, assume you want to extract `["user"]["name"]` and put it in the resulting info dict as `uploader`. + +```python +uploader = try_get(meta, lambda x: x['user']['name']) # correct +``` +or +```python +uploader = traverse_obj(meta, ('user', 'name')) # correct +``` + +and not like: + +```python +uploader = meta['user']['name'] # incorrect +``` +or +```python +uploader = meta.get('user', {}).get('name') # incorrect +``` + + Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance: ```python @@ -239,11 +332,36 @@ #### Example ``` On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present. - + + +Another thing to remember is not to try to iterate over `None`. + +Say you extracted a list of thumbnails into `thumbnail_data` using `try_get` and now want to iterate over them. + +```python +thumbnail_data = try_get(...) +thumbnails = [{ + 'url': item['url'] +} for item in thumbnail_data or []] # correct +``` + +and not like: + +```python +thumbnail_data = try_get(...) +thumbnails = [{ + 'url': item['url'] +} for item in thumbnail_data] # incorrect +``` + +In the latter case, `thumbnail_data` will be `None` if the field was not found and this will cause the loop `for item in thumbnail_data` to raise a fatal error. 
Using `for item in thumbnail_data or []` avoids this error and results in setting an empty list in `thumbnails` instead. + + ### Provide fallbacks When extracting metadata try to do so from multiple sources. For example if `title` is present in several places, try extracting from at least some of them. This makes it more future-proof in case some of the sources become unavailable. + #### Example Say `meta` from the previous example has a `title` and you are about to extract it. Since `title` is a mandatory meta field you should end up with something like: @@ -262,6 +380,7 @@ #### Example This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. + ### Regular expressions #### Don't capture groups you don't use @@ -283,11 +402,10 @@ ##### Example r'(id|ID)=(?P<id>\d+)' ``` - #### Make regular expressions relaxed and flexible When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. - + ##### Example Say you need to extract `title` from the following HTML code: @@ -299,14 +417,14 @@ ##### Example The code for that task should look similar to: ```python -title = self._search_regex( +title = self._search_regex( # correct r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title') ``` Or even better: ```python -title = self._search_regex( +title = self._search_regex( # correct r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)', webpage, 'title', group='title') ``` @@ -316,14 +434,25 @@ ##### Example The code definitely should not look like: ```python -title = self._search_regex( +title = self._search_regex( # incorrect r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>', webpage, 'title', group='title') ``` +or even + +```python +title = self._search_regex( # incorrect + r'<span style=".*?" 
class="title">(.*?)</span>', + webpage, 'title', group='title') +``` + +Here the presence or absence of other attributes including `style` is irrelevant for the data we need, and so the regex must not depend on it + + ### Long lines policy -There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. +There is a soft limit to keep lines of code under 100 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. Sometimes, it may be reasonable to go up to 120 characters and sometimes even 80 can be unreadable. Keep in mind that this is not a hard limit and is just one of many tools to make the code more readable For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: @@ -360,6 +489,7 @@ # ...some lines of code... title = self._html_search_regex(TITLE_RE, webpage, 'title') ``` + ### Collapse fallbacks Multiple fallback values can quickly become unwieldy. Collapse multiple fallback values into a single expression via a list of patterns. @@ -385,10 +515,13 @@ #### Example Methods supporting list of patterns are: `_search_regex`, `_html_search_regex`, `_og_search_property`, `_html_search_meta`. + ### Trailing parentheses Always move trailing parentheses after the last argument. +Note that this *does not* apply to braces `}` or square brackets `]` both of which should be closed in a new line + #### Example Correct: @@ -406,30 +539,36 @@ #### Example ) ``` + ### Use convenience conversion and parsing functions -Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. 
+Wrap all extracted numeric data into safe functions from [`yt_dlp/utils.py`](yt_dlp/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. Use `url_or_none` for safe URL processing. -Use `try_get` for safe metadata extraction from parsed JSON. +Use `try_get`, `dict_get` and `traverse_obj` for safe metadata extraction from parsed JSON. Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. -Explore [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. +Explore [`yt_dlp/utils.py`](yt_dlp/utils.py) for more useful convenience functions. #### More examples ##### Safely extract optional description from parsed JSON ```python -description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +description = traverse_obj(response, ('result', 'video', 'summary'), expected_type=str) ``` ##### Safely extract more optional metadata ```python -video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +video = traverse_obj(response, ('result', 'video', 0), default={}, expected_type=dict) description = video.get('summary') duration = float_or_none(video.get('durationMs'), scale=1000) view_count = int_or_none(video.get('views')) ``` + + + +# EMBEDDING YT-DLP +See [README.md#embedding-yt-dlp](README.md#embedding-yt-dlp) for instructions on how to embed yt-dlp in another Python program diff --git a/CONTRIBUTORS b/CONTRIBUTORS index fe28dfc784..35a0764a29 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -22,7 +22,7 @@ Zocker1999NET nao20010128nao kurumigi bbepis -animelover1984 +animelover1984/horahoradev Pccode66 RobinD42 hseg @@ -78,3 
+78,103 @@ pgaig PSlava stdedos u-spec-png +Sipherdrakon +kidonng +smege1001 +tandy1000 +IONECarter +capntrips +mrfade +ParadoxGBB +wlritchi +NeroBurner +mahanstreamer +alerikaisattera +Derkades +BunnyHelp +i6t +std-move +Chocobozzz +ouwou +korli +octotherp +CeruleanSky +zootedb0t +chao813 +ChillingPepper +ConquerorDopy +dalanmiller +DigitalDJ +f4pp3rk1ng +gesa +Jules-A +makeworld-the-better-one +MKSherbini +mrx23dot +poschi3 +raphaeldore +renalid +sleaux-meaux +sulyi +tmarki +Vangelis66 +AjaxGb +ajj8 +jakubadamw +jfogelman +timethrow +sarnoud +Bojidarist +18928172992817182/gustaf +nixklai +smplayer-dev +Zirro +CrypticSignal +flashdagger +fractalf +frafra +kaz-us +ozburo +rhendric +sdomi +selfisekai +stanoarn +0xA7404A/Aurora +4a1e2y5 +aarubui +chio0hai +cntrl-s +Deer-Spangle +DEvmIb +Grabien +j54vc1bk +mpeter50 +mrpapersonic +pabs3 +staubichsauger +xenova +Yakabuff +zulaport +ehoogeveen-medweb +PilzAdam +zmousm +iw0nderhow +unit193 +TwoThousandHedgehogs +Jertzukka +cypheron +Hyeeji +bwildenhain +C0D3D3V +kebianizao +Lapin0t +abdullah-if +DavidSkrundz +mkubecek +raleeper +YuenSzeHong +Sematre +jaller94 +r5d +julien-hadleyjack +git-anony-mouse diff --git a/Changelog.md b/Changelog.md index e818aaddcb..5c30457023 100644 --- a/Changelog.md +++ b/Changelog.md @@ -5,20 +5,694 @@ # Instuctions for creating release * Run `make doc` * Update Changelog.md and CONTRIBUTORS -* Change "Merged with ytdl" version in Readme.md if needed -* Add new/fixed extractors in "new features" section of Readme.md -* Commit to master as `Release <version>` -* Push to origin/release using `git push origin master:release` - build task will now run -* Update version.py using `devscripts\update-version.py` -* Run `make issuetemplates` -* Commit to master as `[version] update :ci skip all` -* Push to origin/master -* Update changelog in /releases - +* Change "Based on ytdl" version in Readme.md if needed +* Commit as `Release <version>` and push to master +* Dispatch the workflow 
https://github.com/yt-dlp/yt-dlp/actions/workflows/build.yml on master --> +### 2021.12.27 + +* Avoid recursion error when re-extracting info +* [ffmpeg] Fix position of `--ppa` +* [aria2c] Don't show progress when `--no-progress` +* [cookies] Support other keyrings by [mbway](https://github.com/mbway) +* [EmbedThumbnail] Prefer AtomicParsley over ffmpeg if available +* [generic] Fix HTTP KVS Player by [git-anony-mouse](https://github.com/git-anony-mouse) +* [ThumbnailsConvertor] Fix for when there are no thumbnails +* [docs] Add examples for using `TYPES:` in `-P`/`-o` +* [PixivSketch] Add extractors by [nao20010128nao](https://github.com/nao20010128nao) +* [tiktok] Add music, sticker and tag IEs by [MinePlayersPE](https://github.com/MinePlayersPE) +* [BiliIntl] Fix extractor by [MinePlayersPE](https://github.com/MinePlayersPE) +* [CBC] Fix URL regex +* [tiktok] Fix `extractor_key` used in archive +* [youtube] **End `live-from-start` properly when stream ends with 403** +* [Zee5] Fix VALID_URL for tv-shows by [Ashish0804](https://github.com/Ashish0804) + +### 2021.12.25 + +* [dash,youtube] **Download live from start to end** by [nao20010128nao](https://github.com/nao20010128nao), [pukkandan](https://github.com/pukkandan) + * Add option `--live-from-start` to enable downloading live videos from start + * Add key `is_from_start` in formats to identify formats (of live videos) that downloads from start + * [dash] Create protocol `http_dash_segments_generator` that allows a function to be passed instead of fragments + * [fragment] Allow multiple live dash formats to download simultaneously + * [youtube] Implement fragment re-fetching for the live dash formats + * [youtube] Re-extract dash manifest every 5 hours (manifest expires in 6hrs) + * [postprocessor/ffmpeg] Add `FFmpegFixupDuplicateMoovPP` to fixup duplicated moov atoms + * Known issues: + * Ctrl+C doesn't work on Windows when downloading multiple formats + * If video becomes private, download hangs +* 
[SponsorBlock] Add `Filler` and `Highlight` categories by [nihil-admirari](https://github.com/nihil-admirari), [pukkandan](https://github.com/pukkandan) + * Change `--sponsorblock-cut all` to `--sponsorblock-cut default` if you do not want filler sections to be removed +* Add field `webpage_url_domain` +* Add interactive format selection with `-f -` +* Add option `--file-access-retries` by [ehoogeveen-medweb](https://github.com/ehoogeveen-medweb) +* [outtmpl] Add alternate forms `S`, `D` and improve `id` detection +* [outtmpl] Add operator `&` for replacement text by [PilzAdam](https://github.com/PilzAdam) +* [EmbedSubtitle] Disable duration check temporarily +* [extractor] Add `_search_nuxt_data` by [nao20010128nao](https://github.com/nao20010128nao) +* [extractor] Ignore errors in comment extraction when `-i` is given +* [extractor] Standardize `_live_title` +* [FormatSort] Prevent incorrect deprecation warning +* [generic] Extract m3u8 formats from JSON-LD +* [postprocessor/ffmpeg] Always add `faststart` +* [utils] Fix parsing `YYYYMMDD` dates in Nov/Dec by [wlritchi](https://github.com/wlritchi) +* [utils] Improve `parse_count` +* [utils] Update `std_headers` by [kikuyan](https://github.com/kikuyan), [fstirlitz](https://github.com/fstirlitz) +* [lazy_extractors] Fix for search IEs +* [extractor] Support default implicit graph in JSON-LD by [zmousm](https://github.com/zmousm) +* Allow `--no-write-thumbnail` to override `--write-all-thumbnail` +* Fix `--throttled-rate` +* Fix control characters being printed to `--console-title` +* Fix PostProcessor hooks not registered for some PPs +* Pre-process when using `--flat-playlist` +* Remove known invalid thumbnails from `info_dict` +* Add warning when using `-f best` +* Use `parse_duration` for `--wait-for-video` and some minor fix +* [test/download] Add more fields +* [test/download] Ignore field `webpage_url_domain` by [std-move](https://github.com/std-move) +* [compat] Suppress errors in enabling VT mode +* [docs] 
Improve manpage format by [iw0nderhow](https://github.com/iw0nderhow), [pukkandan](https://github.com/pukkandan) +* [docs,cleanup] Minor fixes and cleanup +* [cleanup] Fix some typos by [unit193](https://github.com/unit193) +* [ABC:iview] Add show extractor by [pabs3](https://github.com/pabs3) +* [dropout] Add extractor by [TwoThousandHedgehogs](https://github.com/TwoThousandHedgehogs), [pukkandan](https://github.com/pukkandan) +* [GameJolt] Add extractors by [MinePlayersPE](https://github.com/MinePlayersPE) +* [gofile] Add extractor by [Jertzukka](https://github.com/Jertzukka), [Ashish0804](https://github.com/Ashish0804) +* [hse] Add extractors by [cypheron](https://github.com/cypheron), [pukkandan](https://github.com/pukkandan) +* [NateTV] Add NateIE and NateProgramIE by [Ashish0804](https://github.com/Ashish0804), [Hyeeji](https://github.com/Hyeeji) +* [OpenCast] Add extractors by [bwildenhain](https://github.com/bwildenhain), [C0D3D3V](https://github.com/C0D3D3V) +* [rtve] Add `RTVEAudioIE` by [kebianizao](https://github.com/kebianizao) +* [Rutube] Add RutubeChannelIE by [Ashish0804](https://github.com/Ashish0804) +* [skeb] Add extractor by [nao20010128nao](https://github.com/nao20010128nao) +* [soundcloud] Add related tracks extractor by [Lapin0t](https://github.com/Lapin0t) +* [toggo] Add extractor by [nyuszika7h](https://github.com/nyuszika7h) +* [TrueID] Add extractor by [MinePlayersPE](https://github.com/MinePlayersPE) +* [audiomack] Update album and song VALID_URL by [abdullah-if](https://github.com/abdullah-if), [dirkf](https://github.com/dirkf) +* [CBC Gem] Extract 1080p formats by [DavidSkrundz](https://github.com/DavidSkrundz) +* [ceskatelevize] Fetch iframe from nextJS data by [mkubecek](https://github.com/mkubecek) +* [crackle] Look for non-DRM formats by [raleeper](https://github.com/raleeper) +* [dplay] Temporary fix for `discoveryplus.com/it` +* [DiscoveryPlusShowBaseIE] yield actual video id by [Ashish0804](https://github.com/Ashish0804) +* 
[Facebook] Handle redirect URLs +* [fujitv] Extract 1080p from `tv_android` m3u8 by [YuenSzeHong](https://github.com/YuenSzeHong) +* [gronkh] Support new URL pattern by [Sematre](https://github.com/Sematre) +* [instagram] Expand valid URL by [u-spec-png](https://github.com/u-spec-png) +* [Instagram] Try bypassing login wall with embed page by [MinePlayersPE](https://github.com/MinePlayersPE) +* [Jamendo] Fix use of `_VALID_URL_RE` by [jaller94](https://github.com/jaller94) +* [LBRY] Support livestreams by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan) +* [NJPWWorld] Extract formats from m3u8 by [aarubui](https://github.com/aarubui) +* [NovaEmbed] update player regex by [std-move](https://github.com/std-move) +* [npr] Make SMIL extraction non-fatal by [r5d](https://github.com/r5d) +* [ntvcojp] Extract NUXT data by [nao20010128nao](https://github.com/nao20010128nao) +* [ok.ru] add mobile fallback by [nao20010128nao](https://github.com/nao20010128nao) +* [olympics] Add uploader and cleanup by [u-spec-png](https://github.com/u-spec-png) +* [ondemandkorea] Update `jw_config` regex by [julien-hadleyjack](https://github.com/julien-hadleyjack) +* [PlutoTV] Expand `_VALID_URL` +* [RaiNews] Fix extractor by [nixxo](https://github.com/nixxo) +* [RCTIPlusSeries] Lazy extraction and video type selection by [MinePlayersPE](https://github.com/MinePlayersPE) +* [redtube] Handle formats delivered inside a JSON by [dirkf](https://github.com/dirkf), [nixxo](https://github.com/nixxo) +* [SonyLiv] Add OTP login support by [Ashish0804](https://github.com/Ashish0804) +* [Steam] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [TikTok] Pass cookies to mobile API by [MinePlayersPE](https://github.com/MinePlayersPE) +* [trovo] Fix inheritance of `TrovoChannelBaseIE` +* [TVer] Extract better thumbnails by [YuenSzeHong](https://github.com/YuenSzeHong) +* [vimeo] Extract chapters +* [web.archive:youtube] Improve metadata extraction by 
[coletdjnz](https://github.com/coletdjnz) +* [youtube:comments] Add more options for limiting number of comments extracted by [coletdjnz](https://github.com/coletdjnz) +* [youtube:tab] Extract more metadata from feeds/channels/playlists by [coletdjnz](https://github.com/coletdjnz) +* [youtube:tab] Extract video thumbnails from playlist by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) +* [youtube:tab] Ignore query when redirecting channel to playlist and cleanup of related code +* [youtube] Fix `ytsearchdate` +* [zdf] Support videos with different ptmd location by [iw0nderhow](https://github.com/iw0nderhow) +* [zee5] Support /episodes in URL + + +### 2021.12.01 + +* **Add option `--wait-for-video` to wait for scheduled streams** +* Add option `--break-per-input` to apply --break-on... to each input URL +* Add option `--embed-info-json` to embed info.json in mkv +* Add compat-option `embed-metadata` +* Allow using a custom format selector through API +* [AES] Add ECB mode by [nao20010128nao](https://github.com/nao20010128nao) +* [build] Fix MacOS Build +* [build] Save Git HEAD at release alongside version info +* [build] Use `workflow_dispatch` for release +* [downloader/ffmpeg] Fix for direct videos inside mpd manifests +* [downloader] Add colors to download progress +* [EmbedSubtitles] Slightly relax duration check and related cleanup +* [ExtractAudio] Fix conversion to `wav` and `vorbis` +* [ExtractAudio] Support `alac` +* [extractor] Extract `average_rating` from JSON-LD +* [FixupM3u8] Fixup MPEG-TS in MP4 container +* [generic] Support mpd manifests without extension by [shirt](https://github.com/shirt-dev) +* [hls] Better FairPlay DRM detection by [nyuszika7h](https://github.com/nyuszika7h) +* [jsinterp] Fix splice to handle float (for youtube js player f1ca6900) +* [utils] Allow alignment in `render_table` and add tests +* [utils] Fix `PagedList` +* [utils] Fix error when copying `LazyList` +* Clarify video/audio-only 
formats in -F +* Ensure directory exists when checking formats +* Ensure path for link files exists by [Zirro](https://github.com/Zirro) +* Ensure same config file is not loaded multiple times +* Fix `postprocessor_hooks` +* Fix `--break-on-archive` when pre-checking +* Fix `--check-formats` for `mhtml` +* Fix `--load-info-json` of playlists with failed entries +* Fix `--trim-filename` when filename has `.` +* Fix bug in parsing `--add-header` +* Fix error in `report_unplayable_conflict` by [shirt](https://github.com/shirt-dev) +* Fix writing playlist infojson with `--no-clean-infojson` +* Validate --geo-bypass-country +* [blogger] Add extractor by [pabs3](https://github.com/pabs3) +* [breitbart] Add extractor by [Grabien](https://github.com/Grabien) +* [CableAV] Add extractor by [j54vc1bk](https://github.com/j54vc1bk) +* [CanalAlpha] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [CozyTV] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [CPTwentyFour] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [DiscoveryPlus] Add `DiscoveryPlusItalyShowIE` by [Ashish0804](https://github.com/Ashish0804) +* [ESPNCricInfo] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [LinkedIn] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [mixch] Add extractor by [nao20010128nao](https://github.com/nao20010128nao) +* [nebula] Add `NebulaCollectionIE` and rewrite extractor by [hheimbuerger](https://github.com/hheimbuerger) +* [OneFootball] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [peer.tv] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [radiozet] Add extractor by [0xA7404A](https://github.com/0xA7404A) (Aurora) +* [redgifs] Add extractor by [chio0hai](https://github.com/chio0hai) +* [RedGifs] Add Search and User extractors by [Deer-Spangle](https://github.com/Deer-Spangle) +* [rtrfm] Add extractor by [pabs3](https://github.com/pabs3) +* [Streamff] Add extractor by 
[cntrl-s](https://github.com/cntrl-s) +* [Stripchat] Add extractor by [zulaport](https://github.com/zulaport) +* [Aljazeera] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [AmazonStoreIE] Fix regex to not match vdp urls by [Ashish0804](https://github.com/Ashish0804) +* [ARDBetaMediathek] Handle new URLs +* [bbc] Get all available formats by [nyuszika7h](https://github.com/nyuszika7h) +* [Bilibili] Fix title extraction by [u-spec-png](https://github.com/u-spec-png) +* [CBC Gem] Fix for shows that don't have all seasons by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +* [curiositystream] Add more metadata +* [CuriosityStream] Fix series +* [DiscoveryPlus] Rewrite extractors by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan) +* [HotStar] Set language field from tags by [Ashish0804](https://github.com/Ashish0804) +* [instagram, cleanup] Refactor extractors +* [Instagram] Display more login errors by [MinePlayersPE](https://github.com/MinePlayersPE) +* [itv] Fix extractor by [staubichsauger](https://github.com/staubichsauger), [pukkandan](https://github.com/pukkandan) +* [mediaklikk] Expand valid URL +* [MTV] Improve mgid extraction by [Sipherdrakon](https://github.com/Sipherdrakon), [kikuyan](https://github.com/kikuyan) +* [nexx] Better error message for unsupported format +* [NovaEmbed] Fix extractor by [pukkandan](https://github.com/pukkandan), [std-move](https://github.com/std-move) +* [PatreonUser] Do not capture RSS URLs +* [Reddit] Add support for 1080p videos by [xenova](https://github.com/xenova) +* [RoosterTeethSeries] Fix for multiple pages by [MinePlayersPE](https://github.com/MinePlayersPE) +* [sbs] Fix for movies and livestreams +* [Senate.gov] Add SenateGovIE and fix SenateISVPIE by [Grabien](https://github.com/Grabien), [pukkandan](https://github.com/pukkandan) +* [soundcloud:search] Fix pagination +* [tiktok:user] Set `webpage_url` correctly +* [Tokentube] Fix description 
by [u-spec-png](https://github.com/u-spec-png) +* [trovo] Fix extractor by [nyuszika7h](https://github.com/nyuszika7h) +* [tv2] Expand valid URL +* [Tvplayhome] Fix extractor by [pukkandan](https://github.com/pukkandan), [18928172992817182](https://github.com/18928172992817182) +* [Twitch:vod] Add chapters by [mpeter50](https://github.com/mpeter50) +* [twitch:vod] Extract live status by [DEvmIb](https://github.com/DEvmIb) +* [VidLii] Add 720p support by [mrpapersonic](https://github.com/mrpapersonic) +* [vimeo] Add fallback for config URL +* [vimeo] Sort http formats higher +* [WDR] Expand valid URL +* [willow] Add extractor by [aarubui](https://github.com/aarubui) +* [xvideos] Detect embed URLs by [4a1e2y5](https://github.com/4a1e2y5) +* [xvideos] Fix extractor by [Yakabuff](https://github.com/Yakabuff) +* [youtube, cleanup] Reorganize Tab and Search extractor inheritances +* [youtube:search_url] Add playlist/channel support +* [youtube] Add `default` player client by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Add storyboard formats +* [youtube] Decrypt n-sig for URLs with `ratebypass` +* [youtube] Minor improvement to format sorting +* [cleanup] Add deprecation warnings +* [cleanup] Refactor `JSInterpreter._seperate` +* [Cleanup] Remove some unnecessary groups in regexes by [Ashish0804](https://github.com/Ashish0804) +* [cleanup] Misc cleanup + + +### 2021.11.10.1 + +* Temporarily disable MacOS Build + +### 2021.11.10 + +* [youtube] **Fix throttling by decrypting n-sig** +* Merging extractors from [haruhi-dl](https://git.sakamoto.pl/laudom/haruhi-dl) by [selfisekai](https://github.com/selfisekai) + * [extractor] Add `_search_nextjs_data` + * [tvp] Fix extractors + * [tvp] Add TVPStreamIE + * [wppilot] Add extractors + * [polskieradio] Add extractors + * [radiokapital] Add extractors + * [polsatgo] Add extractor by [selfisekai](https://github.com/selfisekai), [sdomi](https://github.com/sdomi) +* Separate `--check-all-formats` from `--check-formats` +* 
Approximate filesize from bitrate +* Don't create console in `windows_enable_vt_mode` +* Fix bug in `--load-info-json` of playlists +* [minicurses] Add colors to `-F` and standardize color-printing code +* [outtmpl] Add type `link` for internet shortcut files +* [outtmpl] Add alternate forms for `q` and `j` +* [outtmpl] Do not traverse `None` +* [fragment] Fix progress display in fragmented downloads +* [downloader/ffmpeg] Fix vtt download with ffmpeg +* [ffmpeg] Detect presence of setts and libavformat version +* [ExtractAudio] Rescale `--audio-quality` correctly by [CrypticSignal](https://github.com/CrypticSignal), [pukkandan](https://github.com/pukkandan) +* [ExtractAudio] Use `libfdk_aac` if available by [CrypticSignal](https://github.com/CrypticSignal) +* [FormatSort] `eac3` is better than `ac3` +* [FormatSort] Fix some fields' defaults +* [generic] Detect more json_ld +* [generic] parse jwplayer with only the json URL +* [extractor] Add keyword automatically to SearchIE descriptions +* [extractor] Fix some errors being converted to `ExtractorError` +* [utils] Add `join_nonempty` +* [utils] Add `jwt_decode_hs256` by [Ashish0804](https://github.com/Ashish0804) +* [utils] Create `DownloadCancelled` exception +* [utils] Parse `vp09` as vp9 +* [utils] Sanitize URL when determining protocol +* [test/download] Fallback test to `bv` +* [docs] Minor documentation improvements +* [cleanup] Improvements to error and debug messages +* [cleanup] Minor fixes and cleanup +* [3speak] Add extractors by [Ashish0804](https://github.com/Ashish0804) +* [AmazonStore] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [Gab] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [mediaset] Add playlist support by [nixxo](https://github.com/nixxo) +* [MLSSoccer] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [N1] Add support for nova.rs by [u-spec-png](https://github.com/u-spec-png) +* [PlanetMarathi] Add extractor by 
[Ashish0804](https://github.com/Ashish0804) +* [RaiplayRadio] Add extractors by [frafra](https://github.com/frafra) +* [roosterteeth] Add series extractor +* [sky] Add `SkyNewsStoryIE` by [ajj8](https://github.com/ajj8) +* [youtube] Fix sorting for some videos +* [youtube] Populate `thumbnail` with the best "known" thumbnail +* [youtube] Refactor itag processing +* [youtube] Remove unnecessary no-playlist warning +* [youtube:tab] Add Invidious list for playlists/channels by [rhendric](https://github.com/rhendric) +* [Bilibili:comments] Fix infinite loop by [u-spec-png](https://github.com/u-spec-png) +* [ceskatelevize] Fix extractor by [flashdagger](https://github.com/flashdagger) +* [Coub] Fix media format identification by [wlritchi](https://github.com/wlritchi) +* [crunchyroll] Add extractor-args `language` and `hardsub` +* [DiscoveryPlus] Allow language codes in URL +* [imdb] Fix thumbnail by [ozburo](https://github.com/ozburo) +* [instagram] Add IOS URL support by [u-spec-png](https://github.com/u-spec-png) +* [instagram] Improve login code by [u-spec-png](https://github.com/u-spec-png) +* [Instagram] Improve metadata extraction by [u-spec-png](https://github.com/u-spec-png) +* [iPrima] Fix extractor by [stanoarn](https://github.com/stanoarn) +* [itv] Add support for ITV News by [ajj8](https://github.com/ajj8) +* [la7] Fix extractor by [nixxo](https://github.com/nixxo) +* [linkedin] Don't login multiple times +* [mtv] Fix some videos by [Sipherdrakon](https://github.com/Sipherdrakon) +* [Newgrounds] Fix description by [u-spec-png](https://github.com/u-spec-png) +* [Nrk] Minor fixes by [fractalf](https://github.com/fractalf) +* [Olympics] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [piksel] Fix sorting +* [twitter] Do not sort by codec +* [viewlift] Add cookie-based login and series support by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan) +* [vimeo] Detect source extension and misc cleanup by 
[flashdagger](https://github.com/flashdagger) +* [vimeo] Fix ondemand videos and direct URLs with hash +* [vk] Fix login and add subtitles by [kaz-us](https://github.com/kaz-us) +* [VLive] Add upload_date and thumbnail by [Ashish0804](https://github.com/Ashish0804) +* [VRT] Fix login by [pgaig](https://github.com/pgaig) +* [Vupload] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [wakanim] Add support for MPD manifests by [nyuszika7h](https://github.com/nyuszika7h) +* [wakanim] Detect geo-restriction by [nyuszika7h](https://github.com/nyuszika7h) +* [ZenYandex] Fix extractor by [u-spec-png](https://github.com/u-spec-png) + + +### 2021.10.22 + +* [build] Improvements + * Build standalone MacOS packages by [smplayer-dev](https://github.com/smplayer-dev) + * Release windows exe built with `py2exe` + * Enable lazy-extractors in releases. + * Set env var `YTDLP_NO_LAZY_EXTRACTORS` to forcefully disable this (experimental) + * Clean up error reporting in update + * Refactor `pyinst.py`, misc cleanup and improve docs +* [docs] Migrate issues to use forms by [Ashish0804](https://github.com/Ashish0804) +* [downloader] **Fix slow progress hooks** + * This was causing HLS/DASH downloads to be extremely slow in some situations +* [downloader/ffmpeg] Improve simultaneous download and merge +* [EmbedMetadata] Allow overwriting all default metadata with `meta_default` key +* [ModifyChapters] Add ability for `--remove-chapters` to remove sections by timestamp +* [utils] Allow duration strings in `--match-filter` +* Add HDR information to formats +* Add negative option `--no-batch-file` by [Zirro](https://github.com/Zirro) +* Calculate more fields for merged formats +* Do not verify thumbnail URLs unless `--check-formats` is specified +* Don't create console for subprocesses on Windows +* Fix `--restrict-filename` when used with default template +* Fix `check_formats` output being written to stdout when `-qv` +* Fix bug in storyboards +* Fix conflict b/w id and ext 
in format selection +* Fix verbose head not showing custom configs +* Load archive only after printing verbose head +* Make `duration_string` and `resolution` available in --match-filter +* Re-implement deprecated option `--id` +* Reduce default `--socket-timeout` +* Write verbose header to logger +* [outtmpl] Fix bug in expanding environment variables +* [cookies] Local State should be opened as utf-8 +* [extractor,utils] Detect more codecs/mimetypes +* [extractor] Detect `EXT-X-KEY` Apple FairPlay +* [utils] Use `importlib` to load plugins by [sulyi](https://github.com/sulyi) +* [http] Retry on socket timeout and show the last encountered error +* [fragment] Print error message when skipping fragment +* [aria2c] Fix `--skip-unavailable-fragment` +* [SponsorBlock] Obey `extractor-retries` and `sleep-requests` +* [Merger] Do not add `aac_adtstoasc` to non-hls audio +* [ModifyChapters] Do not mutate original chapters by [nihil-admirari](https://github.com/nihil-admirari) +* [devscripts/run_tests] Use markers to filter tests by [sulyi](https://github.com/sulyi) +* [7plus] Add cookie based authentication by [nyuszika7h](https://github.com/nyuszika7h) +* [AdobePass] Fix RCN MSO by [jfogelman](https://github.com/jfogelman) +* [CBC] Fix Gem livestream by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +* [CBC] Support CBC Gem member content by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +* [crunchyroll] Add season to flat-playlist +* [crunchyroll] Add support for `beta.crunchyroll` URLs and fix series URLs with language code +* [EUScreen] Add Extractor by [Ashish0804](https://github.com/Ashish0804) +* [Gronkh] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [hidive] Fix typo +* [Hotstar] Mention Dynamic Range in `format_id` by [Ashish0804](https://github.com/Ashish0804) +* [Hotstar] Raise appropriate error for DRM +* [instagram] Add login by [u-spec-png](https://github.com/u-spec-png) +* [instagram] Show 
appropriate error when login is needed +* [microsoftstream] Add extractor by [damianoamatruda](https://github.com/damianoamatruda), [nixklai](https://github.com/nixklai) +* [on24] Add extractor by [damianoamatruda](https://github.com/damianoamatruda) +* [patreon] Fix vimeo player regex by [zenerdi0de](https://github.com/zenerdi0de) +* [SkyNewsAU] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [tagesschau] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [tbs] Add tbs live streams by [llacb47](https://github.com/llacb47) +* [tiktok] Fix typo and update tests +* [trovo] Support channel clips and VODs by [Ashish0804](https://github.com/Ashish0804) +* [Viafree] Add support for Finland by [18928172992817182](https://github.com/18928172992817182) +* [vimeo] Fix embedded `player.vimeo` +* [vlive:channel] Fix extraction by [kikuyan](https://github.com/kikuyan), [pukkandan](https://github.com/pukkandan) +* [youtube] Add auto-translated subtitles +* [youtube] Expose different formats with same itag +* [youtube:comments] Fix for new layout by [coletdjnz](https://github.com/coletdjnz) +* [cleanup] Cleanup bilibili code by [pukkandan](https://github.com/pukkandan), [u-spec-png](https://github.com/u-spec-png) +* [cleanup] Remove broken youtube login code +* [cleanup] Standardize timestamp formatting code +* [cleanup] Generalize `getcomments` implementation for extractors +* [cleanup] Simplify search extractors code +* [cleanup] misc + + +### 2021.10.10 + +* [downloader/ffmpeg] Fix bug in initializing `FFmpegPostProcessor` +* [minicurses] Fix when printing to file +* [downloader] Fix throttledratelimit +* [francetv] Fix extractor by [fstirlitz](https://github.com/fstirlitz), [sarnoud](https://github.com/sarnoud) +* [NovaPlay] Add extractor by [Bojidarist](https://github.com/Bojidarist) +* [ffmpeg] Revert "Set max probesize" - No longer needed +* [docs] Remove incorrect dependency on VC++10 +* [build] Allow to release without changelog + +### 
2021.10.09 + +* Improved progress reporting + * Separate `--console-title` and `--no-progress` + * Add option `--progress` to show progress-bar even in quiet mode + * Fix and refactor `minicurses` and use it for all progress reporting + * Standardize use of terminal sequences and enable color support for windows 10 + * Add option `--progress-template` to customize progress-bar and console-title + * Add postprocessor hooks and progress reporting +* [postprocessor] Add plugin support with option `--use-postprocessor` +* [extractor] Extract storyboards from SMIL manifests by [fstirlitz](https://github.com/fstirlitz) +* [outtmpl] Alternate form of format type `l` for `\n` delimited list +* [outtmpl] Format type `U` for unicode normalization +* [outtmpl] Allow empty output template to skip a type of file +* Merge webm formats into mkv if thumbnails are to be embedded +* [adobepass] Add RCN as MSO by [jfogelman](https://github.com/jfogelman) +* [ciscowebex] Add extractor by [damianoamatruda](https://github.com/damianoamatruda) +* [Gettr] Add extractor by [i6t](https://github.com/i6t) +* [GoPro] Add extractor by [i6t](https://github.com/i6t) +* [N1] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [Theta] Add video extractor by [alerikaisattera](https://github.com/alerikaisattera) +* [Veo] Add extractor by [i6t](https://github.com/i6t) +* [Vupload] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [bbc] Extract better quality videos by [ajj8](https://github.com/ajj8) +* [Bilibili] Add subtitle converter by [u-spec-png](https://github.com/u-spec-png) +* [CBC] Cleanup tests by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +* [Douyin] Rewrite extractor by [MinePlayersPE](https://github.com/MinePlayersPE) +* [Funimation] Fix for /v/ urls by [pukkandan](https://github.com/pukkandan), [Jules-A](https://github.com/Jules-A) +* [Funimation] Sort formats according to the relevant extractor-args +* [Hidive] Fix duplicate and 
incorrect formats +* [HotStarSeries] Fix cookies by [Ashish0804](https://github.com/Ashish0804) +* [LinkedInLearning] Add subtitles by [Ashish0804](https://github.com/Ashish0804) +* [Mediaite] Relax valid url by [coletdjnz](https://github.com/coletdjnz) +* [Newgrounds] Add age_limit and fix duration by [u-spec-png](https://github.com/u-spec-png) +* [Newgrounds] Fix view count on songs by [u-spec-png](https://github.com/u-spec-png) +* [parliamentlive.tv] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [PolskieRadio] Fix extractors by [jakubadamw](https://github.com/jakubadamw), [u-spec-png](https://github.com/u-spec-png) +* [reddit] Add embedded url by [u-spec-png](https://github.com/u-spec-png) +* [reddit] Fix 429 by generating a random `reddit_session` by [AjaxGb](https://github.com/AjaxGb) +* [Rumble] Add RumbleChannelIE by [Ashish0804](https://github.com/Ashish0804) +* [soundcloud:playlist] Detect last page correctly +* [SovietsCloset] Add duration from m3u8 by [ChillingPepper](https://github.com/ChillingPepper) +* [Streamable] Add codecs by [u-spec-png](https://github.com/u-spec-png) +* [vidme] Remove extractor by [alerikaisattera](https://github.com/alerikaisattera) +* [youtube:tab] Fallback to API when webpage fails to download by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Fix non-fatal errors in fetching player +* Fix `--flat-playlist` when neither IE nor id is known +* Fix `-f mp4` behaving differently from youtube-dl +* Workaround for bug in `ssl.SSLContext.load_default_certs` +* [aes] Improve performance slightly by [sulyi](https://github.com/sulyi) +* [cookies] Fix keyring fallback by [mbway](https://github.com/mbway) +* [embedsubtitle] Fix error when duration is unknown +* [ffmpeg] Fix error when subtitle file is missing +* [ffmpeg] Set max probesize to workaround AAC HLS stream issues by [shirt](https://github.com/shirt-dev) +* [FixupM3u8] Remove redundant run if merged is needed +* [hls] Fix decryption issues by 
[shirt](https://github.com/shirt-dev), [pukkandan](https://github.com/pukkandan) +* [http] Respect user-provided chunk size over extractor's +* [utils] Let traverse_obj accept functions as keys +* [docs] Add note about our custom ffmpeg builds +* [docs] Write embedding and contributing documentation by [pukkandan](https://github.com/pukkandan), [timethrow](https://github.com/timethrow) +* [update] Check for new version even if not updateable +* [build] Add more files to the tarball +* [build] Allow building with py2exe (and misc fixes) +* [build] Use pycryptodomex by [shirt](https://github.com/shirt-dev), [pukkandan](https://github.com/pukkandan) +* [cleanup] Some minor refactoring, improve docs and misc cleanup + + +### 2021.09.25 + +* Add new option `--netrc-location` +* [outtmpl] Allow alternate fields using `,` +* [outtmpl] Add format type `B` to treat the value as bytes (eg: to limit the filename to a certain number of bytes) +* Separate the options `--ignore-errors` and `--no-abort-on-error` +* Basic framework for simultaneous download of multiple formats by [nao20010128nao](https://github.com/nao20010128nao) +* [17live] Add 17.live extractor by [nao20010128nao](https://github.com/nao20010128nao) +* [bilibili] Add BiliIntlIE and BiliIntlSeriesIE by [Ashish0804](https://github.com/Ashish0804) +* [CAM4] Add extractor by [alerikaisattera](https://github.com/alerikaisattera) +* [Chingari] Add extractors by [Ashish0804](https://github.com/Ashish0804) +* [CGTN] Add extractor by [chao813](https://github.com/chao813) +* [damtomo] Add extractor by [nao20010128nao](https://github.com/nao20010128nao) +* [gotostage] Add extractor by [poschi3](https://github.com/poschi3) +* [Koo] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [Mediaite] Add Extractor by [Ashish0804](https://github.com/Ashish0804) +* [Mediaklikk] Add Extractor by [tmarki](https://github.com/tmarki), [mrx23dot](https://github.com/mrx23dot), [coletdjnz](https://github.com/coletdjnz) +* 
[MuseScore] Add Extractor by [Ashish0804](https://github.com/Ashish0804) +* [Newgrounds] Add NewgroundsUserIE and improve extractor by [u-spec-png](https://github.com/u-spec-png) +* [nzherald] Add NZHeraldIE by [coletdjnz](https://github.com/coletdjnz) +* [Olympics] Add replay extractor by [Ashish0804](https://github.com/Ashish0804) +* [Peertube] Add channel and playlist extractors by [u-spec-png](https://github.com/u-spec-png) +* [radlive] Add extractor by [nyuszika7h](https://github.com/nyuszika7h) +* [SovietsCloset] Add extractor by [ChillingPepper](https://github.com/ChillingPepper) +* [Streamanity] Add Extractor by [alerikaisattera](https://github.com/alerikaisattera) +* [Theta] Add extractor by [alerikaisattera](https://github.com/alerikaisattera) +* [Yandex] Add ZenYandexIE and ZenYandexChannelIE by [Ashish0804](https://github.com/Ashish0804) +* [9Now] handle episodes of series by [dalanmiller](https://github.com/dalanmiller) +* [AnimalPlanet] Fix extractor by [Sipherdrakon](https://github.com/Sipherdrakon) +* [Arte] Improve description extraction by [renalid](https://github.com/renalid) +* [atv.at] Use jwt for API by [NeroBurner](https://github.com/NeroBurner) +* [brightcove] Extract subtitles from manifests +* [CBC] Fix CBC Gem extractors by [makeworld-the-better-one](https://github.com/makeworld-the-better-one) +* [cbs] Report appropriate error for DRM +* [comedycentral] Support `collection-playlist` by [nixxo](https://github.com/nixxo) +* [DIYNetwork] Support new format by [Sipherdrakon](https://github.com/Sipherdrakon) +* [downloader/niconico] Pass custom headers by [nao20010128nao](https://github.com/nao20010128nao) +* [dw] Fix extractor +* [Fancode] Fix live streams by [zenerdi0de](https://github.com/zenerdi0de) +* [funimation] Fix for locations outside US by [Jules-A](https://github.com/Jules-A), [pukkandan](https://github.com/pukkandan) +* [globo] Fix GloboIE by [Ashish0804](https://github.com/Ashish0804) +* [HiDive] Fix extractor by 
[Ashish0804](https://github.com/Ashish0804) +* [Hotstar] Add referer for subs by [Ashish0804](https://github.com/Ashish0804) +* [itv] Fix extractor, add subtitles and thumbnails by [coletdjnz](https://github.com/coletdjnz), [sleaux-meaux](https://github.com/sleaux-meaux), [Vangelis66](https://github.com/Vangelis66) +* [lbry] Show error message from API response +* [Mxplayer] Use mobile API by [Ashish0804](https://github.com/Ashish0804) +* [NDR] Rewrite NDRIE by [Ashish0804](https://github.com/Ashish0804) +* [Nuvid] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [Oreilly] Handle new web url by [MKSherbini](https://github.com/MKSherbini) +* [pbs] Fix subtitle extraction by [coletdjnz](https://github.com/coletdjnz), [gesa](https://github.com/gesa), [raphaeldore](https://github.com/raphaeldore) +* [peertube] Update instances by [u-spec-png](https://github.com/u-spec-png) +* [plutotv] Fix extractor for URLs with `/en` +* [reddit] Workaround for 429 by redirecting to old.reddit.com +* [redtube] Fix exts +* [soundcloud] Make playlist extraction lazy +* [soundcloud] Retry playlist pages on `502` error and update `_CLIENT_ID` +* [southpark] Fix SouthParkDE by [coletdjnz](https://github.com/coletdjnz) +* [SovietsCloset] Fix playlists for games with only named categories by [ConquerorDopy](https://github.com/ConquerorDopy) +* [SpankBang] Fix uploader by [f4pp3rk1ng](https://github.com/f4pp3rk1ng), [coletdjnz](https://github.com/coletdjnz) +* [tiktok] Use API to fetch higher quality video by [MinePlayersPE](https://github.com/MinePlayersPE), [llacb47](https://github.com/llacb47) +* [TikTokUser] Fix extractor using mobile API by [MinePlayersPE](https://github.com/MinePlayersPE), [llacb47](https://github.com/llacb47) +* [videa] Fix some extraction errors by [nyuszika7h](https://github.com/nyuszika7h) +* [VrtNU] Handle login errors by [llacb47](https://github.com/llacb47) +* [vrv] Don't raise error when thumbnails are missing +* [youtube] Cleanup authentication 
code by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Fix `--mark-watched` with `--cookies-from-browser` +* [youtube] Improvements to JS player extraction and add extractor-args to skip it by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Retry on 'Unknown Error' by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Return full URL instead of just ID +* [youtube] Warn when trying to download clips +* [zdf] Improve format sorting +* [zype] Extract subtitles from the m3u8 manifest by [fstirlitz](https://github.com/fstirlitz) +* Allow `--force-write-archive` to work with `--flat-playlist` +* Download subtitles in order of `--sub-langs` +* Allow `0` in `--playlist-items` +* Handle more playlist errors with `-i` +* Fix `--no-get-comments` +* Fix `extra_info` being reused across runs +* Fix compat options `no-direct-merge` and `playlist-index` +* Dump files should obey `--trim-filename` by [sulyi](https://github.com/sulyi) +* [aes] Add `aes_gcm_decrypt_and_verify` by [sulyi](https://github.com/sulyi), [pukkandan](https://github.com/pukkandan) +* [aria2c] Fix IV for some AES-128 streams by [shirt](https://github.com/shirt-dev) +* [compat] Don't ignore `HOME` (if set) on windows +* [cookies] Make browser names case insensitive +* [cookies] Print warning for cookie decoding error only once +* [extractor] Fix root-relative URLs in MPD by [DigitalDJ](https://github.com/DigitalDJ) +* [ffmpeg] Add `aac_adtstoasc` when merging if needed +* [fragment,aria2c] Generalize and refactor some code +* [fragment] Avoid repeated request for AES key +* [fragment] Fix range header when using `-N` and media sequence by [shirt](https://github.com/shirt-dev) +* [hls,aes] Fallback to native implementation for AES-CBC and detect `Cryptodome` in addition to `Crypto` +* [hls] Byterange + AES128 is supported by native downloader +* [ModifyChapters] Improve sponsor chapter merge algorithm by [nihil-admirari](https://github.com/nihil-admirari) +* [ModifyChapters] Minor fixes +* 
[WebVTT] Adjust parser to accommodate PBS subtitles +* [utils] Improve `extract_timezone` by [dirkf](https://github.com/dirkf) +* [options] Fix `--no-config` and refactor reading of config files +* [options] Strip spaces and ignore empty entries in list-like switches +* [test/cookies] Improve logging +* [build] Automate more of the release process by [animelover1984](https://github.com/animelover1984), [pukkandan](https://github.com/pukkandan) +* [build] Fix sha256 by [nihil-admirari](https://github.com/nihil-admirari) +* [build] Bring back brew taps by [nao20010128nao](https://github.com/nao20010128nao) +* [build] Provide `--onedir` zip for windows by [pukkandan](https://github.com/pukkandan) +* [cleanup,docs] Add deprecation warning in docs for some counter intuitive behaviour +* [cleanup] Fix line endings for `nebula.py` by [glenn-slayden](https://github.com/glenn-slayden) +* [cleanup] Improve `make clean-test` by [sulyi](https://github.com/sulyi) +* [cleanup] Misc + + +### 2021.09.02 + +* **Native SponsorBlock** implementation by [nihil-admirari](https://github.com/nihil-admirari), [pukkandan](https://github.com/pukkandan) + * `--sponsorblock-remove CATS` removes specified chapters from file + * `--sponsorblock-mark CATS` marks the specified sponsor sections as chapters + * `--sponsorblock-chapter-title TMPL` to specify sponsor chapter template + * `--sponsorblock-api URL` to use a different API + * No re-encoding is done unless `--force-keyframes-at-cuts` is used + * The fetched sponsor sections are written to the infojson + * Deprecates: `--sponskrub`, `--no-sponskrub`, `--sponskrub-cut`, `--no-sponskrub-cut`, `--sponskrub-force`, `--no-sponskrub-force`, `--sponskrub-location`, `--sponskrub-args` +* Split `--embed-chapters` from `--embed-metadata` (it still implies the former by default) +* Add option `--remove-chapters` to remove arbitrary chapters by [nihil-admirari](https://github.com/nihil-admirari), [pukkandan](https://github.com/pukkandan) +* Add option 
`--force-keyframes-at-cuts` for more accurate cuts when removing and splitting chapters by [nihil-admirari](https://github.com/nihil-admirari) +* Let `--match-filter` reject entries early + * Makes redundant: `--match-title`, `--reject-title`, `--min-views`, `--max-views` +* [lazy_extractor] Improvements (It now passes all tests) + * Bugfix for when plugin directory doesn't exist by [kidonng](https://github.com/kidonng) + * Create instance only after pre-checking archive + * Import actual class if an attribute is accessed + * Fix `suitable` and add flake8 test +* [downloader/ffmpeg] Experimental support for DASH manifests (including live) + * Your ffmpeg must have [this patch](https://github.com/FFmpeg/FFmpeg/commit/3249c757aed678780e22e99a1a49f4672851bca9) applied for YouTube DASH to work +* [downloader/ffmpeg] Allow passing custom arguments before `-i` +* [BannedVideo] Add extractor by [smege1001](https://github.com/smege1001), [blackjack4494](https://github.com/blackjack4494), [pukkandan](https://github.com/pukkandan) +* [bilibili] Add category extractor by [animelover1984](https://github.com/animelover1984) +* [Epicon] Add extractors by [Ashish0804](https://github.com/Ashish0804) +* [filmmodu] Add extractor by [mzbaulhaque](https://github.com/mzbaulhaque) +* [GabTV] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [Hungama] Fix `HungamaSongIE` and add `HungamaAlbumPlaylistIE` by [Ashish0804](https://github.com/Ashish0804) +* [ManotoTV] Add new extractors by [tandy1000](https://github.com/tandy1000) +* [Niconico] Add Search extractors by [animelover1984](https://github.com/animelover1984), [pukkandan](https://github.com/pukkandan) +* [Patreon] Add `PatreonUserIE` by [zenerdi0de](https://github.com/zenerdi0de) +* [peloton] Add extractor by [IONECarter](https://github.com/IONECarter), [capntrips](https://github.com/capntrips), [pukkandan](https://github.com/pukkandan) +* [ProjectVeritas] Add extractor by [Ashish0804](https://github.com/Ashish0804) 
+* [radiko] Add extractors by [nao20010128nao](https://github.com/nao20010128nao) +* [StarTV] Add extractor for `startv.com.tr` by [mrfade](https://github.com/mrfade), [coletdjnz](https://github.com/coletdjnz) +* [tiktok] Add `TikTokUserIE` by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan) +* [Tokentube] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [TV2Hu] Fix `TV2HuIE` and add `TV2HuSeriesIE` by [Ashish0804](https://github.com/Ashish0804) +* [voicy] Add extractor by [nao20010128nao](https://github.com/nao20010128nao) +* [adobepass] Fix Verizon SAML login by [nyuszika7h](https://github.com/nyuszika7h), [ParadoxGBB](https://github.com/ParadoxGBB) +* [afreecatv] Fix adult VODs by [wlritchi](https://github.com/wlritchi) +* [afreecatv] Tolerate failure to parse date string by [wlritchi](https://github.com/wlritchi) +* [aljazeera] Fix extractor by [MinePlayersPE](https://github.com/MinePlayersPE) +* [ATV.at] Fix extractor for ATV.at by [NeroBurner](https://github.com/NeroBurner), [coletdjnz](https://github.com/coletdjnz) +* [bitchute] Fix test by [mahanstreamer](https://github.com/mahanstreamer) +* [camtube] Remove obsolete extractor by [alerikaisattera](https://github.com/alerikaisattera) +* [CDA] Add more formats by [u-spec-png](https://github.com/u-spec-png) +* [eroprofile] Fix page skipping in albums by [jhwgh1968](https://github.com/jhwgh1968) +* [facebook] Fix format sorting +* [facebook] Fix metadata extraction by [kikuyan](https://github.com/kikuyan) +* [facebook] Update onion URL by [Derkades](https://github.com/Derkades) +* [HearThisAtIE] Fix extractor by [Ashish0804](https://github.com/Ashish0804) +* [instagram] Add referrer to prevent throttling by [u-spec-png](https://github.com/u-spec-png), [kikuyan](https://github.com/kikuyan) +* [iwara.tv] Extract more metadata by [BunnyHelp](https://github.com/BunnyHelp) +* [iwara] Add thumbnail by [i6t](https://github.com/i6t) +* [kakao] Fix extractor +* 
[mediaset] Fix extraction for some videos by [nyuszika7h](https://github.com/nyuszika7h) +* [Motherless] Fix extractor by [coletdjnz](https://github.com/coletdjnz) +* [Nova] fix extractor by [std-move](https://github.com/std-move) +* [ParamountPlus] Fix geo verification by [shirt](https://github.com/shirt-dev) +* [peertube] handle new video URL format by [Chocobozzz](https://github.com/Chocobozzz) +* [pornhub] Separate and fix playlist extractor by [mzbaulhaque](https://github.com/mzbaulhaque) +* [reddit] Fix for quarantined subreddits by [ouwou](https://github.com/ouwou) +* [ShemarooMe] Fix extractor by [Ashish0804](https://github.com/Ashish0804) +* [soundcloud] Refetch `client_id` on 403 +* [tiktok] Fix metadata extraction +* [TV2] Fix extractor by [Ashish0804](https://github.com/Ashish0804) +* [tv5mondeplus] Fix extractor by [korli](https://github.com/korli) +* [VH1,TVLand] Fix extractors by [Sipherdrakon](https://github.com/Sipherdrakon) +* [Viafree] Fix extractor and extract subtitles by [coletdjnz](https://github.com/coletdjnz) +* [XHamster] Extract `uploader_id` by [octotherp](https://github.com/octotherp) +* [youtube] Add `shorts` to `_VALID_URL` +* [youtube] Add av01 itags to known formats list by [blackjack4494](https://github.com/blackjack4494) +* [youtube] Extract error messages from HTTPError response by [coletdjnz](https://github.com/coletdjnz) +* [youtube] Fix subtitle names +* [youtube] Prefer audio stream that YouTube considers default +* [youtube] Remove annotations and deprecate `--write-annotations` by [coletdjnz](https://github.com/coletdjnz) +* [Zee5] Fix extractor and add subtitles by [Ashish0804](https://github.com/Ashish0804) +* [aria2c] Obey `--rate-limit` +* [EmbedSubtitle] Continue even if some files are missing +* [extractor] Better error message for DRM +* [extractor] Common function `_match_valid_url` +* [extractor] Show video id in error messages if possible +* [FormatSort] Remove priority of `lang` +* [options] Add 
`_set_from_options_callback` +* [SubtitleConvertor] Fix bug during subtitle conversion +* [utils] Add `parse_qs` +* [webvtt] Fix timestamp overflow adjustment by [fstirlitz](https://github.com/fstirlitz) +* Bugfix for `--replace-in-metadata` +* Don't try to merge with final extension +* Fix `--force-overwrites` when using `-k` +* Fix `--no-prefer-free-formats` by [CeruleanSky](https://github.com/CeruleanSky) +* Fix `-F` for extractors that directly return url +* Fix `-J` when there are failed videos +* Fix `extra_info` being reused across runs +* Fix `playlist_index` not obeying `playlist_start` and add tests +* Fix resuming of single formats when using `--no-part` +* Revert erroneous use of the `Content-Length` header by [fstirlitz](https://github.com/fstirlitz) +* Use `os.replace` where applicable by [paulwrubel](https://github.com/paulwrubel) +* [build] Add homebrew taps `yt-dlp/taps/yt-dlp` by [nao20010128nao](https://github.com/nao20010128nao) +* [build] Fix bug in making `yt-dlp.tar.gz` +* [docs] Fix some typos by [pukkandan](https://github.com/pukkandan), [zootedb0t](https://github.com/zootedb0t) +* [cleanup] Replace improper use of tab in trovo by [glenn-slayden](https://github.com/glenn-slayden) + + ### 2021.08.10 * Add option `--replace-in-metadata` @@ -76,8 +750,8 @@ ### 2021.08.10 ### 2021.08.02 * Add logo, banner and donate links -* Expand and escape environment variables correctly in output template -* Add format types `j` (json), `l` (comma delimited list), `q` (quoted for terminal) in output template +* [outtmpl] Expand and escape environment variables +* [outtmpl] Add format types `j` (json), `l` (comma delimited list), `q` (quoted for terminal) * [downloader] Allow streaming some unmerged formats to stdout using ffmpeg * [youtube] **Age-gate bypass** * Add `agegate` clients by [pukkandan](https://github.com/pukkandan), [MinePlayersPE](https://github.com/MinePlayersPE) @@ -282,7 +956,7 @@ ### 2021.06.23 ### 2021.06.09 * Fix bug where `%(field)d` in filename template throws error
-* Improve offset parsing in outtmpl +* [outtmpl] Improve offset parsing * [test] More rigorous tests for `prepare_filename` ### 2021.06.08 @@ -917,7 +1591,7 @@ ### 2021.01.05 * Cleaned up the fork for public use -**PS**: All uncredited changes above this point are authored by [pukkandan](https://github.com/pukkandan) +**Note**: All uncredited changes above this point are authored by [pukkandan](https://github.com/pukkandan) ### Unreleased changes in [blackjack4494/yt-dlc](https://github.com/blackjack4494/yt-dlc) * Updated to youtube-dl release 2020.11.26 by [pukkandan](https://github.com/pukkandan) @@ -942,8 +1616,110 @@ ### Unreleased changes in [blackjack4494/yt-dlc](https://github.com/blackjack449 * [spreaker] fix SpreakerShowIE test URL by [pukkandan](https://github.com/pukkandan) * [Vlive] Fix playlist handling when downloading a channel by [kyuyeunk](https://github.com/kyuyeunk) * [tmz] Fix extractor by [diegorodriguezv](https://github.com/diegorodriguezv) +* [ITV] BTCC URL update by [WolfganP](https://github.com/WolfganP) * [generic] Detect embedded bitchute videos by [pukkandan](https://github.com/pukkandan) * [generic] Extract embedded youtube and twitter videos by [diegorodriguezv](https://github.com/diegorodriguezv) * [ffmpeg] Ensure all streams are copied by [pukkandan](https://github.com/pukkandan) * [embedthumbnail] Fix for os.rename error by [pukkandan](https://github.com/pukkandan) * make_win.bat: don't use UPX to pack vcruntime140.dll by [jbruchon](https://github.com/jbruchon) + + +### Changelog of [blackjack4494/yt-dlc](https://github.com/blackjack4494/yt-dlc) till release 2020.11.11-3 + +**Note**: This was constructed from the merge commit messages and may not be entirely accurate + +* [bandcamp] fix failing test. 
remove subclass hack by [insaneracist](https://github.com/insaneracist) +* [bandcamp] restore album downloads by [insaneracist](https://github.com/insaneracist) +* [francetv] fix extractor by [Surkal](https://github.com/Surkal) +* [gdcvault] fix extractor by [blackjack4494](https://github.com/blackjack4494) +* [hotstar] Move to API v1 by [theincognito-inc](https://github.com/theincognito-inc) +* [hrfernsehen] add extractor by [blocktrron](https://github.com/blocktrron) +* [kakao] new apis by [blackjack4494](https://github.com/blackjack4494) +* [la7] fix missing protocol by [nixxo](https://github.com/nixxo) +* [mailru] removed escaped braces, use urljoin, added tests by [nixxo](https://github.com/nixxo) +* [MTV/Nick] universal mgid extractor + fix nick.de feed by [blackjack4494](https://github.com/blackjack4494) +* [mtv] Fix a missing match_id by [nixxo](https://github.com/nixxo) +* [Mtv] updated extractor logic & more by [blackjack4494](https://github.com/blackjack4494) +* [ndr] support Daserste ndr by [blackjack4494](https://github.com/blackjack4494) +* [Netzkino] Only use video id to find metadata by [TobiX](https://github.com/TobiX) +* [newgrounds] fix: video download by [insaneracist](https://github.com/insaneracist) +* [nitter] Add new extractor by [B0pol](https://github.com/B0pol) +* [soundcloud] Resolve audio/x-wav by [tfvlrue](https://github.com/tfvlrue) +* [soundcloud] sets pattern and tests by [blackjack4494](https://github.com/blackjack4494) +* [SouthparkDE/MTV] another mgid extraction (mtv_base) feed url updated by [blackjack4494](https://github.com/blackjack4494) +* [StoryFire] Add new extractor by [sgstair](https://github.com/sgstair) +* [twitch] by [geauxlo](https://github.com/geauxlo) +* [videa] Adapt to updates by [adrianheine](https://github.com/adrianheine) +* [Viki] subtitles, formats by [blackjack4494](https://github.com/blackjack4494) +* [vlive] fix extractor for revamped website by [exwm](https://github.com/exwm) +* [xtube] fix extractor by 
[insaneracist](https://github.com/insaneracist) +* [youtube] Convert subs when download is skipped by [blackjack4494](https://github.com/blackjack4494) +* [youtube] Fix age gate detection by [random-nick](https://github.com/random-nick) +* [youtube] fix yt-only playback when age restricted/gated - requires cookies by [blackjack4494](https://github.com/blackjack4494) +* [youtube] fix: extract artist metadata from ytInitialData by [insaneracist](https://github.com/insaneracist) +* [youtube] fix: extract mix playlist ids from ytInitialData by [insaneracist](https://github.com/insaneracist) +* [youtube] fix: mix playlist title by [insaneracist](https://github.com/insaneracist) +* [youtube] fix: Youtube Music playlists by [insaneracist](https://github.com/insaneracist) +* [Youtube] Fixed problem with new youtube player by [peet1993](https://github.com/peet1993) +* [zoom] Fix url parsing for url's containing /share/ and dots by [Romern](https://github.com/Romern) +* [zoom] new extractor by [insaneracist](https://github.com/insaneracist) +* abc by [adrianheine](https://github.com/adrianheine) +* Added Comcast_SSO fix by [merval](https://github.com/merval) +* Added DRM logic to brightcove by [merval](https://github.com/merval) +* Added regex for ABC.com site. 
by [kucksdorfs](https://github.com/kucksdorfs) +* alura by [hugohaa](https://github.com/hugohaa) +* Arbitrary merges by [fstirlitz](https://github.com/fstirlitz) +* ard.py_add_playlist_support by [martin54](https://github.com/martin54) +* Bugfix/youtube/chapters fix extractor by [gschizas](https://github.com/gschizas) +* bugfix_youtube_like_extraction by [RedpointsBots](https://github.com/RedpointsBots) +* Create build workflow by [blackjack4494](https://github.com/blackjack4494) +* deezer by [LucBerge](https://github.com/LucBerge) +* Detect embedded bitchute videos by [pukkandan](https://github.com/pukkandan) +* Don't install tests by [l29ah](https://github.com/l29ah) +* Don't try to embed/convert json subtitles generated by youtube livechat by [pukkandan](https://github.com/pukkandan) +* Doodstream by [sxvghd](https://github.com/sxvghd) +* duboku by [lkho](https://github.com/lkho) +* elonet by [tpikonen](https://github.com/tpikonen) +* ext/remuxe-video by [Zocker1999NET](https://github.com/Zocker1999NET) +* fall-back to the old way to fetch subtitles, if needed by [RobinD42](https://github.com/RobinD42) +* feature_subscriber_count by [RedpointsBots](https://github.com/RedpointsBots) +* Fix external downloader when there is no http_header by [pukkandan](https://github.com/pukkandan) +* Fix issue triggered by tubeup by [nsapa](https://github.com/nsapa) +* Fix YoutubePlaylistsIE by [ZenulAbidin](https://github.com/ZenulAbidin) +* fix-mitele by [DjMoren](https://github.com/DjMoren) +* fix/google-drive-cookie-issue by [legraphista](https://github.com/legraphista) +* fix_tiktok by [mervel-mervel](https://github.com/mervel-mervel) +* Fixed problem with JS player URL by [peet1993](https://github.com/peet1993) +* fixYTSearch by [xarantolus](https://github.com/xarantolus) +* FliegendeWurst-3sat-zdf-merger-bugfix-feature +* gilou-bandcamp_update +* implement ThisVid extractor by
[rigstot](https://github.com/rigstot) +* JensTimmerman-patch-1 by [JensTimmerman](https://github.com/JensTimmerman) +* Keep download archive in memory for better performance by [jbruchon](https://github.com/jbruchon) +* la7-fix by [iamleot](https://github.com/iamleot) +* magenta by [adrianheine](https://github.com/adrianheine) +* Merge 26564 from [adrianheine](https://github.com/adrianheine) +* Merge code from [ddland](https://github.com/ddland) +* Merge code from [nixxo](https://github.com/nixxo) +* Merge code from [ssaqua](https://github.com/ssaqua) +* Merge code from [zubearc](https://github.com/zubearc) +* mkvthumbnail by [MrDoritos](https://github.com/MrDoritos) +* myvideo_ge by [fonkap](https://github.com/fonkap) +* naver by [SeonjaeHyeon](https://github.com/SeonjaeHyeon) +* ondemandkorea by [julien-hadleyjack](https://github.com/julien-hadleyjack) +* rai-update by [iamleot](https://github.com/iamleot) +* RFC: youtube: Polymer UI and JSON endpoints for playlists by [wlritchi](https://github.com/wlritchi) +* rutv by [adrianheine](https://github.com/adrianheine) +* Sc extractor web auth by [blackjack4494](https://github.com/blackjack4494) +* Switch from binary search tree to Python sets by [jbruchon](https://github.com/jbruchon) +* tiktok by [skyme5](https://github.com/skyme5) +* tvnow by [TinyToweringTree](https://github.com/TinyToweringTree) +* twitch-fix by [lel-amri](https://github.com/lel-amri) +* Twitter shortener by [blackjack4494](https://github.com/blackjack4494) +* Update README.md by [JensTimmerman](https://github.com/JensTimmerman) +* Update to reflect website changes. 
by [amigatomte](https://github.com/amigatomte) +* use webarchive to fix a dead link in README by [B0pol](https://github.com/B0pol) +* Viki the second by [blackjack4494](https://github.com/blackjack4494) +* wdr-subtitles by [mrtnmtth](https://github.com/mrtnmtth) +* Webpfix by [alexmerkel](https://github.com/alexmerkel) +* Youtube live chat by [siikamiika](https://github.com/siikamiika) diff --git a/Collaborators.md b/Collaborators.md index 0017e1cd48..1c17f8ab19 100644 --- a/Collaborators.md +++ b/Collaborators.md @@ -28,6 +28,7 @@ ## [coletdjnz](https://github.com/coletdjnz) [![gh-sponsor](https://img.shields.io/badge/_-Sponsor-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz) * YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements +* Added support for downloading YoutubeWebArchive videos diff --git a/Makefile b/Makefile index bc0b4e399e..a34735f6cd 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -all: yt-dlp doc pypi-files +all: lazy-extractors yt-dlp doc pypi-files clean: clean-test clean-dist clean-cache completions: completion-bash completion-fish completion-zsh doc: README.md CONTRIBUTING.md issuetemplates supportedsites @@ -13,9 +13,13 @@ pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites com .PHONY: all clean install test tar pypi-files completions ot offlinetest codetest supportedsites clean-test: - rm -rf *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png *.frag *.frag.urls *.frag.aria2 test/testdata/player-*.js *.opus *.webp *.ttml *.vtt *.jpeg + rm -rf test/testdata/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ + *.frag.aria2 *.frag.urls *.info.json *.live_chat.json *.part* *.unknown_video *.ytdl \ + *.3gp *.ape *.avi *.desktop *.flac *.flv *.jpeg *.jpg *.m4a *.m4v *.mhtml *.mkv *.mov 
*.mp3 \ + *.mp4 *.ogg *.opus *.png *.sbv *.srt *.swf *.swp *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp clean-dist: - rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS .mailmap + rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \ + yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS .mailmap clean-cache: find . -name "*.pyc" -o -name "*.class" -delete @@ -29,7 +33,6 @@ DESTDIR ?= . BINDIR ?= $(PREFIX)/bin MANDIR ?= $(PREFIX)/man SHAREDIR ?= $(PREFIX)/share -# make_supportedsites.py doesnot work correctly in python2 PYTHON ?= /usr/bin/env python3 # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local @@ -38,9 +41,9 @@ SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then ech # set markdown input format to "markdown-smart" for pandoc version 2 and to "markdown" for pandoc prior to version 2 MARKDOWN = $(shell if [ `pandoc -v | head -n1 | cut -d" " -f2 | head -c1` = "2" ]; then echo markdown-smart; else echo markdown; fi) -install: yt-dlp yt-dlp.1 completions - install -Dm755 yt-dlp $(DESTDIR)$(BINDIR) - install -Dm644 yt-dlp.1 $(DESTDIR)$(MANDIR)/man1 +install: lazy-extractors yt-dlp yt-dlp.1 completions + install -Dm755 yt-dlp $(DESTDIR)$(BINDIR)/yt-dlp + install -Dm644 yt-dlp.1 $(DESTDIR)$(MANDIR)/man1/yt-dlp.1 install -Dm644 completions/bash/yt-dlp $(DESTDIR)$(SHAREDIR)/bash-completion/completions/yt-dlp install -Dm644 completions/zsh/_yt-dlp $(DESTDIR)$(SHAREDIR)/zsh/site-functions/_yt-dlp install -Dm644 completions/fish/yt-dlp.fish $(DESTDIR)$(SHAREDIR)/fish/vendor_completions.d/yt-dlp.fish @@ -76,12 +79,13 @@ README.md: yt_dlp/*.py yt_dlp/*/*.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md 
-issuetemplates: devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md yt_dlp/version.py - $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md .github/ISSUE_TEMPLATE/1_broken_site.md - $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md .github/ISSUE_TEMPLATE/2_site_support_request.md - $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md .github/ISSUE_TEMPLATE/3_site_feature_request.md - $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md .github/ISSUE_TEMPLATE/4_bug_report.md - $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md .github/ISSUE_TEMPLATE/5_feature_request.md +issuetemplates: devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml .github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml yt_dlp/version.py + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml .github/ISSUE_TEMPLATE/1_broken_site.yml + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml .github/ISSUE_TEMPLATE/2_site_support_request.yml + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml .github/ISSUE_TEMPLATE/3_site_feature_request.yml + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml .github/ISSUE_TEMPLATE/4_bug_report.yml + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml 
.github/ISSUE_TEMPLATE/5_feature_request.yml + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/6_question.yml .github/ISSUE_TEMPLATE/6_question.yml supportedsites: $(PYTHON) devscripts/make_supportedsites.py supportedsites.md @@ -110,7 +114,7 @@ _EXTRACTOR_FILES = $(shell find yt_dlp/extractor -iname '*.py' -and -not -iname yt_dlp/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ -yt-dlp.tar.gz: README.md yt-dlp.1 completions Changelog.md AUTHORS +yt-dlp.tar.gz: all @tar -czf $(DESTDIR)/yt-dlp.tar.gz --transform "s|^|yt-dlp/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ @@ -119,12 +123,12 @@ yt-dlp.tar.gz: README.md yt-dlp.1 completions Changelog.md AUTHORS --exclude '*~' \ --exclude '__pycache__' \ --exclude '.git' \ - --exclude 'docs/_build' \ -- \ - devscripts test \ - Changelog.md AUTHORS LICENSE README.md supportedsites.md \ - Makefile MANIFEST.in yt-dlp.1 completions \ - setup.py setup.cfg yt-dlp + README.md supportedsites.md Changelog.md LICENSE \ + CONTRIBUTING.md Collaborators.md CONTRIBUTORS AUTHORS \ + Makefile MANIFEST.in yt-dlp.1 README.txt completions \ + setup.py setup.cfg yt-dlp yt_dlp requirements.txt \ + devscripts test tox.ini pytest.ini AUTHORS: .mailmap git shortlog -s -n | cut -f2 | sort > AUTHORS diff --git a/README.md b/README.md index 248b7e688c..324a1565a7 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +<!-- MANPAGE: BEGIN EXCLUDED SECTION --> <div align="center"> [![YT-DLP](https://raw.githubusercontent.com/yt-dlp/yt-dlp/master/.github/banner.svg)](#readme) @@ -15,13 +16,18 @@ [![PyPi Downloads](https://img.shields.io/pypi/dm/yt-dlp?label=PyPi&style=for-the-badge)](https://pypi.org/project/yt-dlp) </div> +<!-- MANPAGE: END EXCLUDED SECTION --> yt-dlp is a [youtube-dl](https://github.com/ytdl-org/youtube-dl) fork based on the now inactive 
[youtube-dlc](https://github.com/blackjack4494/yt-dlc). The main focus of this project is adding new features and patches while also keeping up to date with the original project +<!-- MANPAGE: MOVE "USAGE AND OPTIONS" SECTION HERE --> + +<!-- MANPAGE: BEGIN EXCLUDED SECTION --> * [NEW FEATURES](#new-features) * [Differences in default behavior](#differences-in-default-behavior) * [INSTALLATION](#installation) * [Update](#update) + * [Release Files](#release-files) * [Dependencies](#dependencies) * [Compile](#compile) * [USAGE AND OPTIONS](#usage-and-options) @@ -39,7 +45,7 @@ * [Subtitle Options](#subtitle-options) * [Authentication Options](#authentication-options) * [Post-processing Options](#post-processing-options) - * [SponSkrub (SponsorBlock) Options](#sponskrub-sponsorblock-options) + * [SponsorBlock Options](#sponsorblock-options) * [Extractor Options](#extractor-options) * [CONFIGURATION](#configuration) * [Authentication with .netrc file](#authentication-with-netrc-file) @@ -54,33 +60,37 @@ * [Modifying metadata examples](#modifying-metadata-examples) * [EXTRACTOR ARGUMENTS](#extractor-arguments) * [PLUGINS](#plugins) +* [EMBEDDING YT-DLP](#embedding-yt-dlp) * [DEPRECATED OPTIONS](#deprecated-options) +* [CONTRIBUTING](CONTRIBUTING.md#contributing-to-yt-dlp) + * [Opening an Issue](CONTRIBUTING.md#opening-an-issue) + * [Developer Instructions](CONTRIBUTING.md#developer-instructions) * [MORE](#more) -</div> +<!-- MANPAGE: END EXCLUDED SECTION --> # NEW FEATURES -The major new features from the latest release of [blackjack4494/yt-dlc](https://github.com/blackjack4494/yt-dlc) are: -* **[SponSkrub Integration](#sponskrub-sponsorblock-options)**: You can use [SponSkrub](https://github.com/yt-dlp/SponSkrub) to mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API +* Based on **youtube-dl 2021.12.17 [commit/5014bd6](https://github.com/ytdl-org/youtube-dl/commit/5014bd67c22b421207b2650d4dc874b95b36dda1)** and 
**youtube-dlc 2020.11.11-3 [commit/f9401f2](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee)**: You get all the features and patches of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) in addition to the latest [youtube-dl](https://github.com/ytdl-org/youtube-dl) + +* **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in youtube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API * **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will be now preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples)) -* **Merged with youtube-dl [commit/379f52a](https://github.com/ytdl-org/youtube-dl/commit/379f52a4954013767219d25099cce9e0f9401961)**: (v2021.06.06) You get all the latest features and patches of [youtube-dl](https://github.com/ytdl-org/youtube-dl) in addition to all the features of [youtube-dlc](https://github.com/blackjack4494/yt-dlc) - * **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that the NicoNico improvements are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. 
* **Youtube improvements**: * All Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`) and private playlists supports downloading multiple pages of content * Search (`ytsearch:`, `ytsearchdate:`), search URLs and in-channel search works * Mixes supports downloading multiple pages of content - * Most (but not all) age-gated content can be downloaded without cookies - * Partial workaround for throttling issue + * Some (but not all) age-gated content can be downloaded without cookies + * Fix for [n-sig based throttling](https://github.com/ytdl-org/youtube-dl/issues/29326) * Redirect channel's home URL automatically to `/video` to preserve the old behaviour - * `255kbps` audio is extracted from youtube music if premium cookies are given - * Youtube music Albums, channels etc can be downloaded + * `255kbps` audio is extracted (if available) from youtube music when premium cookies are given + * Youtube music Albums, channels etc can be downloaded ([except self-uploaded music](https://github.com/yt-dlp/yt-dlp/issues/723)) + * Download livestreams from the start using `--live-from-start` -* **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[:PROFILE]` +* **Cookies from browser**: Cookies can be automatically extracted from all major web browsers using `--cookies-from-browser BROWSER[+KEYRING][:PROFILE]` * **Split video by chapters**: Videos can be split into multiple files based on chapters using `--split-chapters` @@ -88,9 +98,9 @@ # NEW FEATURES * **Aria2c with HLS/DASH**: You can use `aria2c` as the external downloader for DASH(mpd) and HLS(m3u8) formats -* **New extractors**: AnimeLab, Philo MSO, Spectrum MSO, SlingTV MSO, Cablevision MSO, Rcs, Gedi, bitwave.tv, mildom, audius, zee5, mtv.it, wimtv, pluto.tv, niconico users, discoveryplus.in, mediathek, NFHSNetwork, nebula, ukcolumn, whowatch, MxplayerShow, parlview (au), YoutubeWebArchive, fancode, Saitosan, ShemarooMe, 
telemundo, VootSeries, SonyLIVSeries, HotstarSeries, VidioPremier, VidioLive, RCTIPlus, TBS Live, douyin, pornflip, ParamountPlusSeries, ScienceChannel, Utreon, OpenRec, BandcampMusic, blackboardcollaborate, eroprofile albums, mirrativ +* **New and fixed extractors**: Many new extractors have been added and a lot of existing ones have been fixed. See the [changelog](Changelog.md) or the [list of supported sites](supportedsites.md) -* **Fixed/improved extractors**: archive.org, roosterteeth.com, skyit, instagram, itv, SouthparkDe, spreaker, Vlive, akamai, ina, rumble, tennistv, amcnetworks, la7 podcasts, linuxacadamy, nitter, twitcasting, viu, crackle, curiositystream, mediasite, rmcdecouverte, sonyliv, tubi, tenplay, patreon, videa, yahoo, BravoTV, crunchyroll playlist, RTP, viki, Hotstar, vidio, vimeo, mediaset, Mxplayer, nbcolympics, ParamountPlus, Newgrounds +* **New MSOs**: Philo, Spectrum, SlingTV, Cablevision, RCN * **Subtitle extraction from manifests**: Subtitles can be extracted from streaming media manifests. See [commit/be6202f](https://github.com/yt-dlp/yt-dlp/commit/be6202f12b97858b9d716e608394b51065d0419f) for details @@ -100,35 +110,30 @@ # NEW FEATURES * **Output template improvements**: Output templates can now have date-time formatting, numeric offsets, object traversal etc. See [output template](#output-template) for details. 
Even more advanced operations can also be done with the help of `--parse-metadata` and `--replace-in-metadata` -* **Other new options**: `--print`, `--sleep-requests`, `--convert-thumbnails`, `--write-link`, `--force-download-archive`, `--force-overwrites`, `--break-on-reject` etc +* **Other new options**: Many new options have been added such as `--print`, `--wait-for-video`, `--sleep-requests`, `--convert-thumbnails`, `--write-link`, `--force-download-archive`, `--force-overwrites`, `--break-on-reject` etc -* **Improvements**: Regex and other operators in `--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection) etc +* **Improvements**: Regex and other operators in `--match-filter`, multiple `--postprocessor-args` and `--downloader-args`, faster archive checking, more [format selection options](#format-selection), merge multi-video/audio etc -* **Plugin extractors**: Extractors can be loaded from an external file. See [plugins](#plugins) for details +* **Plugins**: Extractors and PostProcessors can be loaded from an external file. See [plugins](#plugins) for details * **Self-updater**: The releases can be updated using `yt-dlp -U` - See [changelog](Changelog.md) or [commits](https://github.com/yt-dlp/yt-dlp/commits) for the full list of changes - -**PS**: Some of these changes are already in youtube-dlc, but are still unreleased. See [this](Changelog.md#unreleased-changes-in-blackjack4494yt-dlc) for details - -If you are coming from [youtube-dl](https://github.com/ytdl-org/youtube-dl), the amount of changes are very large. Compare [options](#options) and [supported sites](supportedsites.md) with youtube-dl's to get an idea of the massive number of features/patches [youtube-dlc](https://github.com/blackjack4494/yt-dlc) has accumulated. - ### Differences in default behavior -Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc. 
+Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc: -* The options `--id`, `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details -* `avconv` is not supported as as an alternative to `ffmpeg` -* The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s.%(id)s.%(ext)s`. Instead, you may use `--compat-options filename` +* The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details +* `avconv` is not supported as an alternative to `ffmpeg` +* The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s-%(id)s.%(ext)s`. Instead, you may use `--compat-options filename` * The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order -* The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be prefered. Use `-f bv+ba/b` or `--compat-options format-spec` to revert this +* The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be preferred. 
Use `-f bv+ba/b` or `--compat-options format-spec` to revert this * Unlike youtube-dlc, yt-dlp does not allow merging multiple audio/video streams into one file by default (since this conflicts with the use of `-f bv*+ba`). If needed, this feature must be enabled using `--audio-multistreams` and `--video-multistreams`. You can also use `--compat-options multistreams` to enable both * `--ignore-errors` is enabled by default. Use `--abort-on-error` or `--compat-options abort-on-error` to abort on errors instead * When writing metadata files such as thumbnails, description or infojson, the same information (if available) is also written for playlists. Use `--no-write-playlist-metafiles` or `--compat-options no-playlist-metafiles` to not write these files -* `--add-metadata` attaches the `infojson` to `mkv` files in addition to writing the metadata when used with `--write-infojson`. Use `--compat-options no-attach-info-json` to revert this +* `--add-metadata` attaches the `infojson` to `mkv` files in addition to writing the metadata when used with `--write-info-json`. Use `--no-embed-info-json` or `--compat-options no-attach-info-json` to revert this +* Some metadata are embedded into different fields when using `--add-metadata` as compared to youtube-dl. Most notably, `comment` field contains the `webpage_url` and `synopsis` contains the `description`. You can [use `--parse-metadata`](https://github.com/yt-dlp/yt-dlp#modifying-metadata) to modify this to your liking or use `--compat-options embed-metadata` to revert this * `playlist_index` behaves differently when used with options like `--playlist-reverse` and `--playlist-items`. See [#302](https://github.com/yt-dlp/yt-dlp/issues/302) for details. You can use `--compat-options playlist-index` if you want to keep the earlier behavior * The output of `-F` is listed in a new format. Use `--compat-options list-formats` to revert this * All *experiences* of a funimation episode are considered as a single video. 
This behavior breaks existing archives. Use `--compat-options seperate-video-versions` to extract information from only the default player @@ -138,7 +143,7 @@ ### Differences in default behavior * If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead * Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this -* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the seperate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this. +* When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this For ease of use, a few more compat options are available: * `--compat-options all`: Use all compat options @@ -147,17 +152,14 @@ ### Differences in default behavior # INSTALLATION -yt-dlp is not platform specific. 
So it should work on your Unix box, on Windows or on macOS You can install yt-dlp using one of the following methods: -* Download the binary from the [latest release](https://github.com/yt-dlp/yt-dlp/releases/latest) (recommended method) -* Use [PyPI package](https://pypi.org/project/yt-dlp): `python3 -m pip install --upgrade yt-dlp` -* Use pip+git: `python3 -m pip install --upgrade git+https://github.com/yt-dlp/yt-dlp.git@release` -* Install master branch: `python3 -m pip install --upgrade git+https://github.com/yt-dlp/yt-dlp` -Note that on some systems, you may need to use `py` or `python` instead of `python3` +### Using the release binary -UNIX users (Linux, macOS, BSD) can also install the [latest release](https://github.com/yt-dlp/yt-dlp/releases/latest) one of the following ways: +You can simply download the [correct binary file](#release-files) for your OS: **[[Windows](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)] [[UNIX-like](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp)]** + +In UNIX-like OSes (MacOS, Linux, BSD), you can also install the same in one of the following ways: ``` sudo curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp @@ -170,49 +172,112 @@ # INSTALLATION ``` ``` -sudo aria2c https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp +sudo aria2c https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp --dir /usr/local/bin -o yt-dlp sudo chmod a+rx /usr/local/bin/yt-dlp ``` -### UPDATE -You can use `yt-dlp -U` to update if you are using the provided release. -If you are using `pip`, simply re-run the same command that was used to install the program. +PS: The manpages, shell completion files etc. 
are available in [yt-dlp.tar.gz](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz) -### DEPENDENCIES +### With [PIP](https://pypi.org/project/pip) + +You can install the [PyPI package](https://pypi.org/project/yt-dlp) with: +``` +python3 -m pip install -U yt-dlp +``` + +You can install without any of the optional dependencies using: +``` +python3 -m pip install --no-deps -U yt-dlp +``` + +If you want to be on the cutting edge, you can also install the master branch with: +``` +python3 -m pip install --force-reinstall https://github.com/yt-dlp/yt-dlp/archive/master.zip +``` + +Note that on some systems, you may need to use `py` or `python` instead of `python3` + +### With [Homebrew](https://brew.sh) + +macOS or Linux users that are using Homebrew can also install it by: + +``` +brew install yt-dlp/taps/yt-dlp +``` + +## UPDATE +You can use `yt-dlp -U` to update if you are [using the provided release](#using-the-release-binary) + +If you [installed with pip](#with-pip), simply re-run the same command that was used to install the program + +If you [installed using Homebrew](#with-homebrew), run `brew upgrade yt-dlp/taps/yt-dlp` + +<!-- MANPAGE: BEGIN EXCLUDED SECTION --> +## RELEASE FILES + +#### Recommended + +File|Description +:---|:--- +[yt-dlp](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp)|Platform-independent binary. 
Needs Python (recommended for **UNIX-like systems**) +[yt-dlp.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe)|Windows (Win7 SP1+) standalone x64 binary (recommended for **Windows**) + +#### Alternatives + +File|Description +:---|:--- +[yt-dlp_macos](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos)|MacOS (10.15+) standalone executable +[yt-dlp_x86.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_x86.exe)|Windows (Vista SP2+) standalone x86 (32-bit) binary +[yt-dlp_min.exe](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_min.exe)|Windows (Win7 SP1+) standalone x64 binary built with `py2exe`.<br/> Does not contain `pycryptodomex`, needs VC++14 +[yt-dlp_win.zip](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_win.zip)|Unpackaged Windows executable (no auto-update) +[yt-dlp_macos.zip](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp_macos.zip)|Unpackaged MacOS (10.15+) executable (no auto-update) + +#### Misc + +File|Description +:---|:--- +[yt-dlp.tar.gz](https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.tar.gz)|Source tarball. Also contains manpages, completions, etc +[SHA2-512SUMS](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-512SUMS)|GNU-style SHA512 sums +[SHA2-256SUMS](https://github.com/yt-dlp/yt-dlp/releases/latest/download/SHA2-256SUMS)|GNU-style SHA256 sums +<!-- MANPAGE: END EXCLUDED SECTION --> + +## DEPENDENCIES Python versions 3.6+ (CPython and PyPy) are supported. Other versions and implementations may or may not work correctly. 
-<!-- https://www.microsoft.com/en-us/download/details.aspx?id=26999 --> +<!-- Python 3.5+ uses VC++14 and it is already embedded in the binary created +<!x-- https://www.microsoft.com/en-us/download/details.aspx?id=26999 --x> On windows, [Microsoft Visual C++ 2010 SP1 Redistributable Package (x86)](https://download.microsoft.com/download/1/6/5/165255E7-1014-4D0A-B094-B6A430A6BFFC/vcredist_x86.exe) is also necessary to run yt-dlp. You probably already have this, but if the executable throws an error due to missing `MSVCR100.dll` you need to install it manually. +--> -While all the other dependancies are optional, `ffmpeg` and `ffprobe` are highly recommended -* [**ffmpeg** and **ffprobe**](https://www.ffmpeg.org) - Required for [merging seperate video and audio files](#format-selection) as well as for various [post-processing](#post-processing-options) tasks. Licence [depends on the build](https://www.ffmpeg.org/legal.html) -* [**sponskrub**](https://github.com/faissaloo/SponSkrub) - For using the [sponskrub options](#sponskrub-sponsorblock-options). Licenced under [GPLv3+](https://github.com/faissaloo/SponSkrub/blob/master/LICENCE.md) -* [**mutagen**](https://github.com/quodlibet/mutagen) - For embedding thumbnail in certain formats. Licenced under [GPLv2+](https://github.com/quodlibet/mutagen/blob/master/COPYING) -* [**pycryptodome**](https://github.com/Legrandin/pycryptodome) - For decrypting various data. Licenced under [BSD2](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) -* [**websockets**](https://github.com/aaugustin/websockets) - For downloading over websocket. Licenced under [BSD3](https://github.com/aaugustin/websockets/blob/main/LICENSE) -* [**keyring**](https://github.com/jaraco/keyring) - For decrypting cookies of chromium-based browsers on Linux. 
Licenced under [MIT](https://github.com/jaraco/keyring/blob/main/LICENSE) -* [**AtomicParsley**](https://github.com/wez/atomicparsley) - For embedding thumbnail in mp4/m4a if mutagen is not present. Licenced under [GPLv2+](https://github.com/wez/atomicparsley/blob/master/COPYING) -* [**rtmpdump**](http://rtmpdump.mplayerhq.hu) - For downloading `rtmp` streams. ffmpeg will be used as a fallback. Licenced under [GPLv2+](http://rtmpdump.mplayerhq.hu) -* [**mplayer**](http://mplayerhq.hu/design7/info.html) or [**mpv**](https://mpv.io) - For downloading `rstp` streams. ffmpeg will be used as a fallback. Licenced under [GPLv2+](https://github.com/mpv-player/mpv/blob/master/Copyright) -* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licenced under [BSD3](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) +While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly recommended + +* [**ffmpeg** and **ffprobe**](https://www.ffmpeg.org) - Required for [merging separate video and audio files](#format-selection) as well as for various [post-processing](#post-processing-options) tasks. License [depends on the build](https://www.ffmpeg.org/legal.html) +* [**mutagen**](https://github.com/quodlibet/mutagen) - For embedding thumbnail in certain formats. Licensed under [GPLv2+](https://github.com/quodlibet/mutagen/blob/master/COPYING) +* [**pycryptodomex**](https://github.com/Legrandin/pycryptodome) - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD2](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) +* [**websockets**](https://github.com/aaugustin/websockets) - For downloading over websocket. Licensed under [BSD3](https://github.com/aaugustin/websockets/blob/main/LICENSE) +* [**secretstorage**](https://github.com/mitya57/secretstorage) - For accessing the Gnome keyring while decrypting cookies of Chromium-based browsers on Linux. 
Licensed under [BSD](https://github.com/mitya57/secretstorage/blob/master/LICENSE) +* [**AtomicParsley**](https://github.com/wez/atomicparsley) - For embedding thumbnail in mp4/m4a if mutagen is not present. Licensed under [GPLv2+](https://github.com/wez/atomicparsley/blob/master/COPYING) +* [**rtmpdump**](http://rtmpdump.mplayerhq.hu) - For downloading `rtmp` streams. ffmpeg will be used as a fallback. Licensed under [GPLv2+](http://rtmpdump.mplayerhq.hu) +* [**mplayer**](http://mplayerhq.hu/design7/info.html) or [**mpv**](https://mpv.io) - For downloading `rtsp` streams. ffmpeg will be used as a fallback. Licensed under [GPLv2+](https://github.com/mpv-player/mpv/blob/master/Copyright) +* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licensed under [BSD3](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) +* [**sponskrub**](https://github.com/faissaloo/SponSkrub) - For using the now **deprecated** [sponskrub options](#sponskrub-options). Licensed under [GPLv3+](https://github.com/faissaloo/SponSkrub/blob/master/LICENCE.md) * Any external downloader that you want to use with `--downloader` To use or redistribute the dependencies, you must agree to their respective licensing terms. -Note that the windows releases are already built with the python interpreter, mutagen, pycryptodome and websockets included. +The Windows and MacOS standalone release binaries are already built with the python interpreter, mutagen, pycryptodomex and websockets included. -### COMPILE +**Note**: There are some regressions in newer ffmpeg versions that cause various issues when used alongside yt-dlp. Since ffmpeg is such an important dependency, we provide [custom builds](https://github.com/yt-dlp/FFmpeg-Builds/wiki/Latest#latest-autobuilds) with patches for these issues at [yt-dlp/FFmpeg-Builds](https://github.com/yt-dlp/FFmpeg-Builds). 
See [the readme](https://github.com/yt-dlp/FFmpeg-Builds#patches-applied) for details on the specific issues solved by these builds + + +## COMPILE **For Windows**: -To build the Windows executable, you must have pyinstaller (and optionally mutagen, pycryptodome, websockets) +To build the Windows executable, you must have pyinstaller (and optionally mutagen, pycryptodomex, websockets). Once you have all the necessary dependencies installed, (optionally) build lazy extractors using `devscripts/make_lazy_extractors.py`, and then just run `pyinst.py`. The executable will be built for the same architecture (32/64 bit) as the python used to build it. - python3 -m pip install --upgrade pyinstaller mutagen pycryptodome websockets - -Once you have all the necessary dependencies installed, just run `py pyinst.py`. The executable will be built for the same architecture (32/64 bit) as the python used to build it. - -You can also build the executable without any version info or metadata by using: - - pyinstaller.exe yt_dlp\__main__.py --onefile --name yt-dlp + py -m pip install -U pyinstaller -r requirements.txt + py devscripts/make_lazy_extractors.py + py pyinst.py Note that pyinstaller [does not support](https://github.com/pyinstaller/pyinstaller#requirements-and-tested-platforms) Python installed from the Windows store without using a virtual environment @@ -220,24 +285,31 @@ ### COMPILE You will need the required build tools: `python`, `make` (GNU), `pandoc`, `zip`, `pytest` Then simply run `make`. 
You can also run `make yt-dlp` instead to compile only the binary without updating any of the additional files -**Note**: In either platform, `devscripts\update-version.py` can be used to automatically update the version number +**Note**: In either platform, `devscripts/update-version.py` can be used to automatically update the version number + +You can also fork the project on github and run your fork's [build workflow](.github/workflows/build.yml) to automatically build a release # USAGE AND OPTIONS +<!-- MANPAGE: BEGIN EXCLUDED SECTION --> yt-dlp [OPTIONS] [--] URL [URL...] `Ctrl+F` is your friend :D -<!-- Auto generated --> +<!-- MANPAGE: END EXCLUDED SECTION --> +<!-- Auto generated --> ## General Options: -h, --help Print this help text and exit --version Print program version and exit -U, --update Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to - skip unavailable videos in a playlist - (default) (Alias: --no-abort-on-error) + -i, --ignore-errors Ignore download and postprocessing errors. + The download will be considered successful + even if the postprocessing fails + --no-abort-on-error Continue with next video on download + errors; e.g. to skip unavailable videos in + a playlist (default) --abort-on-error Abort downloading of further videos if an error occurs (Alias: --no-ignore-errors) --dump-user-agent Display the current user-agent and exit @@ -248,9 +320,9 @@ ## General Options: extractor --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a + from google videos for the search term + "large apple". Use the value "auto" to let + yt-dlp guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. 
The default value "fixup_error" repairs broken URLs, but emits an error if @@ -269,7 +341,16 @@ ## General Options: --flat-playlist Do not extract the videos of a playlist, only list them --no-flat-playlist Extract the videos of a playlist - --mark-watched Mark videos watched (YouTube only) + --live-from-start Download livestreams from the start. + Currently only supported for YouTube + --no-live-from-start Download livestreams from the current time + (default) + --wait-for-video MIN[-MAX] Wait for scheduled streams to become + available. Pass the minimum number of + seconds (or range) to wait between retries + --no-wait-for-video Do not wait for scheduled streams (default) + --mark-watched Mark videos watched (even with --simulate). + Currently only supported for YouTube --no-mark-watched Do not mark videos watched (default) --no-colors Do not emit color codes in output --compat-options OPTS Options that can help keep compatibility @@ -317,16 +398,11 @@ ## Video Selection: specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13 - --match-title REGEX Download only matching titles (regex or - caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files --min-filesize SIZE Do not download any videos smaller than SIZE (e.g. 50k or 44.6m) --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 50k or 44.6m) - --date DATE Download only videos uploaded in this date. + --date DATE Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format "(now|today)[+-][0-9](day|week|month|year)(s)?" --datebefore DATE Download only videos uploaded on or before @@ -335,10 +411,6 @@ ## Video Selection: --dateafter DATE Download only videos uploaded on or after this date. 
The date formats accepted is the same as --date - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views --match-filter FILTER Generic video filter. Any field (see "OUTPUT TEMPLATE") can be compared with a number or a string using the operators @@ -351,7 +423,7 @@ ## Video Selection: filters can be checked with "&". Use a "\" to escape "&" or quotes if needed. Eg: --match-filter "!is_live & like_count>?100 - & description~=\'(?i)\bcats \& dogs\b\'" + & description~='(?i)\bcats \& dogs\b'" matches only videos that are not live, has a like count more than 100 (or the like field is not available), and also has a @@ -367,18 +439,23 @@ ## Video Selection: --download-archive FILE Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it + --no-download-archive Do not use archive file (default) + --max-downloads NUMBER Abort after downloading NUMBER files --break-on-existing Stop the download process when encountering a file that is in the archive --break-on-reject Stop the download process when encountering a file that has been filtered out + --break-per-input Make --break-on-existing and --break-on- + reject act only on the current input URL + --no-break-per-input --break-on-existing and --break-on-reject + terminates the entire download queue --skip-playlist-after-errors N Number of allowed failures until the rest of the playlist is skipped - --no-download-archive Do not use archive file (default) ## Download Options: -N, --concurrent-fragments N Number of fragments of a dash/hlsnative - video that should be download concurrently - (default is 1) + video that should be downloaded + concurrently (default is 1) -r, --limit-rate RATE Maximum download rate in bytes per second (e.g. 50K or 4.2M) --throttled-rate RATE Minimum download rate in bytes per second @@ -386,6 +463,8 @@ ## Download Options: video data is re-extracted (e.g. 
100K) -R, --retries RETRIES Number of retries (default is 10), or "infinite" + --file-access-retries RETRIES Number of times to retry on file access + error (default is 10), or "infinite" --fragment-retries RETRIES Number of retries for a fragment (default is 10), or "infinite" (DASH, hlsnative and ISM) @@ -448,14 +527,15 @@ ## Download Options: (Alias: --external-downloader-args) ## Filesystem Options: - -a, --batch-file FILE File containing URLs to download ('-' for + -a, --batch-file FILE File containing URLs to download ("-" for stdin), one URL per line. Lines starting - with '#', ';' or ']' are considered as + with "#", ";" or "]" are considered as comments and ignored + --no-batch-file Do not read URLs from batch file (default) -P, --paths [TYPES:]PATH The paths where the files should be downloaded. Specify the type of file and the path separated by a colon ":". All the - same types as --output are supported. + same TYPES as --output are supported. Additionally, you can also provide "home" (default) and "temp" paths. 
All intermediary files are first downloaded to @@ -473,9 +553,9 @@ ## Filesystem Options: filenames --no-restrict-filenames Allow Unicode characters, "&" and spaces in filenames (default) - --windows-filenames Force filenames to be windows compatible - --no-windows-filenames Make filenames windows compatible only if - using windows (default) + --windows-filenames Force filenames to be Windows-compatible + --no-windows-filenames Make filenames Windows-compatible only if + using Windows (default) --trim-filenames LENGTH Limit the filename length (excluding extension) to the specified number of characters @@ -503,9 +583,6 @@ ## Filesystem Options: --write-info-json Write video metadata to a .info.json file (this may contain personal information) --no-write-info-json Do not write video metadata (default) - --write-annotations Write video annotations to a - .annotations.xml file - --no-write-annotations Do not write video annotations (default) --write-playlist-metafiles Write playlist metadata in addition to the video metadata when using --write-info-json, --write-description etc. (default) @@ -526,26 +603,29 @@ ## Filesystem Options: --load-info-json FILE JSON file containing the video information (created with the "--write-info-json" option) - --cookies FILE File to read cookies from and dump cookie - jar in + --cookies FILE Netscape formatted file to read cookies + from and dump cookie jar in --no-cookies Do not read/dump cookies from/to file (default) - --cookies-from-browser BROWSER[:PROFILE] - Load cookies from a user profile of the - given web browser. Currently supported - browsers are: brave|chrome|chromium|edge|fi - refox|opera|safari|vivaldi. You can specify - the user profile name or directory using - "BROWSER:PROFILE_NAME" or - "BROWSER:PROFILE_PATH". 
If no profile is - given, the most recently accessed one is - used + --cookies-from-browser BROWSER[+KEYRING][:PROFILE] + The name of the browser and (optionally) + the name/path of the profile to load + cookies from, separated by a ":". Currently + supported browsers are: brave, chrome, + chromium, edge, firefox, opera, safari, + vivaldi. By default, the most recently + accessed profile is used. The keyring used + for decrypting Chromium cookies on Linux + can be (optionally) specified after the + browser name separated by a "+". Currently + supported keyrings are: basictext, + gnomekeyring, kwallet --no-cookies-from-browser Do not load cookies from browser (default) --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information (such as client ids and signatures) permanently. - By default $XDG_CACHE_HOME/youtube-dl or - ~/.cache/youtube-dl + By default $XDG_CACHE_HOME/yt-dlp or + ~/.cache/yt-dlp --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files @@ -574,9 +654,9 @@ ## Verbosity and Simulation Options: anything to disk --no-simulate Download the video even if printing/listing options are used - --ignore-no-formats-error Ignore "No video formats" error. Usefull - for extracting metadata even if the videos - are not actually available for download + --ignore-no-formats-error Ignore "No video formats" error. 
Useful for + extracting metadata even if the videos are + not actually available for download (experimental) --no-ignore-no-formats-error Throw error when no downloadable video formats are found (default) @@ -601,7 +681,18 @@ ## Verbosity and Simulation Options: (Alias: --force-download-archive) --newline Output progress bar as new lines --no-progress Do not print progress bar + --progress Show progress bar, even if in quiet mode --console-title Display progress in console titlebar + --progress-template [TYPES:]TEMPLATE + Template for progress outputs, optionally + prefixed with one of "download:" (default), + "download-title:" (the console title), + "postprocess:", or "postprocess-title:". + The video's fields are accessible under the + "info" key and the progress attributes are + accessible under "progress" key. E.g.: + --console-title --progress-template + "download-title:%(info.id)s-%(progress.eta)s" -v, --verbose Print various debugging information --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) @@ -612,7 +703,7 @@ ## Verbosity and Simulation Options: ## Workarounds: --encoding ENCODING Force the specified encoding (experimental) - --no-check-certificate Suppress HTTPS certificate validation + --no-check-certificates Suppress HTTPS certificate validation --prefer-insecure Use an unencrypted connection to retrieve information about the video (Currently supported only for YouTube) @@ -661,10 +752,12 @@ ## Video Format Options: containers irrespective of quality --no-prefer-free-formats Don't give any special preference to free containers (default) - --check-formats Check that the formats selected are + --check-formats Check that the selected formats are actually downloadable - --no-check-formats Do not check that the formats selected are + --check-all-formats Check all formats for whether they are actually downloadable + --no-check-formats Do not check that the formats are actually + downloadable -F, --list-formats List 
available formats of each video. Simulate unless --no-simulate is used --merge-output-format FORMAT If a merge is required (e.g. @@ -686,7 +779,7 @@ ## Subtitle Options: "ass/srt/best" --sub-langs LANGS Languages of the subtitles to download (can be regex) or "all" separated by commas. - (Eg: --sub-langs en.*,ja) You can prefix + (Eg: --sub-langs "en.*,ja") You can prefix the language code with a "-" to exempt it from the requested languages. (Eg: --sub- langs all,-live_chat) Use --list-subs for a @@ -698,6 +791,9 @@ ## Authentication Options: out, yt-dlp will ask interactively -2, --twofactor TWOFACTOR Two-factor authentication code -n, --netrc Use .netrc authentication data + --netrc-location PATH Location of .netrc authentication data; + either the path or its containing + directory. Defaults to ~/.netrc --video-password PASSWORD Video password (vimeo, youku) --ap-mso MSO Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for @@ -715,9 +811,9 @@ ## Post-Processing Options: --audio-format FORMAT Specify audio format to convert the audio to when -x is used. Currently supported formats are: best (default) or one of - best|aac|flac|mp3|m4a|opus|vorbis|wav + best|aac|flac|mp3|m4a|opus|vorbis|wav|alac --audio-quality QUALITY Specify ffmpeg audio quality, insert a - value between 0 (better) and 9 (worse) for + value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default 5) --remux-video FORMAT Remux the video into another container if @@ -736,24 +832,23 @@ ## Post-Processing Options: and the arguments separated by a colon ":" to give the argument to the specified postprocessor/executable. 
Supported PP are: - Merger, ExtractAudio, SplitChapters, + Merger, ModifyChapters, SplitChapters, + ExtractAudio, VideoRemuxer, VideoConvertor, Metadata, EmbedSubtitle, EmbedThumbnail, SubtitlesConvertor, ThumbnailsConvertor, - VideoRemuxer, VideoConvertor, SponSkrub, FixupStretched, FixupM4a, FixupM3u8, FixupTimestamp and FixupDuration. The supported executables are: AtomicParsley, - FFmpeg, FFprobe, and SponSkrub. You can - also specify "PP+EXE:ARGS" to give the - arguments to the specified executable only - when being used by the specified - postprocessor. Additionally, for - ffmpeg/ffprobe, "_i"/"_o" can be appended - to the prefix optionally followed by a - number to pass the argument before the - specified input/output file. Eg: --ppa - "Merger+ffmpeg_i1:-v quiet". You can use - this option multiple times to give + FFmpeg and FFprobe. You can also specify + "PP+EXE:ARGS" to give the arguments to the + specified executable only when being used + by the specified postprocessor. + Additionally, for ffmpeg/ffprobe, "_i"/"_o" + can be appended to the prefix optionally + followed by a number to pass the argument + before the specified input/output file. Eg: + --ppa "Merger+ffmpeg_i1:-v quiet". You can + use this option multiple times to give different arguments to different postprocessors. (Alias: --ppa) -k, --keep-video Keep the intermediate video file on disk @@ -767,11 +862,20 @@ ## Post-Processing Options: --no-embed-subs Do not embed subtitles (default) --embed-thumbnail Embed thumbnail in the video as cover art --no-embed-thumbnail Do not embed thumbnail (default) - --embed-metadata Embed metadata including chapter markers - (if supported by the format) to the video - file (Alias: --add-metadata) - --no-embed-metadata Do not write metadata (default) + --embed-metadata Embed metadata to the video file. 
Also + embeds chapters/infojson if present unless + --no-embed-chapters/--no-embed-info-json + are used (Alias: --add-metadata) + --no-embed-metadata Do not add metadata to file (default) (Alias: --no-add-metadata) + --embed-chapters Add chapter markers to the video file + (Alias: --add-chapters) + --no-embed-chapters Do not add chapter markers (default) + (Alias: --no-add-chapters) + --embed-info-json Embed the infojson as an attachment to + mkv/mka video files + --no-embed-info-json Do not embed the infojson as an attachment + to the video file --parse-metadata FROM:TO Parse additional metadata like title/artist from other fields; see "MODIFYING METADATA" for details @@ -786,7 +890,7 @@ ## Post-Processing Options: emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise), force (try fixing even if file - already exists + already exists) --ffmpeg-location PATH Location of the ffmpeg binary; either the path to the binary or its containing directory @@ -819,27 +923,71 @@ ## Post-Processing Options: files. See "OUTPUT TEMPLATE" for details --no-split-chapters Do not split video based on chapters (default) + --remove-chapters REGEX Remove chapters whose title matches the + given regular expression. Time ranges + prefixed by a "*" can also be used in place + of chapters to remove the specified range. + Eg: --remove-chapters "*10:15-15:00" + --remove-chapters "intro". This option can + be used multiple times + --no-remove-chapters Do not remove any chapters from the file + (default) + --force-keyframes-at-cuts Force keyframes around the chapters before + removing/splitting them. 
Requires a + re-encode and thus is very slow, but the + resulting video may have fewer artifacts + around the cuts + --no-force-keyframes-at-cuts Do not force keyframes around the chapters + when cutting/splitting (default) + --use-postprocessor NAME[:ARGS] The (case sensitive) name of plugin + postprocessors to be enabled, and + (optionally) arguments to be passed to it, + separated by a colon ":". ARGS are a + semicolon ";" delimited list of NAME=VALUE. + The "when" argument determines when the + postprocessor is invoked. It can be one of + "pre_process" (after extraction), + "before_dl" (before video download), + "post_process" (after video download; + default) or "after_move" (after moving file + to their final locations). This option can + be used multiple times to add different + postprocessors -## SponSkrub (SponsorBlock) Options: -[SponSkrub](https://github.com/yt-dlp/SponSkrub) is a utility to - mark/remove sponsor segments from downloaded YouTube videos using +## SponsorBlock Options: +Make chapter entries for, or remove various segments (sponsor, + introductions, etc.) from downloaded YouTube videos using the [SponsorBlock API](https://sponsor.ajay.app) - --sponskrub Use sponskrub to mark sponsored sections. - This is enabled by default if the sponskrub - binary exists (Youtube only) - --no-sponskrub Do not use sponskrub - --sponskrub-cut Cut out the sponsor sections instead of - simply marking them - --no-sponskrub-cut Simply mark the sponsor sections, not cut - them out (default) - --sponskrub-force Run sponskrub even if the video was already - downloaded - --no-sponskrub-force Do not cut out the sponsor sections if the - video was already downloaded (default) - --sponskrub-location PATH Location of the sponskrub binary; either - the path to the binary or its containing - directory + --sponsorblock-mark CATS SponsorBlock categories to create chapters + for, separated by commas. 
Available + categories are all, default(=all), sponsor, + intro, outro, selfpromo, preview, filler, + interaction, music_offtopic, poi_highlight. + You can prefix the category with a "-" to + exempt it. See [1] for description of the + categories. Eg: --sponsorblock-mark all,-preview + [1] https://wiki.sponsor.ajay.app/w/Segment_Categories + --sponsorblock-remove CATS SponsorBlock categories to be removed from + the video file, separated by commas. If a + category is present in both mark and + remove, remove takes precedence. The syntax + and available categories are the same as + for --sponsorblock-mark except that + "default" refers to "all,-filler" and + poi_highlight is not available + --sponsorblock-chapter-title TEMPLATE + The title template for SponsorBlock + chapters created by --sponsorblock-mark. + The same syntax as the output template is + used, but the only available fields are + start_time, end_time, category, categories, + name, category_names. Defaults to + "[SponsorBlock]: %(category_names)l" + --no-sponsorblock Disable both --sponsorblock-mark and + --sponsorblock-remove + --sponsorblock-api URL SponsorBlock API location, defaults to + https://sponsor.ajay.app ## Extractor Options: --extractor-retries RETRIES Number of retries for known extractor @@ -873,7 +1021,7 @@ # CONFIGURATION * `~/yt-dlp.conf` * `~/yt-dlp.conf.txt` - Note that `~` points to `C:\Users\<user name>` on windows. Also, `%XDG_CONFIG_HOME%` defaults to `~/.config` if undefined + `%XDG_CONFIG_HOME%` defaults to `~/.config` if undefined. On windows, `%APPDATA%` generally points to `C:\Users\<user name>\AppData\Roaming` and `~` points to `%HOME%` if present, `%USERPROFILE%` (generally `C:\Users\<user name>`), or `%HOMEDRIVE%%HOMEPATH%` 1. 
**System Configuration**: `/etc/yt-dlp.conf` For example, with the following configuration file yt-dlp will always extract the audio, not copy the mtime, use a proxy and save all videos under `YouTube` directory in your home directory: @@ -895,18 +1043,18 @@ # Save all videos under YouTube directory in your home directory Note that options in configuration file are just the same options aka switches used in regular command line calls; thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. -You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of user and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded. +You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded. ### Authentication with `.netrc` file -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. 
For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every yt-dlp execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in `--netrc-location` and restrict permissions to read/write by only you: ``` touch $HOME/.netrc chmod a-rwx,u+rw $HOME/.netrc ``` After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: ``` -machine <extractor> login <login> password <password> +machine <extractor> login <username> password <password> ``` For example: ``` @@ -915,34 +1063,43 @@ ### Authentication with `.netrc` file ``` To activate authentication with the `.netrc` file you should pass `--netrc` to yt-dlp or place it in the [configuration file](#configuration). -On Windows you may also need to setup the `%HOME%` environment variable manually. For example: -``` -set HOME=%USERPROFILE% -``` +The default location of the .netrc file is `$HOME` (`~`) in UNIX. On Windows, it is `%HOME%` if present, `%USERPROFILE%` (generally `C:\Users\<user name>`) or `%HOMEDRIVE%%HOMEPATH%` # OUTPUT TEMPLATE The `-o` option is used to indicate a template for the output file names while `-P` option is used to specify the path each type of file should be saved to. +<!-- MANPAGE: BEGIN EXCLUDED SECTION --> **tl;dr:** [navigate me to examples](#output-template-examples). 
+<!-- MANPAGE: END EXCLUDED SECTION --> The simplest usage of `-o` is not to set any template arguments when downloading a single file, like in `yt-dlp -o funny_video.flv "https://some/video"` (hard-coding file extension like this is _not_ recommended and could break some post-processing). -It may however also contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. +It may however also contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [Python string formatting operations](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. The field names themselves (the part inside the parenthesis) can also have some special formatting: 1. **Object traversal**: The dictionaries and lists available in metadata can be traversed by using a `.` (dot) separator. You can also do python slicing using `:`. Eg: `%(tags.0)s`, `%(subtitles.en.-1.ext)s`, `%(id.3:7:-1)s`, `%(formats.:.format_id)s`. `%()s` refers to the entire infodict. Note that all the fields that become available using this method are not listed below. Use `-j` to see such fields + 1. **Addition**: Addition and subtraction of numeric fields can be done using `+` and `-` respectively. Eg: `%(playlist_index+10)03d`, `%(n_entries+1-playlist_index)d` + 1. 
**Date/time Formatting**: Date/time fields can be formatted according to [strftime formatting](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes) by specifying it separated from the field name using a `>`. Eg: `%(duration>%H-%M-%S)s`, `%(upload_date>%Y-%m-%d)s`, `%(epoch-3600>%H-%M-%S)s` -1. **Default**: A default value can be specified for when the field is empty using a `|` seperator. This overrides `--output-na-template`. Eg: `%(uploader|Unknown)s` -1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `j`, `l`, `q` can be used for converting to **j**son, a comma seperated **l**ist and a string **q**uoted for the terminal respectively + +1. **Alternatives**: Alternate fields can be specified separated with a `,`. Eg: `%(release_date>%Y,upload_date>%Y|Unknown)s` + +1. **Replacement**: A replacement value can be specified using a `&` separator. If the field is *not* empty, this replacement value will be used instead of the actual field content. This is done after alternate fields are considered; thus the replacement is used if *any* of the alternative fields is *not* empty. + +1. **Default**: A literal default value can be specified for when the field is empty using a `|` separator. This overrides `--output-na-placeholder`. Eg: `%(uploader|Unknown)s` + +1. **More Conversions**: In addition to the normal format types `diouxXeEfFgGcrs`, `B`, `j`, `l`, `q`, `D`, `S` can be used for converting to **B**ytes, **j**son (flag `#` for pretty-printing), a comma separated **l**ist (flag `#` for `\n` newline-separated), a string **q**uoted for the terminal (flag `#` to split a list into different arguments), to add **D**ecimal suffixes (Eg: 10M), and to **S**anitize as filename (flag `#` for restricted), respectively + +1. **Unicode normalization**: The format type `U` can be used for NFC [unicode normalization](https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize).
The alternate form flag (`#`) changes the normalization to NFD and the conversion flag `+` can be used for NFKC/NFKD compatibility equivalence normalization. Eg: `%(title)+.100U` is NFKC To summarize, the general syntax for a field is: ``` -%(name[.keys][addition][>strf][|default])[flags][width][.precision][length]type +%(name[.keys][addition][>strf][,alternate][&replacement][|default])[flags][width][.precision][length]type ``` -Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation`, `infojson`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`. For example, `-o '%(title)s.%(ext)s' -o 'thumbnail:%(title)s\%(title)s.%(ext)s'` will put the thumbnails in a folder with the same name as the video. +Additionally, you can set different output templates for the various metadata files separately from the general output template by specifying the type of file followed by the template separated by a colon `:`. The different file types supported are `subtitle`, `thumbnail`, `description`, `annotation` (deprecated), `infojson`, `link`, `pl_thumbnail`, `pl_description`, `pl_infojson`, `chapter`. For example, `-o "%(title)s.%(ext)s" -o "thumbnail:%(title)s\%(title)s.%(ext)s"` will put the thumbnails in a folder with the same name as the video. If any of the templates (except default) is empty, that type of file will not be written. Eg: `--write-thumbnail -o "thumbnail:"` will write thumbnails only for playlists and not for video. 
The available fields are: @@ -956,9 +1113,10 @@ # OUTPUT TEMPLATE - `uploader` (string): Full name of the video uploader - `license` (string): License name the video is licensed under - `creator` (string): The creator of the video - - `release_date` (string): The date (YYYYMMDD) when the video was released - `timestamp` (numeric): UNIX timestamp of the moment the video became available - `upload_date` (string): Video upload date (YYYYMMDD) + - `release_date` (string): The date (YYYYMMDD) when the video was released + - `release_timestamp` (numeric): UNIX timestamp of the moment the video was released - `uploader_id` (string): Nickname or id of the video uploader - `channel` (string): Full name of the channel the video is uploaded on - `channel_id` (string): Id of the channel @@ -972,11 +1130,11 @@ # OUTPUT TEMPLATE - `average_rating` (numeric): Average rating give by users, the scale used depends on the webpage - `comment_count` (numeric): Number of comments on the video (For some extractors, comments are only downloaded at the end, and so this field cannot be used) - `age_limit` (numeric): Age restriction for the video (years) - - `live_status` (string): One of 'is_live', 'was_live', 'is_upcoming', 'not_live' + - `live_status` (string): One of "is_live", "was_live", "is_upcoming", "not_live" - `is_live` (boolean): Whether this video is a live stream or a fixed-length video - `was_live` (boolean): Whether this video was originally a live stream - `playable_in_embed` (string): Whether this video is allowed to play in embedded players on other sites - - `availability` (string): Whether the video is 'private', 'premium_only', 'subscriber_only', 'needs_auth', 'unlisted' or 'public' + - `availability` (string): Whether the video is "private", "premium_only", "subscriber_only", "needs_auth", "unlisted" or "public" - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL - `end_time` (numeric): Time in seconds where the 
reproduction should end, as specified in the URL - `format` (string): A human-readable description of the format @@ -991,6 +1149,7 @@ # OUTPUT TEMPLATE - `asr` (numeric): Audio sampling rate in Hertz - `vbr` (numeric): Average video bitrate in KBit/s - `fps` (numeric): Frame rate + - `dynamic_range` (string): The dynamic range of the video - `vcodec` (string): Name of the video codec in use - `container` (string): Name of the container format - `filesize` (numeric): The number of bytes, if known in advance @@ -1000,13 +1159,17 @@ # OUTPUT TEMPLATE - `extractor_key` (string): Key name of the extractor - `epoch` (numeric): Unix epoch when creating the file - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start` + - `n_entries` (numeric): Total number of extracted items in the playlist - `playlist` (string): Name or id of the playlist that contains the video - - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the total length of the playlist + - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the final index + - `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist - `playlist_id` (string): Playlist identifier - `playlist_title` (string): Playlist title - `playlist_uploader` (string): Full name of the playlist uploader - `playlist_uploader_id` (string): Nickname or id of the playlist uploader - `webpage_url` (string): A URL to the video webpage which if given to yt-dlp should allow to get the same result again + - `webpage_url_basename` (string): The basename of the webpage URL + - `webpage_url_domain` (string): The domain of the webpage URL - `original_url` (string): The URL given by the user (or same as `webpage_url` for playlist entries) Available for the video that belongs to some logical chapter or
section: @@ -1049,50 +1212,74 @@ # OUTPUT TEMPLATE - `urls` (string): The URLs of all requested formats, one in each line - `filename` (string): Name of the video file. Note that the actual filename may be different due to post-processing. Use `--exec echo` to get the name after all postprocessing is complete + +Available only in `--sponsorblock-chapter-title`: -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). + - `start_time` (numeric): Start time of the chapter in seconds + - `end_time` (numeric): End time of the chapter in seconds + - `categories` (list): The SponsorBlock categories the chapter belongs to + - `category` (string): The smallest SponsorBlock category the chapter belongs to + - `category_names` (list): Friendly names of the categories + - `name` (string): Friendly name of the smallest category -For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `yt-dlp test video` and id `BaW_jenozKc`, this will result in a `yt-dlp test video-BaW_jenozKc.mp4` file created in the current directory. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `yt-dlp test video` and id `BaW_jenozKc`, this will result in a `yt-dlp test video-BaW_jenozKc.mp4` file created in the current directory. -For numeric sequences you can use numeric related formatting, for example, `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. 
+Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). -Output templates can also contain arbitrary hierarchical path, e.g. `-o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s'` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. +**Tip**: Look at the `-j` output to identify which fields are available for the particular URL + +For numeric sequences you can use [numeric related formatting](https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting), for example, `%(view_count)05d` will result in a string with view count padded with zeros up to 5 characters, like in `00042`. + +Output templates can also contain arbitrary hierarchical path, e.g. `-o "%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s"` which will result in downloading each video in a directory corresponding to this path template. Any missing directory will be automatically created for you. To use percent literals in an output template use `%%`. To output to stdout use `-o -`. The current default template is `%(title)s [%(id)s].%(ext)s`. -In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: +In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title. 
+<!-- MANPAGE: BEGIN EXCLUDED SECTION --> #### Output template and Windows batch files If you are using an output template inside a Windows batch file then you must escape plain percent characters (`%`) by doubling, so that `-o "%(title)s-%(id)s.%(ext)s"` should become `-o "%%(title)s-%%(id)s.%%(ext)s"`. However you should not touch `%`'s that are not plain characters, e.g. environment variables for expansion should stay intact: `-o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s"`. +<!-- MANPAGE: END EXCLUDED SECTION --> #### Output template examples -Note that on Windows you need to use double quotes instead of single. - ```bash -$ yt-dlp --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc -youtube-dl test video ''_ä↭𝕐.mp4 # All kinds of weird characters +$ yt-dlp --get-filename -o "test video.%(ext)s" BaW_jenozKc +test video.webm # Literal name with correct extension -$ yt-dlp --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc --restrict-filenames -youtube-dl_test_video_.mp4 # A simple file name +$ yt-dlp --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc +youtube-dl test video ''_ä↭𝕐.webm # All kinds of weird characters + +$ yt-dlp --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filenames +youtube-dl_test_video_.webm # Restricted file name # Download YouTube playlist videos in separate directory indexed by video order in a playlist -$ yt-dlp -o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re +$ yt-dlp -o "%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s" "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" # Download YouTube playlist videos in separate directories according to their uploaded year -$ yt-dlp -o '%(upload_date>%Y)s/%(title)s.%(ext)s' https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re +$ yt-dlp -o "%(upload_date>%Y)s/%(title)s.%(ext)s" "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + +# Prefix 
playlist index with " - " separator, but only if it is available +$ yt-dlp -o "%(playlist_index|)s%(playlist_index& - |)s%(title)s.%(ext)s" BaW_jenozKc "https://www.youtube.com/user/TheLinuxFoundation/playlists" # Download all playlists of YouTube channel/user keeping each playlist in separate directory: -$ yt-dlp -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/user/TheLinuxFoundation/playlists +$ yt-dlp -o "%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s" "https://www.youtube.com/user/TheLinuxFoundation/playlists" # Download Udemy course keeping each chapter in separate directory under MyVideos directory in your home -$ yt-dlp -u user -p password -P '~/MyVideos' -o '%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ +$ yt-dlp -u user -p password -P "~/MyVideos" -o "%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s" "https://www.udemy.com/java-tutorial" # Download entire series season keeping each series and each season in separate directory under C:/MyVideos -$ yt-dlp -P "C:/MyVideos" -o "%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617 +$ yt-dlp -P "C:/MyVideos" -o "%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" "https://videomore.ru/kino_v_detalayah/5_sezon/367617" + +# Download video as "C:\MyVideos\uploader\title.ext", subtitles as "C:\MyVideos\subs\uploader\title.ext" +# and put all temporary files in "C:\MyVideos\tmp" +$ yt-dlp -P "C:/MyVideos" -P "temp:tmp" -P "subtitle:subs" -o "%(uploader)s/%(title)s.%(ext)s" BaW_jenozKc --write-subs + +# Download video as "C:\MyVideos\uploader\title.ext" and subtitles as "C:\MyVideos\uploader\subs\title.ext" +$ yt-dlp -P "C:/MyVideos" -o "%(uploader)s/%(title)s.%(ext)s" -o "subtitle:%(uploader)s/subs/%(title)s.%(ext)s" BaW_jenozKc --write-subs # Stream the video being 
downloaded to stdout $ yt-dlp -o - BaW_jenozKc @@ -1103,29 +1290,35 @@ # FORMAT SELECTION By default, yt-dlp tries to download the best available quality if you **don't** pass any options. This is generally equivalent to using `-f bestvideo*+bestaudio/best`. However, if multiple audiostreams is enabled (`--audio-multistreams`), the default format changes to `-f bestvideo+bestaudio/best`. Similarly, if ffmpeg is unavailable, or if you use yt-dlp to stream to `stdout` (`-o -`), the default becomes `-f best/bestvideo+bestaudio`. +**Deprecation warning**: Latest versions of yt-dlp can stream multiple formats to the stdout simultaneously using ffmpeg. So, in future versions, the default for this will be set to `-f bv*+ba/b` similar to normal downloads. If you want to preserve the `-f b/bv+ba` setting, it is recommended to explicitly specify it in the configuration options. + The general syntax for format selection is `-f FORMAT` (or `--format FORMAT`) where `FORMAT` is a *selector expression*, i.e. an expression that describes format or formats you would like to download. +<!-- MANPAGE: BEGIN EXCLUDED SECTION --> **tl;dr:** [navigate me to examples](#format-selection-examples). +<!-- MANPAGE: END EXCLUDED SECTION --> The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file. 
+You can use `-f -` to interactively provide the format selector *for each video* + You can also use special names to select particular edge case formats: - - `all`: Select all formats - - `mergeall`: Select and merge all formats (Must be used with `--audio-multistreams`, `--video-multistreams` or both) - - `b*`, `best*`: Select the best quality format irrespective of whether it contains video or audio - - `w*`, `worst*`: Select the worst quality format irrespective of whether it contains video or audio - - `b`, `best`: Select the best quality format that contains both video and audio. Equivalent to `best*[vcodec!=none][acodec!=none]` + - `all`: Select **all formats** separately + - `mergeall`: Select and **merge all formats** (Must be used with `--audio-multistreams`, `--video-multistreams` or both) + - `b*`, `best*`: Select the best quality format that **contains either** a video or an audio + - `b`, `best`: Select the best quality format that **contains both** video and audio. Equivalent to `best*[vcodec!=none][acodec!=none]` + - `bv`, `bestvideo`: Select the best quality **video-only** format. Equivalent to `best*[acodec=none]` + - `bv*`, `bestvideo*`: Select the best quality format that **contains video**. It may also contain audio. Equivalent to `best*[vcodec!=none]` + - `ba`, `bestaudio`: Select the best quality **audio-only** format. Equivalent to `best*[vcodec=none]` + - `ba*`, `bestaudio*`: Select the best quality format that **contains audio**. It may also contain video. Equivalent to `best*[acodec!=none]` + - `w*`, `worst*`: Select the worst quality format that contains either a video or an audio - `w`, `worst`: Select the worst quality format that contains both video and audio. Equivalent to `worst*[vcodec!=none][acodec!=none]` - - `bv`, `bestvideo`: Select the best quality video-only format. Equivalent to `best*[acodec=none]` - `wv`, `worstvideo`: Select the worst quality video-only format. 
Equivalent to `worst*[acodec=none]` - - `bv*`, `bestvideo*`: Select the best quality format that contains video. It may also contain audio. Equivalent to `best*[vcodec!=none]` - `wv*`, `worstvideo*`: Select the worst quality format that contains video. It may also contain audio. Equivalent to `worst*[vcodec!=none]` - - `ba`, `bestaudio`: Select the best quality audio-only format. Equivalent to `best*[vcodec=none]` - `wa`, `worstaudio`: Select the worst quality audio-only format. Equivalent to `worst*[vcodec=none]` - - `ba*`, `bestaudio*`: Select the best quality format that contains audio. It may also contain video. Equivalent to `best*[acodec!=none]` - `wa*`, `worstaudio*`: Select the worst quality format that contains audio. It may also contain video. Equivalent to `worst*[acodec!=none]` For example, to download the worst quality video-only format you can use `-f worstvideo`. It is however recommended not to use `worst` and related options. When your format selector is `worst`, the format which is worst in all respects is selected. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-f best -S +size,+br,+res,+fps` instead of `-f worst`. See [sorting formats](#sorting-formats) for more details. @@ -1136,7 +1329,11 @@ # FORMAT SELECTION If you want to download several formats of the same video use a comma as a separator, e.g. `-f 22,17,18` will download all these three formats, of course if they are available. Or a more sophisticated example combined with the precedence feature: `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. -You can merge the video and audio of multiple formats into a single file using `-f <format1>+<format2>+...` (requires ffmpeg installed), for example `-f bestvideo+bestaudio` will download the best video-only format, the best audio-only format and mux them together with ffmpeg. 
Unless `--video-multistreams` is used, all formats with a video stream except the first one are ignored. Similarly, unless `--audio-multistreams` is used, all formats with an audio stream except the first one are ignored. For example, `-f bestvideo+best+bestaudio --video-multistreams --audio-multistreams` will download and merge all 3 given formats. The resulting file will have 2 video streams and 2 audio streams. But `-f bestvideo+best+bestaudio --no-video-multistreams` will download and merge only `bestvideo` and `bestaudio`. `best` is ignored since another format containing a video stream (`bestvideo`) has already been selected. The order of the formats is therefore important. `-f best+bestaudio --no-audio-multistreams` will download and merge both formats while `-f bestaudio+best --no-audio-multistreams` will ignore `best` and download only `bestaudio`. +You can merge the video and audio of multiple formats into a single file using `-f <format1>+<format2>+...` (requires ffmpeg installed), for example `-f bestvideo+bestaudio` will download the best video-only format, the best audio-only format and mux them together with ffmpeg. + +**Deprecation warning**: Since the *below* described behavior is complex and counter-intuitive, this will be removed and multistreams will be enabled by default in the future. A new operator will be instead added to limit formats to single audio/video + +Unless `--video-multistreams` is used, all formats with a video stream except the first one are ignored. Similarly, unless `--audio-multistreams` is used, all formats with an audio stream except the first one are ignored. For example, `-f bestvideo+best+bestaudio --video-multistreams --audio-multistreams` will download and merge all 3 given formats. The resulting file will have 2 video streams and 2 audio streams. But `-f bestvideo+best+bestaudio --no-video-multistreams` will download and merge only `bestvideo` and `bestaudio`. 
`best` is ignored since another format containing a video stream (`bestvideo`) has already been selected. The order of the formats is therefore important. `-f best+bestaudio --no-audio-multistreams` will download and merge both formats while `-f bestaudio+best --no-audio-multistreams` will ignore `best` and download only `bestaudio`. ## Filtering Formats @@ -1169,11 +1366,13 @@ ## Filtering Formats Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter. For example, `-f "all[vcodec=none]"` selects all audio-only formats. -Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. +Format selectors can also be grouped using parentheses, for example if you want to download the best pre-merged mp4 and webm formats with a height lower than 480 you can use `-f "(mp4,webm)[height<480]"`. ## Sorting Formats -You can change the criteria for being considered the `best` by using `-S` (`--format-sort`). The general format for this is `--format-sort field1,field2...`. The available fields are: +You can change the criteria for being considered the `best` by using `-S` (`--format-sort`). The general format for this is `--format-sort field1,field2...`. 
+ +The available fields are: - `hasvid`: Gives priority to formats that has a video stream - `hasaud`: Gives priority to formats that has a audio stream @@ -1181,29 +1380,34 @@ ## Sorting Formats - `lang`: Language preference as given by the extractor - `quality`: The quality of the format as given by the extractor - `source`: Preference of the source as given by the extractor - - `proto`: Protocol used for download (`https`/`ftps` > `http`/`ftp` > `m3u8_native`/`m3u8` > `http_dash_segments`> `websocket_frag` > other > `mms`/`rtsp` > unknown > `f4f`/`f4m`) - - `vcodec`: Video Codec (`av01` > `vp9.2` > `vp9` > `h265` > `h264` > `vp8` > `h263` > `theora` > other > unknown) - - `acodec`: Audio Codec (`opus` > `vorbis` > `aac` > `mp4a` > `mp3` > `ac3` > `dts` > other > unknown) + - `proto`: Protocol used for download (`https`/`ftps` > `http`/`ftp` > `m3u8_native`/`m3u8` > `http_dash_segments`> `websocket_frag` > `mms`/`rtsp` > `f4f`/`f4m`) + - `vcodec`: Video Codec (`av01` > `vp9.2` > `vp9` > `h265` > `h264` > `vp8` > `h263` > `theora` > other) + - `acodec`: Audio Codec (`flac`/`alac` > `wav`/`aiff` > `opus` > `vorbis` > `aac` > `mp4a` > `mp3` > `eac3` > `ac3` > `dts` > other) - `codec`: Equivalent to `vcodec,acodec` - - `vext`: Video Extension (`mp4` > `webm` > `flv` > other > unknown). If `--prefer-free-formats` is used, `webm` is prefered. - - `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other > unknown). If `--prefer-free-formats` is used, the order changes to `opus` > `ogg` > `webm` > `m4a` > `mp3` > `aac`. + - `vext`: Video Extension (`mp4` > `webm` > `flv` > other). If `--prefer-free-formats` is used, `webm` is preferred. + - `aext`: Audio Extension (`m4a` > `aac` > `mp3` > `ogg` > `opus` > `webm` > other). If `--prefer-free-formats` is used, the order changes to `opus` > `ogg` > `webm` > `m4a` > `mp3` > `aac`. - `ext`: Equivalent to `vext,aext` - - `filesize`: Exact filesize, if know in advance. 
This will be unavailable for mu38 and DASH formats. + - `filesize`: Exact filesize, if known in advance - `fs_approx`: Approximate filesize calculated from the manifests - `size`: Exact filesize if available, otherwise approximate filesize - `height`: Height of video - `width`: Width of video - `res`: Video resolution, calculated as the smallest dimension. - `fps`: Framerate of video + - `hdr`: The dynamic range of the video (`DV` > `HDR12` > `HDR10+` > `HDR10` > `HLG` > `SDR`) - `tbr`: Total average bitrate in KBit/s - `vbr`: Average video bitrate in KBit/s - `abr`: Average audio bitrate in KBit/s - `br`: Equivalent to using `tbr,vbr,abr` - `asr`: Audio sample rate in Hz + +**Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names. -Note that any other **numerical** field made available by the extractor can also be used. All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. Eg: `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. Eg: `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. Eg: `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided by using `~` as the delimiter. Eg: `filesize~1G` prefers the format with filesize closest to 1 GiB. +All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. 
Eg: `+res` prefers the format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. Eg: `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. Eg: `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the values nearest to the provided one by using `~` as the delimiter. Eg: `filesize~1G` prefers the format with filesize closest to 1 GiB. -The fields `hasvid`, `ie_pref`, `lang` are always given highest priority in sorting, irrespective of the user-defined order. This behaviour can be changed by using `--force-format-sort`. Apart from these, the default order used is: `quality,res,fps,codec:vp9.2,size,br,asr,proto,ext,hasaud,source,id`. Note that the extractors may override this default order, but they cannot override the user-provided order. +The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behaviour can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,codec:vp9.2,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. + +Note that the default has `codec:vp9.2`; i.e. `av1` is not preferred. Similarly, the default for hdr is `hdr:12`; i.e. dolby vision is not preferred. These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. 
If your format selector is `worst`, the last item is selected after sorting. This means it will select the format that is worst in all respects. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-f best -S +size,+br,+res,+fps`. @@ -1211,16 +1415,14 @@ ## Sorting Formats ## Format Selection examples -Note that on Windows you may need to use double quotes instead of single. - ```bash # Download and merge the best video-only format and the best audio-only format, # or download the best combined format if video-only format is not available -$ yt-dlp -f 'bv+ba/b' +$ yt-dlp -f "bv+ba/b" # Download best format that contains video, # and if it doesn't already have an audio stream, merge it with best audio-only format -$ yt-dlp -f 'bv*+ba/b' +$ yt-dlp -f "bv*+ba/b" # Same as above $ yt-dlp @@ -1228,89 +1430,89 @@ # Same as above # Download the best video-only format and the best audio-only format without merging them # For this case, an output template should be used since # by default, bestvideo and bestaudio will have the same file name. 
-$ yt-dlp -f 'bv,ba' -o '%(title)s.f%(format_id)s.%(ext)s' +$ yt-dlp -f "bv,ba" -o "%(title)s.f%(format_id)s.%(ext)s" # Download and merge the best format that has a video stream, # and all audio-only formats into one file -$ yt-dlp -f 'bv*+mergeall[vcodec=none]' --audio-multistreams +$ yt-dlp -f "bv*+mergeall[vcodec=none]" --audio-multistreams # Download and merge the best format that has a video stream, # and the best 2 audio-only formats into one file -$ yt-dlp -f 'bv*+ba+ba.2' --audio-multistreams +$ yt-dlp -f "bv*+ba+ba.2" --audio-multistreams # The following examples show the old method (without -S) of format selection # and how to use -S to achieve a similar but (generally) better result # Download the worst video available (old method) -$ yt-dlp -f 'wv*+wa/w' +$ yt-dlp -f "wv*+wa/w" # Download the best video available but with the smallest resolution -$ yt-dlp -S '+res' +$ yt-dlp -S "+res" # Download the smallest video available -$ yt-dlp -S '+size,+br' +$ yt-dlp -S "+size,+br" # Download the best mp4 video available, or the best video if no mp4 available -$ yt-dlp -f 'bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b' +$ yt-dlp -f "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b" # Download the best video with the best extension # (For video, mp4 > webm > flv. For audio, m4a > aac > mp3 ...) 
-$ yt-dlp -S 'ext' +$ yt-dlp -S "ext" # Download the best video available but no better than 480p, # or the worst video if there is no video under 480p -$ yt-dlp -f 'bv*[height<=480]+ba/b[height<=480] / wv*+ba/w' +$ yt-dlp -f "bv*[height<=480]+ba/b[height<=480] / wv*+ba/w" # Download the best video available with the largest height but no better than 480p, # or the best video with the smallest resolution if there is no video under 480p -$ yt-dlp -S 'height:480' +$ yt-dlp -S "height:480" # Download the best video available with the largest resolution but no better than 480p, # or the best video with the smallest resolution if there is no video under 480p # Resolution is determined by using the smallest dimension. # So this works correctly for vertical videos as well -$ yt-dlp -S 'res:480' +$ yt-dlp -S "res:480" # Download the best video (that also has audio) but no bigger than 50 MB, # or the worst video (that also has audio) if there is no video under 50 MB -$ yt-dlp -f 'b[filesize<50M] / w' +$ yt-dlp -f "b[filesize<50M] / w" # Download largest video (that also has audio) but no bigger than 50 MB, # or the smallest video (that also has audio) if there is no video under 50 MB -$ yt-dlp -f 'b' -S 'filesize:50M' +$ yt-dlp -f "b" -S "filesize:50M" # Download best video (that also has audio) that is closest in size to 50 MB -$ yt-dlp -f 'b' -S 'filesize~50M' +$ yt-dlp -f "b" -S "filesize~50M" # Download best video available via direct link over HTTP/HTTPS protocol, # or the best video available via any protocol if there is no such video -$ yt-dlp -f '(bv*+ba/b)[protocol^=http][protocol!*=dash] / (bv*+ba/b)' +$ yt-dlp -f "(bv*+ba/b)[protocol^=http][protocol!*=dash] / (bv*+ba/b)" # Download best video available via the best protocol # (https/ftps > http/ftp > m3u8_native > m3u8 > http_dash_segments ...) 
-$ yt-dlp -S 'proto' +$ yt-dlp -S "proto" # Download the best video with h264 codec, or the best video if there is no such video -$ yt-dlp -f '(bv*+ba/b)[vcodec^=avc1] / (bv*+ba/b)' +$ yt-dlp -f "(bv*+ba/b)[vcodec^=avc1] / (bv*+ba/b)" # Download the best video with best codec no better than h264, # or the best video with worst codec if there is no such video -$ yt-dlp -S 'codec:h264' +$ yt-dlp -S "codec:h264" # Download the best video with worst codec no worse than h264, # or the best video with best codec if there is no such video -$ yt-dlp -S '+codec:h264' +$ yt-dlp -S "+codec:h264" @@ -1318,34 +1520,34 @@ # More complex examples # Download the best video no better than 720p preferring framerate greater than 30, # or the worst video (still preferring framerate greater than 30) if there is no such video -$ yt-dlp -f '((bv*[fps>30]/bv*)[height<=720]/(wv*[fps>30]/wv*)) + ba / (b[fps>30]/b)[height<=720]/(w[fps>30]/w)' +$ yt-dlp -f "((bv*[fps>30]/bv*)[height<=720]/(wv*[fps>30]/wv*)) + ba / (b[fps>30]/b)[height<=720]/(w[fps>30]/w)" # Download the video with the largest resolution no better than 720p, # or the video with the smallest resolution available if there is no such video, # preferring larger framerate for formats with the same resolution -$ yt-dlp -S 'res:720,fps' +$ yt-dlp -S "res:720,fps" # Download the video with smallest resolution no worse than 480p, # or the video with the largest resolution available if there is no such video, # preferring better codec and then larger total bitrate for the same resolution -$ yt-dlp -S '+res:480,codec,br' +$ yt-dlp -S "+res:480,codec,br" ``` # MODIFYING METADATA -The metadata obtained the the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata` +The metadata obtained by the extractors can be modified by using `--parse-metadata` and `--replace-in-metadata` `--replace-in-metadata FIELDS REGEX REPLACE` is used to replace text in any metadata field using [python regular 
expression](https://docs.python.org/3/library/re.html#regular-expression-syntax). [Backreferences](https://docs.python.org/3/library/re.html?highlight=backreferences#re.sub) can be used in the replace string for advanced use. -The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or a template (with same syntax as [output template](#output-template)) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. +The general syntax of `--parse-metadata FROM:TO` is to give the name of a field or an [output template](#output-template) to extract data from, and the format to interpret it as, separated by a colon `:`. Either a [python regular expression](https://docs.python.org/3/library/re.html#regular-expression-syntax) with named capture groups or a similar syntax to the [output template](#output-template) (only `%(field)s` formatting is supported) can be used for `TO`. The option can be used multiple times to parse and modify various fields. Note that any field created by this can be used in the [output template](#output-template) and will also affect the media file's metadata added when using `--add-metadata`. This option also has a few special uses: * You can download an additional URL based on the metadata of the currently downloaded video. To do this, set the field `additional_urls` to the URL that you want to download. Eg: `--parse-metadata "description:(?P<additional_urls>https?://www\.vimeo\.com/\d+)"` will download the first vimeo video found in the description -* You can use this to change the metadata that is embedded in the media file. 
To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to `meta_description` field will be added to the `description` field in the file. For example, you can use this to set a different "description" and "synopsis" +* You can use this to change the metadata that is embedded in the media file. To do this, set the value of the corresponding field with a `meta_` prefix. For example, any value you set to `meta_description` field will be added to the `description` field in the file. For example, you can use this to set a different "description" and "synopsis". Any value set to the `meta_` field will overwrite all default values. For reference, these are the fields yt-dlp adds by default to the file metadata: @@ -1371,55 +1573,200 @@ # MODIFYING METADATA ## Modifying metadata examples -Note that on Windows you may need to use double quotes instead of single. - ```bash # Interpret the title as "Artist - Title" -$ yt-dlp --parse-metadata 'title:%(artist)s - %(title)s' +$ yt-dlp --parse-metadata "title:%(artist)s - %(title)s" # Regex example -$ yt-dlp --parse-metadata 'description:Artist - (?P<artist>.+)' +$ yt-dlp --parse-metadata "description:Artist - (?P<artist>.+)" # Set title as "Series name S01E05" -$ yt-dlp --parse-metadata '%(series)s S%(season_number)02dE%(episode_number)02d:%(title)s' +$ yt-dlp --parse-metadata "%(series)s S%(season_number)02dE%(episode_number)02d:%(title)s" # Set "comment" field in video metadata using description instead of webpage_url -$ yt-dlp --parse-metadata 'description:(?s)(?P<meta_comment>.+)' --add-metadata +$ yt-dlp --parse-metadata "description:(?s)(?P<meta_comment>.+)" --add-metadata + +# Remove "formats" field from the infojson by setting it to an empty string +$ yt-dlp --parse-metadata ":(?P<formats>)" -j # Replace all spaces and "_" in title and uploader with a `-` -$ yt-dlp --replace-in-metadata 'title,uploader' '[ _]' '-' +$ yt-dlp --replace-in-metadata "title,uploader" "[ _]" 
"-" ``` # EXTRACTOR ARGUMENTS -Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) seperated string of `ARG=VAL1,VAL2`. Eg: `--extractor-args "youtube:player_client=android_agegate,web;include_live_dash" --extractor-args "funimation:version=uncut"` +Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. Eg: `--extractor-args "youtube:player-client=android_agegate,web;include_live_dash" --extractor-args "funimation:version=uncut"` The following extractors use this feature: -* **youtube** - * `skip`: `hls` or `dash` (or both) to skip download of the respective manifests - * `player_client`: Clients to extract video data from. The main clients are `web`, `android`, `ios`, `mweb`. These also have `_music`, `_embedded`, `_agegate`, and `_creator` variants (Eg: `web_embedded`) (`mweb` has only `_agegate`). By default, `android,web` is used, but the agegate and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can also use `all` to use all the clients - * `player_skip`: `configs` - skip any requests for client configs and use defaults - * `include_live_dash`: Include live dash formats (These formats don't download properly) - * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side). - * `max_comments`: Maximum amount of comments to download (default all). - * `max_comment_depth`: Maximum depth for nested comments. YouTube supports depths 1 or 2 (default). -* **funimation** - * `language`: Languages to extract. Eg: `funimation:language=english,japanese` - * `version`: The video version to extract - `uncut` or `simulcast` +#### youtube +* `skip`: `hls` or `dash` (or both) to skip download of the respective manifests +* `player_client`: Clients to extract video data from. 
The main clients are `web`, `android`, `ios`, `mweb`. These also have `_music`, `_embedded`, `_agegate`, and `_creator` variants (Eg: `web_embedded`) (`mweb` has only `_agegate`). By default, `android,web` is used, but the agegate and creator variants are added as required for age-gated videos. Similarly the music variants are added for `music.youtube.com` urls. You can also use `all` to use all the clients, and `default` for the default clients. +* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details +* `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) +* `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) +* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`. + * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total. +* `max_comment_depth` Maximum depth for nested comments. YouTube supports depths 1 or 2 (default) + * **Deprecated**: Set `max-replies` to `0` or `all` in `max_comments` instead (e.g. `max_comments=all,all,0` to get no replies) -* **vikiChannel** - * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` +#### youtubetab (YouTube playlists, channels, feeds, etc.) 
+* `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) + +#### funimation +* `language`: Languages to extract. Eg: `funimation:language=english,japanese` +* `version`: The video version to extract - `uncut` or `simulcast` + +#### crunchyroll +* `language`: Languages to extract. Eg: `crunchyroll:language=jaJp` +* `hardsub`: Which hard-sub versions to extract. Eg: `crunchyroll:hardsub=None,enUS` + +#### vikichannel +* `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` + +#### youtubewebarchive +* `check_all`: Try to check more at the cost of more requests. One or more of `thumbnails`, `captures` + +#### gamejolt +* `comment_sort`: `hot` (default), `you` (cookies needed), `top`, `new` - choose comment sorting mode (on GameJolt's side) NOTE: These options may be changed/removed in the future without concern for backward compatibility +<!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE --> + # PLUGINS -Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example. +Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`; where `<root-dir>` is the directory of the binary (`<root-dir>/yt-dlp`), or the root directory of the module if you are running directly from source-code (`<root dir>/yt_dlp/__main__.py`). 
Plugins are currently not supported for the `pip` version -**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/yt-dlp`), or the root directory of the module if you are running directly from source-code (`<root dir>/yt_dlp/__main__.py`) +Plugins can be of `<type>`s `extractor` or `postprocessor`. Extractor plugins do not need to be enabled from the CLI and are automatically invoked when the input URL is suitable for it. Postprocessor plugins can be invoked using `--use-postprocessor NAME`. + +See [ytdlp_plugins](ytdlp_plugins) for example plugins. + +Note that **all** plugins are imported even if not invoked, and that **there are no checks** performed on plugin code. Use plugins at your own risk and only if you trust the code + +If you are a plugin author, add [ytdlp-plugins](https://github.com/topics/ytdlp-plugins) as a topic to your repository for discoverability + + + +# EMBEDDING YT-DLP + +yt-dlp makes the best effort to be a good command-line program, and thus should be callable from any programming language. + +Your program should avoid parsing the normal stdout since it may change in future versions. Instead, it should use options such as `-J`, `--print`, `--progress-template`, `--exec` etc. to create console output that you can reliably reproduce and parse. + +From a Python program, you can embed yt-dlp in a more powerful fashion, like this: + +```python +from yt_dlp import YoutubeDL + +ydl_opts = {'format': 'bestaudio'} +with YoutubeDL(ydl_opts) as ydl: + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) +``` + +Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L162).
+ +Here's a more complete example demonstrating various functionality: + +```python +import json +import yt_dlp + + +class MyLogger: + def debug(self, msg): + # For compatibility with youtube-dl, both debug and info are passed into debug + # You can distinguish them by the prefix '[debug] ' + if msg.startswith('[debug] '): + pass + else: + self.info(msg) + + def info(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + print(msg) + + +# ℹ️ See the docstring of yt_dlp.postprocessor.common.PostProcessor +class MyCustomPP(yt_dlp.postprocessor.PostProcessor): + # ℹ️ See docstring of yt_dlp.postprocessor.common.PostProcessor.run + def run(self, info): + self.to_screen('Doing stuff') + return [], info + + +# ℹ️ See "progress_hooks" in the docstring of yt_dlp.YoutubeDL +def my_hook(d): + if d['status'] == 'finished': + print('Done downloading, now converting ...') + + +def format_selector(ctx): + """ Select the best video and the best audio that won't result in an mkv. 
+ This is just an example and does not handle all cases """ + + # formats are already sorted worst to best + formats = ctx.get('formats')[::-1] + + # acodec='none' means there is no audio + best_video = next(f for f in formats + if f['vcodec'] != 'none' and f['acodec'] == 'none') + + # find compatible audio extension + audio_ext = {'mp4': 'm4a', 'webm': 'webm'}[best_video['ext']] + # vcodec='none' means there is no video + best_audio = next(f for f in formats if ( + f['acodec'] != 'none' and f['vcodec'] == 'none' and f['ext'] == audio_ext)) + + yield { + # These are the minimum required fields for a merged format + 'format_id': f'{best_video["format_id"]}+{best_audio["format_id"]}', + 'ext': best_video['ext'], + 'requested_formats': [best_video, best_audio], + # Must be + separated list of protocols + 'protocol': f'{best_video["protocol"]}+{best_audio["protocol"]}' + } + + +# ℹ️ See docstring of yt_dlp.YoutubeDL for a description of the options +ydl_opts = { + 'format': format_selector, + 'postprocessors': [{ + # Embed metadata in video using ffmpeg. + # ℹ️ See yt_dlp.postprocessor.FFmpegMetadataPP for the arguments it accepts + 'key': 'FFmpegMetadata', + 'add_chapters': True, + 'add_metadata': True, + }], + 'logger': MyLogger(), + 'progress_hooks': [my_hook], +} + + +# Add custom headers +yt_dlp.utils.std_headers.update({'Referer': 'https://www.google.com'}) + +# ℹ️ See the public functions in yt_dlp.YoutubeDL for other available functions.
+# Eg: "ydl.download", "ydl.download_with_info_file" +with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.add_post_processor(MyCustomPP()) + info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc') + + # ℹ️ ydl.sanitize_info makes the info json-serializable + print(json.dumps(ydl.sanitize_info(info))) +``` + +**Tip**: If you are porting your code from youtube-dl to yt-dlp, one important point to look out for is that we do not guarantee the return value of `YoutubeDL.extract_info` to be json serializable, or even be a dictionary. It will be dictionary-like, but if you want to ensure it is a serializable dictionary, pass it through `YoutubeDL.sanitize_info` as shown in the example above + + +<!-- MANPAGE: MOVE "NEW FEATURES" SECTION HERE --> # DEPRECATED OPTIONS @@ -1451,12 +1798,12 @@ #### Not recommended --print-json -j --no-simulate --autonumber-size NUMBER Use string formatting. Eg: %(autonumber)03d --autonumber-start NUMBER Use internal field formatting like %(autonumber+NUMBER)s + --id -o "%(id)s.%(ext)s" --metadata-from-title FORMAT --parse-metadata "%(title)s:FORMAT" --hls-prefer-native --downloader "m3u8:native" --hls-prefer-ffmpeg --downloader "m3u8:ffmpeg" --list-formats-old --compat-options list-formats (Alias: --no-list-formats-as-table) --list-formats-as-table --compat-options -list-formats [Default] (Alias: --no-list-formats-old) - --sponskrub-args ARGS --ppa "sponskrub:ARGS" --youtube-skip-dash-manifest --extractor-args "youtube:skip=dash" (Alias: --no-youtube-include-dash-manifest) --youtube-skip-hls-manifest --extractor-args "youtube:skip=hls" (Alias: --no-youtube-include-hls-manifest) --youtube-include-dash-manifest Default (Alias: --no-youtube-skip-dash-manifest) @@ -1491,6 +1838,18 @@ #### Old aliases --write-srt --write-subs --yes-overwrites --force-overwrites +#### Sponskrub Options +Support for [SponSkrub](https://github.com/faissaloo/SponSkrub) has been deprecated in favor of the `--sponsorblock` options + + --sponskrub 
--sponsorblock-mark all + --no-sponskrub --no-sponsorblock + --sponskrub-cut --sponsorblock-remove all + --no-sponskrub-cut --sponsorblock-remove -all + --sponskrub-force Not applicable + --no-sponskrub-force Not applicable + --sponskrub-location Not applicable + --sponskrub-args Not applicable + #### No longer supported These options may no longer work as intended @@ -1500,15 +1859,18 @@ #### No longer supported --no-call-home Default --include-ads No longer supported --no-include-ads Default + --write-annotations No supported site has annotations now + --no-write-annotations Default #### Removed These options were deprecated since 2014 and have now been entirely removed - --id -o "%(id)s.%(ext)s" -A, --auto-number -o "%(autonumber)s-%(id)s.%(ext)s" -t, --title -o "%(title)s-%(id)s.%(ext)s" -l, --literal -o accepts literal names +# CONTRIBUTING +See [CONTRIBUTING.md](CONTRIBUTING.md#contributing-to-yt-dlp) for instructions on [Opening an Issue](CONTRIBUTING.md#opening-an-issue) and [Contributing code to the project](CONTRIBUTING.md#developer-instructions) # MORE -For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq) +For FAQ see the [youtube-dl README](https://github.com/ytdl-org/youtube-dl#faq) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index 036e2e767c..da89e070de 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -1,9 +1,15 @@ # coding: utf-8 import re +from ..utils import bug_reports_message, write_string + class LazyLoadMetaClass(type): def __getattr__(cls, name): + if '_real_class' not in cls.__dict__: + write_string( + f'WARNING: Falling back to normal extractor since lazy extractor ' + f'{cls.__name__} does not have attribute {name}{bug_reports_message()}') return getattr(cls._get_real_class(), name) @@ -13,10 +19,10 @@ class LazyLoadExtractor(metaclass=LazyLoadMetaClass): @classmethod def _get_real_class(cls): - if 
'__real_class' not in cls.__dict__: + if '_real_class' not in cls.__dict__: mod = __import__(cls._module, fromlist=(cls.__name__,)) - cls.__real_class = getattr(mod, cls.__name__) - return cls.__real_class + cls._real_class = getattr(mod, cls.__name__) + return cls._real_class def __new__(cls, *args, **kwargs): real_cls = cls._get_real_class() diff --git a/devscripts/make_contributing.py b/devscripts/make_contributing.py index c7f3eef761..6b1b8219c4 100755 --- a/devscripts/make_contributing.py +++ b/devscripts/make_contributing.py @@ -1,33 +1,34 @@ #!/usr/bin/env python3 from __future__ import unicode_literals -# import io +import io import optparse -# import re +import re def main(): + return # This is unused in yt-dlp + parser = optparse.OptionParser(usage='%prog INFILE OUTFILE') options, args = parser.parse_args() if len(args) != 2: parser.error('Expected an input and an output filename') - -""" infile, outfile = args + infile, outfile = args with io.open(infile, encoding='utf-8') as inf: readme = inf.read() - bug_text = re.search( """ -# r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1) -# dev_text = re.search( -# r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING yt-dlp', -""" readme).group(1) + bug_text = re.search( + r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1) + dev_text = re.search( + r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING yt-dlp', readme).group(1) out = bug_text + dev_text with io.open(outfile, 'w', encoding='utf-8') as outf: - outf.write(out) """ + outf.write(out) + if __name__ == '__main__': main() diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index e7b024490c..b58fb85e35 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -7,11 +7,9 @@ from os.path import dirname as dirn import sys -print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) - sys.path.insert(0, 
dirn(dirn((os.path.abspath(__file__))))) -lazy_extractors_filename = sys.argv[1] +lazy_extractors_filename = sys.argv[1] if len(sys.argv) > 1 else 'yt_dlp/extractor/lazy_extractors.py' if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) @@ -41,12 +39,6 @@ class {name}({bases}): _module = '{module}' ''' -make_valid_template = ''' - @classmethod - def _make_valid_url(cls): - return {valid_url!r} -''' - def get_base_name(base): if base is InfoExtractor: @@ -63,15 +55,14 @@ def build_lazy_ie(ie, name): bases=', '.join(map(get_base_name, ie.__bases__)), module=ie.__module__) valid_url = getattr(ie, '_VALID_URL', None) + if not valid_url and hasattr(ie, '_make_valid_url'): + valid_url = ie._make_valid_url() if valid_url: s += f' _VALID_URL = {valid_url!r}\n' if not ie._WORKING: s += ' _WORKING = False\n' if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: s += f'\n{getsource(ie.suitable)}' - if hasattr(ie, '_make_valid_url'): - # search extractors - s += make_valid_template.format(valid_url=ie._make_valid_url()) return s diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 17a34843fd..4c11e25f28 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -29,6 +29,9 @@ def gen_ies_md(ies): continue if ie_desc is not None: ie_md += ': {0}'.format(ie.IE_DESC) + search_key = getattr(ie, 'SEARCH_KEY', None) + if search_key is not None: + ie_md += f'; "{ie.SEARCH_KEY}:" prefix' if not ie.working(): ie_md += ' (Currently broken)' yield ie_md diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 485b39e9f3..069d99eeb4 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -13,12 +13,14 @@ # NAME -youtube\-dl \- download videos from youtube.com or other video platforms +yt\-dlp \- A youtube-dl fork with additional features and patches # SYNOPSIS **yt-dlp** \[OPTIONS\] URL [URL...] 
+# DESCRIPTION + ''' @@ -33,47 +35,63 @@ def main(): with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() - readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) - readme = re.sub(r'\s+yt-dlp \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) - readme = PREFIX + readme - + readme = filter_excluded_sections(readme) + readme = move_sections(readme) readme = filter_options(readme) with io.open(outfile, 'w', encoding='utf-8') as outf: - outf.write(readme) + outf.write(PREFIX + readme) + + +def filter_excluded_sections(readme): + EXCLUDED_SECTION_BEGIN_STRING = re.escape('<!-- MANPAGE: BEGIN EXCLUDED SECTION -->') + EXCLUDED_SECTION_END_STRING = re.escape('<!-- MANPAGE: END EXCLUDED SECTION -->') + return re.sub( + rf'(?s){EXCLUDED_SECTION_BEGIN_STRING}.+?{EXCLUDED_SECTION_END_STRING}\n', + '', readme) + + +def move_sections(readme): + MOVE_TAG_TEMPLATE = '<!-- MANPAGE: MOVE "%s" SECTION HERE -->' + sections = re.findall(r'(?m)^%s$' % ( + re.escape(MOVE_TAG_TEMPLATE).replace(r'\%', '%') % '(.+)'), readme) + + for section_name in sections: + move_tag = MOVE_TAG_TEMPLATE % section_name + if readme.count(move_tag) > 1: + raise Exception(f'There is more than one occurrence of "{move_tag}". 
This is unexpected') + + sections = re.findall(rf'(?sm)(^# {re.escape(section_name)}.+?)(?=^# )', readme) + if len(sections) < 1: + raise Exception(f'The section {section_name} does not exist') + elif len(sections) > 1: + raise Exception(f'There are multiple occurrences of section {section_name}, this is unhandled') + + readme = readme.replace(sections[0], '', 1).replace(move_tag, sections[0], 1) + return readme def filter_options(readme): - ret = '' - in_options = False - for line in readme.split('\n'): - if line.startswith('# '): - if line[2:].startswith('OPTIONS'): - in_options = True - else: - in_options = False + section = re.search(r'(?sm)^# USAGE AND OPTIONS\n.+?(?=^# )', readme).group(0) + options = '# OPTIONS\n' + for line in section.split('\n')[1:]: + if line.lstrip().startswith('-'): + split = re.split(r'\s{2,}', line.lstrip()) + # Description string may start with `-` as well. If there is + # only one piece then it's a description bit not an option. + if len(split) > 1: + option, description = split + split_option = option.split(' ') - if in_options: - if line.lstrip().startswith('-'): - split = re.split(r'\s{2,}', line.lstrip()) - # Description string may start with `-` as well. If there is - # only one piece then it's a description bit not an option. - if len(split) > 1: - option, description = split - split_option = option.split(' ') + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + [f'*{split_option[-1]}*']) - if not split_option[-1].startswith('-'): # metavar - option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + # Pandoc's definition_lists. See http://pandoc.org/README.html + options += f'\n{option}\n: {description}\n' + continue + options += line.lstrip() + '\n' - # Pandoc's definition_lists. See http://pandoc.org/README.html - # for more information. 
- ret += '\n%s\n: %s\n' % (option, description) - continue - ret += line.lstrip() + '\n' - else: - ret += line + '\n' - - return ret + return readme.replace(section, options, 1) if __name__ == '__main__': diff --git a/devscripts/run_tests.bat b/devscripts/run_tests.bat index f12ae1c1b2..b8bb393d93 100644 --- a/devscripts/run_tests.bat +++ b/devscripts/run_tests.bat @@ -3,11 +3,11 @@ cd /d %~dp0.. if ["%~1"]==[""] ( - set "test_set=" + set "test_set="test"" ) else if ["%~1"]==["core"] ( - set "test_set=-k "not download"" + set "test_set="-m not download"" ) else if ["%~1"]==["download"] ( - set "test_set=-k download" + set "test_set="-m "download"" ) else ( echo.Invalid test type "%~1". Use "core" ^| "download" exit /b 1 diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh index fb405b5698..c9a75ba006 100755 --- a/devscripts/run_tests.sh +++ b/devscripts/run_tests.sh @@ -3,12 +3,12 @@ if [ -z $1 ]; then test_set='test' elif [ $1 = 'core' ]; then - test_set='not download' + test_set="-m not download" elif [ $1 = 'download' ]; then - test_set='download' + test_set="-m download" else echo 'Invalid test type "'$1'". 
Use "core" | "download"' exit 1 fi -python3 -m pytest -k "$test_set" +python3 -m pytest "$test_set" diff --git a/devscripts/update-formulae.py b/devscripts/update-formulae.py new file mode 100644 index 0000000000..41bc1ac7a3 --- /dev/null +++ b/devscripts/update-formulae.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +from __future__ import unicode_literals + +import json +import os +import re +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from yt_dlp.compat import compat_urllib_request + + +# usage: python3 ./devscripts/update-formulae.py <path-to-formulae-rb> <version> +# version can be either 0-aligned (yt-dlp version) or normalized (PyPI version) + +filename, version = sys.argv[1:] + +normalized_version = '.'.join(str(int(x)) for x in version.split('.')) + +pypi_release = json.loads(compat_urllib_request.urlopen( + 'https://pypi.org/pypi/yt-dlp/%s/json' % normalized_version +).read().decode('utf-8')) + +tarball_file = next(x for x in pypi_release['urls'] if x['filename'].endswith('.tar.gz')) + +sha256sum = tarball_file['digests']['sha256'] +url = tarball_file['url'] + +with open(filename, 'r') as r: + formulae_text = r.read() + +formulae_text = re.sub(r'sha256 "[0-9a-f]*?"', 'sha256 "%s"' % sha256sum, formulae_text) +formulae_text = re.sub(r'url "[^"]*?"', 'url "%s"' % url, formulae_text) + +with open(filename, 'w') as w: + w.write(formulae_text) diff --git a/devscripts/update-version.py b/devscripts/update-version.py index 2d1673d0e2..0ee7bf2916 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -1,33 +1,42 @@ #!/usr/bin/env python3 -from __future__ import unicode_literals - from datetime import datetime -# import urllib.request +import sys +import subprocess -# response = urllib.request.urlopen('https://blackjack4494.github.io/youtube-dlc/update/LATEST_VERSION') -# old_version = response.read().decode('utf-8') -exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py',
'exec')) +with open('yt_dlp/version.py', 'rt') as f: + exec(compile(f.read(), 'yt_dlp/version.py', 'exec')) old_version = locals()['__version__'] -old_version_list = old_version.split(".", 4) +old_version_list = old_version.split('.') old_ver = '.'.join(old_version_list[:3]) old_rev = old_version_list[3] if len(old_version_list) > 3 else '' ver = datetime.utcnow().strftime("%Y.%m.%d") -rev = str(int(old_rev or 0) + 1) if old_ver == ver else '' + +rev = (sys.argv[1:] or [''])[0] # Use first argument, if present as revision number +if not rev: + rev = str(int(old_rev or 0) + 1) if old_ver == ver else '' VERSION = '.'.join((ver, rev)) if rev else ver -# VERSION_LIST = [(int(v) for v in ver.split(".") + [rev or 0])] + +try: + sp = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE) + GIT_HEAD = sp.communicate()[0].decode().strip() or None +except Exception: + GIT_HEAD = None + +VERSION_FILE = f'''\ +# Autogenerated by devscripts/update-version.py + +__version__ = {VERSION!r} + +RELEASE_GIT_HEAD = {GIT_HEAD!r} +''' + +with open('yt_dlp/version.py', 'wt') as f: + f.write(VERSION_FILE) print('::set-output name=ytdlp_version::' + VERSION) - -file_version_py = open('yt_dlp/version.py', 'rt') -data = file_version_py.read() -data = data.replace(old_version, VERSION) -file_version_py.close() - -file_version_py = open('yt_dlp/version.py', 'wt') -file_version_py.write(data) -file_version_py.close() +print(f'\nVersion = {VERSION}, Git HEAD = {GIT_HEAD}') diff --git a/docs/Contributing.md b/docs/Contributing.md new file mode 100644 index 0000000000..60fe469097 --- /dev/null +++ b/docs/Contributing.md @@ -0,0 +1,5 @@ +--- +orphan: true +--- +```{include} ../Contributing.md +``` diff --git a/pyinst.py b/pyinst.py index fb8eca3e5c..f135ec90d2 100644 --- a/pyinst.py +++ b/pyinst.py @@ -1,82 +1,135 @@ #!/usr/bin/env python3 # coding: utf-8 - -from __future__ import unicode_literals -import sys -# import os +import os import platform - +import sys from 
PyInstaller.utils.hooks import collect_submodules -from PyInstaller.utils.win32.versioninfo import ( - VarStruct, VarFileInfo, StringStruct, StringTable, - StringFileInfo, FixedFileInfo, VSVersionInfo, SetVersion, -) -import PyInstaller.__main__ -arch = sys.argv[1] if len(sys.argv) > 1 else platform.architecture()[0][:2] -assert arch in ('32', '64') -print('Building %sbit version' % arch) -_x86 = '_x86' if arch == '32' else '' -FILE_DESCRIPTION = 'yt-dlp%s' % (' (32 Bit)' if _x86 else '') +OS_NAME = platform.system() +if OS_NAME == 'Windows': + from PyInstaller.utils.win32.versioninfo import ( + VarStruct, VarFileInfo, StringStruct, StringTable, + StringFileInfo, FixedFileInfo, VSVersionInfo, SetVersion, + ) +elif OS_NAME == 'Darwin': + pass +else: + raise Exception('{OS_NAME} is not supported') -# root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -# print('Changing working directory to %s' % root_dir) -# os.chdir(root_dir) +ARCH = platform.architecture()[0][:2] -exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec')) -VERSION = locals()['__version__'] -VERSION_LIST = VERSION.split('.') -VERSION_LIST = list(map(int, VERSION_LIST)) + [0] * (4 - len(VERSION_LIST)) +def main(): + opts = parse_options() + version = read_version() -print('Version: %s%s' % (VERSION, _x86)) -print('Remember to update the version using devscipts\\update-version.py') + suffix = '_macos' if OS_NAME == 'Darwin' else '_x86' if ARCH == '32' else '' + final_file = 'dist/%syt-dlp%s%s' % ( + 'yt-dlp/' if '--onedir' in opts else '', suffix, '.exe' if OS_NAME == 'Windows' else '') -VERSION_FILE = VSVersionInfo( - ffi=FixedFileInfo( - filevers=VERSION_LIST, - prodvers=VERSION_LIST, - mask=0x3F, - flags=0x0, - OS=0x4, - fileType=0x1, - subtype=0x0, - date=(0, 0), - ), - kids=[ - StringFileInfo([ - StringTable( - '040904B0', [ - StringStruct('Comments', 'yt-dlp%s Command Line Interface.' 
% _x86), - StringStruct('CompanyName', 'https://github.com/yt-dlp'), - StringStruct('FileDescription', FILE_DESCRIPTION), - StringStruct('FileVersion', VERSION), - StringStruct('InternalName', 'yt-dlp%s' % _x86), - StringStruct( - 'LegalCopyright', - 'pukkandan.ytdlp@gmail.com | UNLICENSE', - ), - StringStruct('OriginalFilename', 'yt-dlp%s.exe' % _x86), - StringStruct('ProductName', 'yt-dlp%s' % _x86), - StringStruct( - 'ProductVersion', - '%s%s on Python %s' % (VERSION, _x86, platform.python_version())), - ])]), - VarFileInfo([VarStruct('Translation', [0, 1200])]) + print(f'Building yt-dlp v{version} {ARCH}bit for {OS_NAME} with options {opts}') + print('Remember to update the version using "devscripts/update-version.py"') + if not os.path.isfile('yt_dlp/extractor/lazy_extractors.py'): + print('WARNING: Building without lazy_extractors. Run ' + '"devscripts/make_lazy_extractors.py" to build lazy extractors', file=sys.stderr) + print(f'Destination: {final_file}\n') + + opts = [ + f'--name=yt-dlp{suffix}', + '--icon=devscripts/logo.ico', + '--upx-exclude=vcruntime140.dll', + '--noconfirm', + *dependency_options(), + *opts, + 'yt_dlp/__main__.py', ] -) + print(f'Running PyInstaller with {opts}') -dependancies = ['Crypto', 'mutagen'] + collect_submodules('websockets') -excluded_modules = ['test', 'ytdlp_plugins', 'youtube-dl', 'youtube-dlc'] + import PyInstaller.__main__ -PyInstaller.__main__.run([ - '--name=yt-dlp%s' % _x86, - '--onefile', - '--icon=devscripts/logo.ico', - *[f'--exclude-module={module}' for module in excluded_modules], - *[f'--hidden-import={module}' for module in dependancies], - '--upx-exclude=vcruntime140.dll', - 'yt_dlp/__main__.py', -]) -SetVersion('dist/yt-dlp%s.exe' % _x86, VERSION_FILE) + PyInstaller.__main__.run(opts) + + set_version_info(final_file, version) + + +def parse_options(): + # Compatibility with older arguments + opts = sys.argv[1:] + if opts[0:1] in (['32'], ['64']): + if ARCH != opts[0]: + raise Exception(f'{opts[0]}bit
executable cannot be built on a {ARCH}bit system') + opts = opts[1:] + return opts or ['--onefile'] + + +def read_version(): + exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec')) + return locals()['__version__'] + + +def version_to_list(version): + version_list = version.split('.') + return list(map(int, version_list)) + [0] * (4 - len(version_list)) + + +def dependency_options(): + dependencies = [pycryptodome_module(), 'mutagen'] + collect_submodules('websockets') + excluded_modules = ['test', 'ytdlp_plugins', 'youtube-dl', 'youtube-dlc'] + + yield from (f'--hidden-import={module}' for module in dependencies) + yield from (f'--exclude-module={module}' for module in excluded_modules) + + +def pycryptodome_module(): + try: + import Cryptodome # noqa: F401 + except ImportError: + try: + import Crypto # noqa: F401 + print('WARNING: Using Crypto since Cryptodome is not available. ' + 'Install with: pip install pycryptodomex', file=sys.stderr) + return 'Crypto' + except ImportError: + pass + return 'Cryptodome' + + +def set_version_info(exe, version): + if OS_NAME == 'Windows': + windows_set_version(exe, version) + + +def windows_set_version(exe, version): + version_list = version_to_list(version) + suffix = '_x86' if ARCH == '32' else '' + SetVersion(exe, VSVersionInfo( + ffi=FixedFileInfo( + filevers=version_list, + prodvers=version_list, + mask=0x3F, + flags=0x0, + OS=0x4, + fileType=0x1, + subtype=0x0, + date=(0, 0), + ), + kids=[ + StringFileInfo([StringTable('040904B0', [ + StringStruct('Comments', 'yt-dlp%s Command Line Interface.' 
% suffix), + StringStruct('CompanyName', 'https://github.com/yt-dlp'), + StringStruct('FileDescription', 'yt-dlp%s' % (' (32 Bit)' if ARCH == '32' else '')), + StringStruct('FileVersion', version), + StringStruct('InternalName', f'yt-dlp{suffix}'), + StringStruct('LegalCopyright', 'pukkandan.ytdlp@gmail.com | UNLICENSE'), + StringStruct('OriginalFilename', f'yt-dlp{suffix}.exe'), + StringStruct('ProductName', f'yt-dlp{suffix}'), + StringStruct( + 'ProductVersion', f'{version}{suffix} on Python {platform.python_version()}'), + ])]), VarFileInfo([VarStruct('Translation', [0, 1200])]) + ] + )) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt index 6a982fa369..cecd08eae8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ mutagen -pycryptodome +pycryptodomex websockets diff --git a/setup.py b/setup.py index d54806f151..f08ae2309d 100644 --- a/setup.py +++ b/setup.py @@ -1,52 +1,86 @@ #!/usr/bin/env python3 # coding: utf-8 - -from setuptools import setup, Command, find_packages import os.path import warnings import sys -from distutils.spawn import spawn +try: + from setuptools import setup, Command, find_packages + setuptools_available = True +except ImportError: + from distutils.core import setup, Command + setuptools_available = False +from distutils.spawn import spawn # Get the version from yt_dlp/version.py without importing the package exec(compile(open('yt_dlp/version.py').read(), 'yt_dlp/version.py', 'exec')) -DESCRIPTION = 'Command-line program to download videos from YouTube.com and many other other video platforms.' 
+DESCRIPTION = 'A youtube-dl fork with additional features and patches' LONG_DESCRIPTION = '\n\n'.join(( 'Official repository: <https://github.com/yt-dlp/yt-dlp>', '**PS**: Some links in this document will not work since this is a copy of the README.md from Github', open('README.md', 'r', encoding='utf-8').read())) -REQUIREMENTS = ['mutagen', 'pycryptodome', 'websockets'] +REQUIREMENTS = ['mutagen', 'pycryptodomex', 'websockets'] + if sys.argv[1:2] == ['py2exe']: - raise NotImplementedError('py2exe is not currently supported; instead, use "pyinst.py" to build with pyinstaller') + import py2exe + warnings.warn( + 'py2exe builds do not support pycryptodomex and needs VC++14 to run. ' + 'The recommended way is to use "pyinst.py" to build using pyinstaller') + params = { + 'console': [{ + 'script': './yt_dlp/__main__.py', + 'dest_base': 'yt-dlp', + 'version': __version__, + 'description': DESCRIPTION, + 'comments': LONG_DESCRIPTION.split('\n')[0], + 'product_name': 'yt-dlp', + 'product_version': __version__, + }], + 'options': { + 'py2exe': { + 'bundle_files': 0, + 'compressed': 1, + 'optimize': 2, + 'dist_dir': './dist', + 'excludes': ['Crypto', 'Cryptodome'], # py2exe cannot import Crypto + 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], + } + }, + 'zipfile': None + } +else: + files_spec = [ + ('share/bash-completion/completions', ['completions/bash/yt-dlp']), + ('share/zsh/site-functions', ['completions/zsh/_yt-dlp']), + ('share/fish/vendor_completions.d', ['completions/fish/yt-dlp.fish']), + ('share/doc/yt_dlp', ['README.txt']), + ('share/man/man1', ['yt-dlp.1']) + ] + root = os.path.dirname(os.path.abspath(__file__)) + data_files = [] + for dirname, files in files_spec: + resfiles = [] + for fn in files: + if not os.path.exists(fn): + warnings.warn('Skipping file %s since it is not present. 
Try running `make pypi-files` first' % fn) + else: + resfiles.append(fn) + data_files.append((dirname, resfiles)) -files_spec = [ - ('share/bash-completion/completions', ['completions/bash/yt-dlp']), - ('share/zsh/site-functions', ['completions/zsh/_yt-dlp']), - ('share/fish/vendor_completions.d', ['completions/fish/yt-dlp.fish']), - ('share/doc/yt_dlp', ['README.txt']), - ('share/man/man1', ['yt-dlp.1']) -] -root = os.path.dirname(os.path.abspath(__file__)) -data_files = [] -for dirname, files in files_spec: - resfiles = [] - for fn in files: - if not os.path.exists(fn): - warnings.warn('Skipping file %s since it is not present. Try running `make pypi-files` first' % fn) - else: - resfiles.append(fn) - data_files.append((dirname, resfiles)) + params = { + 'data_files': data_files, + } -params = { - 'data_files': data_files, -} -params['entry_points'] = {'console_scripts': ['yt-dlp = yt_dlp:main']} + if setuptools_available: + params['entry_points'] = {'console_scripts': ['yt-dlp = yt_dlp:main']} + else: + params['scripts'] = ['yt-dlp'] class build_lazy_extractors(Command): @@ -64,7 +98,11 @@ def run(self): dry_run=self.dry_run) -packages = find_packages(exclude=('youtube_dl', 'test', 'ytdlp_plugins')) +if setuptools_available: + packages = find_packages(exclude=('youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins')) +else: + packages = ['yt_dlp', 'yt_dlp.downloader', 'yt_dlp.extractor', 'yt_dlp.postprocessor'] + setup( name='yt-dlp', @@ -81,7 +119,7 @@ def run(self): 'Documentation': 'https://yt-dlp.readthedocs.io', 'Source': 'https://github.com/yt-dlp/yt-dlp', 'Tracker': 'https://github.com/yt-dlp/yt-dlp/issues', - #'Funding': 'https://donate.pypi.org', + 'Funding': 'https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators', }, classifiers=[ 'Topic :: Multimedia :: Video', diff --git a/supportedsites.md b/supportedsites.md index 7e19b324c4..9dc94f27d3 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1,4 +1,6 @@ # Supported sites + - 
**17live** + - **17live:clip** - **1tv**: Первый канал - **20min** - **220.ro** @@ -19,6 +21,7 @@ # Supported sites - **9now.com.au** - **abc.net.au** - **abc.net.au:iview** + - **abc.net.au:iview:showseries** - **abcnews** - **abcnews:video** - **abcotvs**: ABC Owned Television Stations @@ -46,10 +49,12 @@ # Supported sites - **Alura** - **AluraCourse** - **Amara** + - **AmazonStore** - **AMCNetworks** - **AmericasTestKitchen** - **AmericasTestKitchenSeason** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl + - **AnimalPlanet** - **AnimeLab** - **AnimeLabShows** - **AnimeOnDemand** @@ -97,6 +102,7 @@ # Supported sites - **Bandcamp:weekly** - **BandcampMusic** - **bangumi.bilibili.com**: BiliBili番剧 + - **BannedVideo** - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles @@ -118,11 +124,14 @@ # Supported sites - **Bigflix** - **Bild**: Bild.de - **BiliBili** + - **Bilibili category extractor** - **BilibiliAudio** - **BilibiliAudioAlbum** - **BilibiliChannel** - **BiliBiliPlayer** - - **BiliBiliSearch**: Bilibili video search, "bilisearch" keyword + - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix + - **BiliIntl** + - **BiliIntlSeries** - **BioBioChileTV** - **Biography** - **BIQLE** @@ -133,6 +142,7 @@ # Supported sites - **BlackboardCollaborate** - **BleacherReport** - **BleacherReportCMS** + - **blogger.com** - **Bloomberg** - **BokeCC** - **BongaCams** @@ -142,6 +152,7 @@ # Supported sites - **BR**: Bayerischer Rundfunk - **BravoTV** - **Break** + - **BreitBart** - **brightcove:legacy** - **brightcove:new** - **BRMediathek**: Bayerischer Rundfunk Mediathek @@ -150,11 +161,13 @@ # Supported sites - **BusinessInsider** - **BuzzFeed** - **BYUtv** + - **CableAV** + - **CAM4** - **Camdemy** - **CamdemyFolder** - **CamModels** - - **CamTube** - **CamWithHer** + - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr - **Canvas** @@ -163,10 +176,7 @@ # Supported sites - 
**CarambaTVPage** - **CartoonNetwork** - **cbc.ca** - - **cbc.ca:olympics** - **cbc.ca:player** - - **cbc.ca:watch** - - **cbc.ca:watch:video** - **CBS** - **CBSInteractive** - **CBSLocal** @@ -180,11 +190,13 @@ # Supported sites - **CCTV**: 央视网 - **CDA** - **CeskaTelevize** - - **CeskaTelevizePorady** + - **CGTN** - **channel9**: Channel 9 - **CharlieRose** - **Chaturbate** - **Chilloutzone** + - **Chingari** + - **ChingariUser** - **chirbit** - **chirbit:profile** - **cielotv.it** @@ -192,6 +204,7 @@ # Supported sites - **Cinemax** - **CiscoLiveSearch** - **CiscoLiveSession** + - **ciscowebex**: Cisco Webex - **CJSW** - **cliphunter** - **Clippit** @@ -214,26 +227,32 @@ # Supported sites - **CONtv** - **Corus** - **Coub** + - **CozyTV** + - **cp24** - **Cracked** - **Crackle** - **CrooksAndLiars** - **crunchyroll** + - **crunchyroll:beta** - **crunchyroll:playlist** + - **crunchyroll:playlist:beta** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTV** - **CTVNews** - **cu.ntv.co.jp**: Nippon Television Network - - **Culturebox** - **CultureUnplugged** - **curiositystream** - - **curiositystream:collection** + - **curiositystream:collections** + - **curiositystream:series** - **CWTV** - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** + - **damtomo:record** + - **damtomo:video** - **daum.net** - **daum.net:clip** - **daum.net:playlist** @@ -255,8 +274,11 @@ # Supported sites - **DiscoveryPlus** - **DiscoveryPlusIndia** - **DiscoveryPlusIndiaShow** + - **DiscoveryPlusItaly** + - **DiscoveryPlusItalyShow** - **DiscoveryVR** - **Disney** + - **DIYNetwork** - **dlive:stream** - **dlive:vod** - **DoodStream** @@ -267,6 +289,8 @@ # Supported sites - **DPlay** - **DRBonanza** - **Dropbox** + - **Dropout** + - **DropoutSeason** - **DrTuber** - **drtv** - **drtv:live** @@ -295,14 +319,18 @@ # Supported sites - **Embedly** - **EMPFlix** - **Engadget** + - **Epicon** + - **EpiconSeries** - **Eporner** 
- **EroProfile** - **EroProfile:album** - **Escapist** - **ESPN** - **ESPNArticle** + - **ESPNCricInfo** - **EsriVideo** - **Europa** + - **EUScreen** - **EWETV** - **ExpoTV** - **Expressen** @@ -316,6 +344,7 @@ # Supported sites - **fc2** - **fc2:embed** - **Fczenit** + - **Filmmodu** - **filmon** - **filmon:channel** - **Filmweb** @@ -332,13 +361,10 @@ # Supported sites - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - **FoxSports** - - **france2.fr:generation-what** - **FranceCulture** - **FranceInter** - **FranceTV** - - **FranceTVEmbed** - **francetvinfo.fr** - - **FranceTVJeunesse** - **FranceTVSite** - **Freesound** - **freespeech.org** @@ -353,15 +379,27 @@ # Supported sites - **Funk** - **Fusion** - **Fux** + - **Gab** + - **GabTV** - **Gaia** - **GameInformer** + - **GameJolt** + - **GameJoltCommunity** + - **GameJoltGame** + - **GameJoltGameSoundtrack** + - **GameJoltSearch** + - **GameJoltUser** - **GameSpot** - **GameStar** - **Gaskrank** - **Gazeta** - **GDCVault** - **GediDigital** + - **gem.cbc.ca** + - **gem.cbc.ca:live** + - **gem.cbc.ca:playlist** - **generic**: Generic downloader that works on some sites + - **Gettr** - **Gfycat** - **GiantBomb** - **Giga** @@ -371,12 +409,16 @@ # Supported sites - **GloboArticle** - **Go** - **GodTube** + - **Gofile** - **Golem** - **google:podcasts** - **google:podcasts:feed** - **GoogleDrive** + - **GoPro** - **Goshgay** + - **GoToStage** - **GPUTechConf** + - **Gronkh** - **Groupon** - **hbo** - **HearThisAt** @@ -405,9 +447,12 @@ # Supported sites - **hrfernsehen** - **HRTi** - **HRTiPlaylist** + - **HSEProduct** + - **HSEShow** - **Huajiao**: 花椒直播 - **HuffPost**: Huffington Post - **Hungama** + - **HungamaAlbumPlaylist** - **HungamaSong** - **Hypem** - **ign.com** @@ -425,11 +470,13 @@ # Supported sites - **IndavideoEmbed** - **InfoQ** - **Instagram** - - **instagram:tag**: Instagram hashtag search + - **instagram:tag**: Instagram hashtag search URLs - **instagram:user**: Instagram 
user profile + - **InstagramIOS**: IOS instagram:// URL - **Internazionale** - **InternetVideoArchive** - **IPrima** + - **IPrimaCNN** - **iqiyi**: 爱奇艺 - **Ir90Tv** - **ITTF** @@ -460,6 +507,7 @@ # Supported sites - **KinjaEmbed** - **KinoPoisk** - **KonserthusetPlay** + - **Koo** - **KrasView**: Красвью - **Ku6** - **KUSI** @@ -498,6 +546,7 @@ # Supported sites - **LineLive** - **LineLiveChannel** - **LineTV** + - **LinkedIn** - **linkedin:learning** - **linkedin:learning:course** - **LinuxAcademy** @@ -520,6 +569,9 @@ # Supported sites - **MallTV** - **mangomolo:live** - **mangomolo:video** + - **ManotoTV**: Manoto TV (Episode) + - **ManotoTVLive**: Manoto TV (Live) + - **ManotoTVShow**: Manoto TV (Show) - **ManyVids** - **MaoriTV** - **Markiza** @@ -530,8 +582,11 @@ # Supported sites - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** + - **Mediaite** + - **MediaKlikk** - **Medialaan** - **Mediaset** + - **MediasetShow** - **Mediasite** - **MediasiteCatalog** - **MediasiteNamedCatalog** @@ -546,6 +601,7 @@ # Supported sites - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **microsoftstream**: Microsoft Stream - **mildom**: Record ongoing live by specific user in Mildom - **mildom:user:vod**: Download all VODs from specific user in Mildom - **mildom:vod**: Download a VOD in Mildom @@ -558,11 +614,13 @@ # Supported sites - **mirrativ** - **mirrativ:user** - **MiTele**: mitele.es + - **mixch** - **mixcloud** - **mixcloud:playlist** - **mixcloud:user** - **MLB** - **MLBVideo** + - **MLSSoccer** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -588,6 +646,7 @@ # Supported sites - **mtvservices:embedded** - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv + - **MuseScore** - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses - **Mwave** @@ -604,6 +663,10 @@ # Supported sites - **MyviEmbed** - **MyVisionTV** - **n-tv.de** + - **N1Info:article** + - 
**N1InfoAsset** + - **Nate** + - **NateProgram** - **natgeo:video** - **NationalGeographicTV** - **Naver** @@ -626,6 +689,7 @@ # Supported sites - **ndr:embed:base** - **NDTV** - **Nebula** + - **nebula:collection** - **NerdCubedFeed** - **netease:album**: 网易云音乐 - 专辑 - **netease:djradio**: 网易云音乐 - 电台 @@ -637,7 +701,8 @@ # Supported sites - **NetPlus** - **Netzkino** - **Newgrounds** - - **NewgroundsPlaylist** + - **Newgrounds:playlist** + - **Newgrounds:user** - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 @@ -658,6 +723,9 @@ # Supported sites - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **NiconicoUser** + - **nicovideo:search**: Nico video search; "nicosearch:" prefix + - **nicovideo:search:date**: Nico video search, newest first; "nicosearchdate:" prefix + - **nicovideo:search_url**: Nico video search URLs - **Nintendo** - **Nitter** - **njoy**: N-JOY @@ -670,6 +738,7 @@ # Supported sites - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **NovaEmbed** + - **NovaPlay** - **nowness** - **nowness:playlist** - **nowness:series** @@ -695,12 +764,16 @@ # Supported sites - **NYTimes** - **NYTimesArticle** - **NYTimesCooking** + - **nzherald** - **NZZ** - **ocw.mit.edu** - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** + - **OlympicsReplay** + - **on24**: ON24 - **OnDemandKorea** + - **OneFootball** - **onet.pl** - **onet.tv** - **onet.tv:channel** @@ -708,6 +781,8 @@ # Supported sites - **OnionStudios** - **Ooyala** - **OoyalaExternal** + - **Opencast** + - **OpencastPlaylist** - **openrec** - **openrec:capture** - **OraTV** @@ -740,9 +815,14 @@ # Supported sites - **parliamentlive.tv**: UK parliament videos - **Parlview** - **Patreon** + - **PatreonUser** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville 
Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East 
Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **PearVideo** + - **peer.tv** - **PeerTube** + - **PeerTube:Playlist** + - **peloton** + - **peloton:live**: Peloton Live - **People** - **PerformGroup** - **periscope**: Periscope @@ -756,7 +836,10 @@ # Supported sites - **Pinkbike** - **Pinterest** - **PinterestCollection** + - **pixiv:sketch** + - **pixiv:sketch:user** - **Pladform** + - **PlanetMarathi** - **Platzi** - **PlatziCourse** - **play.fm** @@ -773,7 +856,12 @@ # Supported sites - **podomatic** - **Pokemon** - **PokemonWatch** + - **PolsatGo** - **PolskieRadio** + - **polskieradio:kierowcow** + - **polskieradio:player** + - **polskieradio:podcast** + - **polskieradio:podcast:list** - **PolskieRadioCategory** - **Popcorntimes** - **PopcornTV** @@ -783,6 +871,7 @@ # Supported sites - **PornHd** - **PornHub**: PornHub and Thumbzilla - **PornHubPagedVideoList** + - **PornHubPlaylist** - **PornHubUser** - **PornHubUserVideosUpload** - **Pornotube** @@ -790,6 +879,7 @@ # Supported sites - **PornoXO** - **PornTube** - **PressTV** + - **ProjectVeritas** - 
**prosiebensat1**: ProSiebenSat.1 Digital - **puhutv** - **puhutv:serie** @@ -806,16 +896,26 @@ # Supported sites - **QuicklineLive** - **R7** - **R7Article** + - **Radiko** + - **RadikoRadio** - **radio.de** - **radiobremen** - **radiocanada** - **radiocanada:audiovideo** - **radiofrance** - **RadioJavan** + - **radiokapital** + - **radiokapital:show** + - **RadioZetPodcast** + - **radlive** + - **radlive:channel** + - **radlive:season** - **Rai** - **RaiPlay** - **RaiPlayLive** - **RaiPlayPlaylist** + - **RaiPlayRadio** + - **RaiPlayRadioPlaylist** - **RayWenderlich** - **RayWenderlichCourse** - **RBMARadio** @@ -831,7 +931,9 @@ # Supported sites - **RedBullTV** - **RedBullTVRrnContent** - **Reddit** - - **RedditR** + - **RedGifs** + - **RedGifsSearch**: Redgifs search + - **RedGifsUser**: Redgifs user - **RedTube** - **RegioTV** - **RENTV** @@ -843,6 +945,7 @@ # Supported sites - **RMCDecouverte** - **RockstarGames** - **RoosterTeeth** + - **RoosterTeethSeries** - **RottenTomatoes** - **Roxwel** - **Rozhlas** @@ -854,21 +957,25 @@ # Supported sites - **rtl2:you** - **rtl2:you:series** - **RTP** + - **RTRFM** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:audio**: RTVE audio - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - **RTVNH** - **RTVS** - **RUHD** + - **RumbleChannel** - **RumbleEmbed** - **rutube**: Rutube videos - - **rutube:channel**: Rutube channels + - **rutube:channel**: Rutube channel - **rutube:embed**: Rutube embedded videos - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos - **rutube:playlist**: Rutube playlists + - **rutube:tags**: Rutube tags - **RUTV**: RUTV.RU - **Ruutu** - **Ruv** @@ -884,7 +991,7 @@ # Supported sites - **SBS**: sbs.com.au - **schooltv** - **ScienceChannel** - - **screen.yahoo:search**: Yahoo screen search + - **screen.yahoo:search**: Yahoo screen search; "yvsearch:" prefix - **Screencast** - **ScreencastOMatic** - 
**ScrippsNetworks** @@ -892,6 +999,7 @@ # Supported sites - **SCTE** - **SCTECourse** - **Seeker** + - **SenateGov** - **SenateISVP** - **SendtoNews** - **Servus** @@ -907,14 +1015,17 @@ # Supported sites - **simplecast:episode** - **simplecast:podcast** - **Sina** + - **Skeb** - **sky.it** - **sky:news** + - **sky:news:story** - **sky:sports** - **sky:sports:news** - **skyacademy.it** - **SkylineWebcams** - **skynewsarabia:article** - **skynewsarabia:video** + - **SkyNewsAU** - **Slideshare** - **SlidesLive** - **Slutload** @@ -924,7 +1035,8 @@ # Supported sites - **SonyLIVSeries** - **soundcloud** - **soundcloud:playlist** - - **soundcloud:search**: Soundcloud search + - **soundcloud:related** + - **soundcloud:search**: Soundcloud search; "scsearch:" prefix - **soundcloud:set** - **soundcloud:trackstation** - **soundcloud:user** @@ -936,11 +1048,12 @@ # Supported sites - **southpark.de** - **southpark.nl** - **southparkstudios.dk** + - **SovietsCloset** + - **SovietsClosetPlaylist** - **SpankBang** - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - - **sport.francetvinfo.fr** - **Sport5** - **SportBox** - **SportDeutschland** @@ -956,6 +1069,7 @@ # Supported sites - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **stanfordoc**: Stanford Open ClassRoom + - **startv** - **Steam** - **Stitcher** - **StitcherShow** @@ -963,10 +1077,13 @@ # Supported sites - **StoryFireSeries** - **StoryFireUser** - **Streamable** + - **Streamanity** - **streamcloud.eu** - **StreamCZ** + - **StreamFF** - **StreetVoice** - **StretchInternet** + - **Stripchat** - **stv:player** - **SunPorno** - **sverigesradio:episode** @@ -980,7 +1097,6 @@ # Supported sites - **SztvHu** - **t-online.de** - **Tagesschau** - - **tagesschau:player** - **Tass** - **TBS** - **TDSLifeway** @@ -1018,16 +1134,27 @@ # Supported sites - **TheScene** - **TheStar** - **TheSun** + - **ThetaStream** + - **ThetaVideo** - **TheWeatherChannel** - **ThisAmericanLife** - 
**ThisAV** - **ThisOldHouse** + - **ThreeSpeak** + - **ThreeSpeakUser** - **TikTok** + - **tiktok:effect** + - **tiktok:sound** + - **tiktok:tag** + - **tiktok:user** - **tinypic**: tinypic.com videos - **TMZ** - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** + - **toggo** + - **Tokentube** + - **Tokentube:channel** - **ToonGoggles** - **tou.tv** - **Toypics**: Toypics video @@ -1035,7 +1162,10 @@ # Supported sites - **TrailerAddict** (Currently broken) - **Trilulilu** - **Trovo** + - **TrovoChannelClip**: All Clips of a trovo.live channel; "trovoclip:" prefix + - **TrovoChannelVod**: All VODs of a trovo.live channel; "trovovod:" prefix - **TrovoVod** + - **TrueID** - **TruNews** - **TruTV** - **Tube8** @@ -1050,10 +1180,11 @@ # Supported sites - **Turbo** - **tv.dfb.de** - **TV2** - - **tv2.hu** - **TV2Article** - **TV2DK** - **TV2DKBornholmPlay** + - **tv2play.hu** + - **tv2playseries.hu** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **tv5unis** @@ -1079,6 +1210,7 @@ # Supported sites - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** + - **tvp:stream** - **TVPlayer** - **TVPlayHome** - **Tweakers** @@ -1122,6 +1254,7 @@ # Supported sites - **Varzesh3** - **Vbox7** - **VeeHD** + - **Veo** - **Veoh** - **Vesti**: Вести.Ru - **Vevo** @@ -1137,7 +1270,7 @@ # Supported sites - **Viddler** - **Videa** - **video.arnes.si**: Arnes Video - - **video.google:search**: Google Video search + - **video.google:search**: Google Video search; "gvsearch:" prefix (Currently broken) - **video.sky.it** - **video.sky.it:live** - **VideoDetective** @@ -1150,9 +1283,6 @@ # Supported sites - **VidioLive** - **VidioPremier** - **VidLii** - - **vidme** - - **vidme:user** - - **vidme:user:likes** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1187,6 +1317,8 @@ # Supported sites - **VODPl** - **VODPlatform** - **VoiceRepublic** + - **voicy** + - **voicy:channel** - **Voot** - **VootSeries** - **VoxMedia** @@ -1202,6 
+1334,7 @@ # Supported sites - **VTXTV** - **vube**: Vube.com - **VuClip** + - **Vupload** - **VVVVID** - **VVVVIDShow** - **VyboryMos** @@ -1227,11 +1360,14 @@ # Supported sites - **WeiboMobile** - **WeiqiTV**: WQTV - **whowatch** + - **Willow** - **WimTV** - **Wistia** - **WistiaPlaylist** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** + - **wppilot** + - **wppilot:channels** - **WSJ**: Wall Street Journal - **WSJArticle** - **WWE** @@ -1279,19 +1415,19 @@ # Supported sites - **YouPorn** - **YourPorn** - **YourUpload** - - **youtube**: YouTube.com - - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication) - - **youtube:history**: Youtube watch history, ":ythis" for short (requires authentication) - - **youtube:playlist**: YouTube.com playlists - - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - - **youtube:search**: YouTube.com searches, "ytsearch" keyword - - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword - - **youtube:search_url**: YouTube.com search URLs - - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) - - **youtube:tab**: YouTube.com tab - - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **youtube**: YouTube + - **youtube:favorites**: YouTube liked videos; ":ytfav" keyword (requires cookies) + - **youtube:history**: Youtube watch history; ":ythis" keyword (requires cookies) + - **youtube:playlist**: YouTube playlists + - **youtube:recommended**: YouTube recommended videos; ":ytrec" keyword + - **youtube:search**: YouTube search; "ytsearch:" prefix + - **youtube:search:date**: YouTube search, newest videos first; "ytsearchdate:" prefix + - **youtube:search_url**: YouTube search URLs with sorting and filter support + - **youtube:subscriptions**: YouTube subscriptions feed; 
":ytsubs" keyword (requires cookies) + - **youtube:tab**: YouTube Tabs + - **youtube:watchlater**: Youtube watch later list; ":ytwatchlater" keyword (requires cookies) - **YoutubeYtBe**: youtu.be - - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword + - **YoutubeYtUser**: YouTube user videos; "ytuser:" prefix - **Zapiks** - **Zattoo** - **ZattooLive** @@ -1299,6 +1435,8 @@ # Supported sites - **ZDFChannel** - **Zee5** - **zee5:series** + - **ZenYandex** + - **ZenYandexChannel** - **Zhihu** - **zingmp3**: mp3.zing.vn - **zingmp3:album** diff --git a/test/helper.py b/test/helper.py index 9599eab8e2..b63a5c8973 100644 --- a/test/helper.py +++ b/test/helper.py @@ -22,7 +22,7 @@ ) -if "pytest" in sys.modules: +if 'pytest' in sys.modules: import pytest is_download_test = pytest.mark.download else: @@ -32,9 +32,9 @@ def is_download_test(testClass): def get_params(override=None): PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "parameters.json") + 'parameters.json') LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), - "local_parameters.json") + 'local_parameters.json') with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: parameters = json.load(pf) if os.path.exists(LOCAL_PARAMETERS_FILE): @@ -194,6 +194,51 @@ def expect_dict(self, got_dict, expected_dict): expect_value(self, got, expected, info_field) +def sanitize_got_info_dict(got_dict): + IGNORED_FIELDS = ( + # Format keys + 'url', 'manifest_url', 'format', 'format_id', 'format_note', 'width', 'height', 'resolution', + 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'vbr', 'fps', 'vcodec', 'container', 'filesize', + 'filesize_approx', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'preference', + 'language', 'language_preference', 'quality', 'source_preference', 'http_headers', + 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options', + + # RTMP formats + 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 
'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time', + + # Lists + 'formats', 'thumbnails', 'subtitles', 'automatic_captions', 'comments', 'entries', + + # Auto-generated + 'autonumber', 'playlist', 'format_index', 'video_ext', 'audio_ext', 'duration_string', 'epoch', + 'fulltitle', 'extractor', 'extractor_key', 'filepath', 'infojson_filename', 'original_url', + + # Only live_status needs to be checked + 'is_live', 'was_live', + ) + + IGNORED_PREFIXES = ('', 'playlist', 'requested', 'webpage') + + def sanitize(key, value): + if isinstance(value, str) and len(value) > 100: + return f'md5:{md5(value)}' + elif isinstance(value, list) and len(value) > 10: + return f'count:{len(value)}' + return value + + test_info_dict = { + key: sanitize(key, value) for key, value in got_dict.items() + if value is not None and key not in IGNORED_FIELDS and not any( + key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES) + } + + # display_id may be generated from id + if test_info_dict.get('display_id') == test_info_dict['id']: + test_info_dict.pop('display_id') + + return test_info_dict + + def expect_info_dict(self, got_dict, expected_dict): expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields @@ -207,10 +252,8 @@ def expect_info_dict(self, got_dict, expected_dict): for key in ['webpage_url', 'extractor', 'extractor_key']: self.assertTrue(got_dict.get(key), 'Missing field: %s' % key) - # Are checkable fields missing from the test case definition? 
- test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) - for key, value in got_dict.items() - if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit')) + test_info_dict = sanitize_got_info_dict(got_dict) + missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) if missing_keys: def _repr(v): diff --git a/test/parameters.json b/test/parameters.json index 9ca7d2ca9a..bc45613741 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -9,7 +9,7 @@ "forcetitle": false, "forceurl": false, "force_write_download_archive": false, - "format": "best", + "format": "b/bv", "ignoreerrors": false, "listformats": null, "logtostderr": false, @@ -44,6 +44,5 @@ "writesubtitles": false, "allsubtitles": false, "listsubtitles": false, - "socket_timeout": 20, "fixup": "never" } diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index cbca22c91d..cf06dbde46 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -99,10 +99,10 @@ def test_html_search_meta(self): self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) def test_search_json_ld_realworld(self): - # https://github.com/ytdl-org/youtube-dl/issues/23306 - expect_dict( - self, - self.ie._search_json_ld(r'''<script type="application/ld+json"> + _TESTS = [ + # https://github.com/ytdl-org/youtube-dl/issues/23306 + ( + r'''<script type="application/ld+json"> { "@context": "http://schema.org/", "@type": "VideoObject", @@ -135,17 +135,86 @@ def test_search_json_ld_realworld(self): "name": "Kleio Valentien", "url": "https://www.eporner.com/pornstar/kleio-valentien/" }]} -</script>''', None), - { - 'title': '1 On 1 With Kleio', - 'description': 'Kleio Valentien', - 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', - 'timestamp': 1449347075, - 'duration': 743.0, - 'view_count': 1120958, - 
'width': 1920, - 'height': 1080, - }) + </script>''', + { + 'title': '1 On 1 With Kleio', + 'description': 'Kleio Valentien', + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'timestamp': 1449347075, + 'duration': 743.0, + 'view_count': 1120958, + 'width': 1920, + 'height': 1080, + }, + {}, + ), + ( + r'''<script type="application/ld+json"> + { + "@context": "https://schema.org", + "@graph": [ + { + "@type": "NewsArticle", + "mainEntityOfPage": { + "@type": "WebPage", + "@id": "https://www.ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn" + }, + "headline": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν", + "name": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν", + "description": "Τα παιδιά δέχθηκαν την επίθεση επειδή αρνήθηκαν να γίνουν μέλη της συμμορίας, ανέφερε ο Γ. Ζαχαρόπουλος.", + "image": { + "@type": "ImageObject", + "url": "https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg", + "width": 1100, + "height": 756 }, + "datePublished": "2021-11-10T08:50:00+03:00", + "dateModified": "2021-11-10T08:52:53+03:00", + "author": { + "@type": "Person", + "@id": "https://www.ant1news.gr/", + "name": "Ant1news", + "image": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png", + "url": "https://www.ant1news.gr/" + }, + "publisher": { + "@type": "Organization", + "@id": "https://www.ant1news.gr#publisher", + "name": "Ant1news", + "url": "https://www.ant1news.gr", + "logo": { + "@type": "ImageObject", + "url": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png", + "width": 400, + "height": 400 }, + "sameAs": [ + "https://www.facebook.com/Ant1news.gr", + "https://twitter.com/antennanews", + "https://www.youtube.com/channel/UC0smvAbfczoN75dP0Hw4Pzw", + "https://www.instagram.com/ant1news/" + ] + }, + + "keywords": "μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία 
- Ant1news", + + + "articleSection": "Κοινωνία" + } + ] + } + </script>''', + { + 'timestamp': 1636523400, + 'title': 'md5:91fe569e952e4d146485740ae927662b', + }, + {'expected_type': 'NewsArticle'}, + ), + ] + for html, expected_dict, search_json_ld_kwargs in _TESTS: + expect_dict( + self, + self.ie._search_json_ld(html, None, **search_json_ld_kwargs), + expected_dict + ) def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index e689978fd3..61923513ee 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -137,7 +137,7 @@ def test(inp, *expected, multi=False): test('webm/mp4', '47') test('3gp/40/mp4', '35') test('example-with-dashes', 'example-with-dashes') - test('all', '35', 'example-with-dashes', '45', '47', '2') # Order doesn't actually matter for this + test('all', '2', '47', '45', 'example-with-dashes', '35') test('mergeall', '2+47+45+example-with-dashes+35', multi=True) def test_format_selection_audio(self): @@ -520,7 +520,7 @@ def test_format_filtering(self): ydl = YDL({'format': 'all[width>=400][width<=600]'}) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] - self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + self.assertEqual(downloaded_ids, ['D', 'C', 'B']) ydl = YDL({'format': 'best[height<40]'}) try: @@ -649,12 +649,14 @@ def test_add_extra_info(self): 'title2': '%PATH%', 'title3': 'foo/bar\\test', 'title4': 'foo "bar" test', + 'title5': 'áéí 𝐀', 'timestamp': 1618488000, 'duration': 100000, 'playlist_index': 1, + 'playlist_autonumber': 2, '_last_playlist_index': 100, 'n_entries': 10, - 'formats': [{'id': 'id1'}, {'id': 'id2'}, {'id': 'id3'}] + 'formats': [{'id': 'id 1'}, {'id': 'id 2'}, {'id': 'id 3'}] } def test_prepare_outtmpl_and_filename(self): @@ -664,8 +666,7 @@ def test(tmpl, expected, *, info=None, **params): ydl._num_downloads = 1 
self.assertEqual(ydl.validate_outtmpl(tmpl), None) - outtmpl, tmpl_dict = ydl.prepare_outtmpl(tmpl, info or self.outtmpl_info) - out = ydl.escape_outtmpl(outtmpl) % tmpl_dict + out = ydl.evaluate_outtmpl(tmpl, info or self.outtmpl_info) fname = ydl.prepare_filename(info or self.outtmpl_info) if not isinstance(expected, (list, tuple)): @@ -689,6 +690,7 @@ def test(tmpl, expected, *, info=None, **params): test('%(duration_string)s', ('27:46:40', '27-46-40')) test('%(resolution)s', '1080p') test('%(playlist_index)s', '001') + test('%(playlist_autonumber)s', '02') test('%(autonumber)s', '00001') test('%(autonumber+2)03d', '005', autonumber_start=3) test('%(autonumber)s', '001', autonumber_size=3) @@ -715,6 +717,7 @@ def test(tmpl, expected, *, info=None, **params): test('%(id)s', '.abcd', info={'id': '.abcd'}) test('%(id)s', 'ab__cd', info={'id': 'ab__cd'}) test('%(id)s', ('ab:cd', 'ab -cd'), info={'id': 'ab:cd'}) + test('%(id.0)s', '-', info={'id': '--'}) # Invalid templates self.assertTrue(isinstance(YoutubeDL.validate_outtmpl('%(title)'), ValueError)) @@ -735,6 +738,7 @@ def expect_same_infodict(out): test(NA_TEST_OUTTMPL, 'NA-NA-def-1234.mp4') test(NA_TEST_OUTTMPL, 'none-none-def-1234.mp4', outtmpl_na_placeholder='none') test(NA_TEST_OUTTMPL, '--def-1234.mp4', outtmpl_na_placeholder='') + test('%(non_existent.0)s', 'NA') # String formatting FMT_TEST_OUTTMPL = '%%(height)%s.%%(ext)s' @@ -760,17 +764,32 @@ def expect_same_infodict(out): test('a%(width|)d', 'a', outtmpl_na_placeholder='none') FORMATS = self.outtmpl_info['formats'] - sanitize = lambda x: x.replace(':', ' -').replace('"', "'") + sanitize = lambda x: x.replace(':', ' -').replace('"', "'").replace('\n', ' ') # Custom type casting - test('%(formats.:.id)l', 'id1, id2, id3') + test('%(formats.:.id)l', 'id 1, id 2, id 3') + test('%(formats.:.id)#l', ('id 1\nid 2\nid 3', 'id 1 id 2 id 3')) test('%(ext)l', 'mp4') - test('%(formats.:.id) 15l', ' id1, id2, id3') + test('%(formats.:.id) 18l', ' id 1, id 2, id 3') 
test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS)))) + test('%(formats)#j', (json.dumps(FORMATS, indent=4), sanitize(json.dumps(FORMATS, indent=4)))) + test('%(title5).3B', 'á') + test('%(title5)U', 'áéí 𝐀') + test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') + test('%(title5)+U', 'áéí A') + test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A') + test('%(height)D', '1K') + test('%(height)5.2D', ' 1.08K') + test('%(title4)#S', 'foo_bar_test') + test('%(title4).10S', ('foo \'bar\' ', 'foo \'bar\'' + ('#' if compat_os_name == 'nt' else ' '))) if compat_os_name == 'nt': test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'")) + test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', "'id 1' 'id 2' 'id 3'")) + test('%(formats.0.id)#q', ('"id 1"', "'id 1'")) else: test('%(title4)q', ('\'foo "bar" test\'', "'foo 'bar' test'")) + test('%(formats.:.id)#q', "'id 1' 'id 2' 'id 3'") + test('%(formats.0.id)#q', "'id 1'") # Internal formatting test('%(timestamp-1000>%H-%M-%S)s', '11-43-20') @@ -788,6 +807,17 @@ def expect_same_infodict(out): test('%(formats.0.id.-1+id)f', '1235.000000') test('%(formats.0.id.-1+formats.1.id.-1)d', '3') + # Alternates + test('%(title,id)s', '1234') + test('%(width-100,height+20|def)d', '1100') + test('%(width-100,height+width|def)s', 'def') + test('%(timestamp-x>%H\\,%M\\,%S,timestamp>%H\\,%M\\,%S)s', '12,00,00') + + # Replacement + test('%(id&foo)s.bar', 'foo.bar') + test('%(title&foo)s.bar', 'NA.bar') + test('%(title&foo|baz)s.bar', 'baz.bar') + # Laziness def gen(): yield from range(5) @@ -803,6 +833,12 @@ def gen(): compat_setenv('__yt_dlp_var', 'expanded') envvar = '%__yt_dlp_var%' if compat_os_name == 'nt' else '$__yt_dlp_var' test(envvar, (envvar, 'expanded')) + if compat_os_name == 'nt': + test('%s%', ('%s%', '%s%')) + compat_setenv('s', 'expanded') + test('%s%', ('%s%', 'expanded')) # %s% should be expanded before escaping %s + compat_setenv('(test)s', 'expanded') + test('%(test)s%', ('NA%', 'expanded')) # Environment 
should take priority over template # Path expansion and escaping test('Hello %(title1)s', 'Hello $PATH') @@ -992,6 +1028,7 @@ def test_selection(params, expected_ids): test_selection({'playlist_items': '2-4'}, [2, 3, 4]) test_selection({'playlist_items': '2,4'}, [2, 4]) test_selection({'playlist_items': '10'}, []) + test_selection({'playlist_items': '0'}, []) # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591 test_selection({'playlist_items': '2-4,3-4,3'}, [2, 3, 4]) diff --git a/test/test_aes.py b/test/test_aes.py index d2e51af29f..5c9273f8aa 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -7,7 +7,22 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from yt_dlp.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text +from yt_dlp.aes import ( + aes_decrypt, + aes_encrypt, + aes_ecb_encrypt, + aes_ecb_decrypt, + aes_cbc_decrypt, + aes_cbc_decrypt_bytes, + aes_cbc_encrypt, + aes_ctr_decrypt, + aes_ctr_encrypt, + aes_gcm_decrypt_and_verify, + aes_gcm_decrypt_and_verify_bytes, + aes_decrypt_text, + BLOCK_SIZE_BYTES, +) +from yt_dlp.compat import compat_pycrypto_AES from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes import base64 @@ -27,18 +42,43 @@ def test_encrypt(self): self.assertEqual(decrypted, msg) def test_cbc_decrypt(self): - data = bytes_to_intlist( - b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd" - ) - decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) + data = b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\x27\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd' + decrypted = intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist(data), self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + if compat_pycrypto_AES: + decrypted = aes_cbc_decrypt_bytes(data, intlist_to_bytes(self.key), intlist_to_bytes(self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def 
test_cbc_encrypt(self): data = bytes_to_intlist(self.secret_msg) encrypted = intlist_to_bytes(aes_cbc_encrypt(data, self.key, self.iv)) self.assertEqual( encrypted, - b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd") + b'\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6\'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd') + + def test_ctr_decrypt(self): + data = bytes_to_intlist(b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') + decrypted = intlist_to_bytes(aes_ctr_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + + def test_ctr_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + encrypted = intlist_to_bytes(aes_ctr_encrypt(data, self.key, self.iv)) + self.assertEqual( + encrypted, + b'\x03\xc7\xdd\xd4\x8e\xb3\xbc\x1a*O\xdc1\x12+8Aio\xd1z\xb5#\xaf\x08') + + def test_gcm_decrypt(self): + data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f.\x08\xb4T\xe4/\x17\xbd' + authentication_tag = b'\xe8&I\x80rI\x07\x9d}YWuU@:e' + + decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify( + bytes_to_intlist(data), self.key, bytes_to_intlist(authentication_tag), self.iv[:12])) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + if compat_pycrypto_AES: + decrypted = aes_gcm_decrypt_and_verify_bytes( + data, intlist_to_bytes(self.key), authentication_tag, intlist_to_bytes(self.iv[:12])) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) def test_decrypt_text(self): password = intlist_to_bytes(self.key).decode('utf-8') @@ -57,6 +97,19 @@ def test_decrypt_text(self): decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) + def test_ecb_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + data += [0x08] * (BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key, self.iv)) + self.assertEqual( + encrypted, + 
b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + + def test_ecb_decrypt(self): + data = bytes_to_intlist(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + decrypted = intlist_to_bytes(aes_ecb_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + if __name__ == '__main__': unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 68c1c68d3f..2d89366d45 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -38,7 +38,6 @@ def test_youtube_playlist_matching(self): assertTab('https://www.youtube.com/AsapSCIENCE') assertTab('https://www.youtube.com/embedded') assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') - assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) diff --git a/test/test_cookies.py b/test/test_cookies.py index 6faaaa0c99..842ebcb99b 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -3,16 +3,30 @@ from yt_dlp import cookies from yt_dlp.cookies import ( - CRYPTO_AVAILABLE, LinuxChromeCookieDecryptor, MacChromeCookieDecryptor, WindowsChromeCookieDecryptor, - YDLLogger, parse_safari_cookies, pbkdf2_sha1, + _get_linux_desktop_environment, + _LinuxDesktopEnvironment, ) +class Logger: + def debug(self, message): + print(f'[verbose] {message}') + + def info(self, message): + print(message) + + def warning(self, message, only_once=False): + self.error(message) + + def error(self, message): + raise Exception(message) + + class MonkeyPatch: def __init__(self, module, temporary_values): self._module = module @@ -30,6 +44,37 @@ def __exit__(self, exc_type, exc_val, exc_tb): 
class TestCookies(unittest.TestCase): + def test_get_desktop_environment(self): + """ based on https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util_unittest.cc """ + test_cases = [ + ({}, _LinuxDesktopEnvironment.OTHER), + + ({'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME), + ({'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME), + ({'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE), + ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE), + ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + + ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), + ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE), + + ({'XDG_CURRENT_DESKTOP': 'X-Cinnamon'}, _LinuxDesktopEnvironment.CINNAMON), + ({'XDG_CURRENT_DESKTOP': 'GNOME'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'GNOME:GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'GNOME : GNOME-Classic'}, _LinuxDesktopEnvironment.GNOME), + + ({'XDG_CURRENT_DESKTOP': 'Unity', 'DESKTOP_SESSION': 'gnome-fallback'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'KDE', 'KDE_SESSION_VERSION': '5'}, _LinuxDesktopEnvironment.KDE), + ({'XDG_CURRENT_DESKTOP': 'KDE'}, _LinuxDesktopEnvironment.KDE), + ({'XDG_CURRENT_DESKTOP': 'Pantheon'}, _LinuxDesktopEnvironment.PANTHEON), + ({'XDG_CURRENT_DESKTOP': 'Unity'}, _LinuxDesktopEnvironment.UNITY), + ({'XDG_CURRENT_DESKTOP': 'Unity:Unity7'}, _LinuxDesktopEnvironment.UNITY), + ({'XDG_CURRENT_DESKTOP': 'Unity:Unity8'}, _LinuxDesktopEnvironment.UNITY), + ] + + for env, expected_desktop_environment in test_cases: + self.assertEqual(_get_linux_desktop_environment(env), expected_desktop_environment) + def test_chrome_cookie_decryptor_linux_derive_key(self): key = LinuxChromeCookieDecryptor.derive_key(b'abc') self.assertEqual(key, b'7\xa1\xec\xd4m\xfcA\xc7\xb19Z\xd0\x19\xdcM\x17') @@ -42,32 +87,30 @@ def test_chrome_cookie_decryptor_linux_v10(self): with 
MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b''}): encrypted_value = b'v10\xccW%\xcd\xe6\xe6\x9fM" \xa7\xb0\xca\xe4\x07\xd6' value = 'USD' - decryptor = LinuxChromeCookieDecryptor('Chrome', YDLLogger()) + decryptor = LinuxChromeCookieDecryptor('Chrome', Logger()) self.assertEqual(decryptor.decrypt(encrypted_value), value) def test_chrome_cookie_decryptor_linux_v11(self): - with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b'', - 'KEYRING_AVAILABLE': True}): + with MonkeyPatch(cookies, {'_get_linux_keyring_password': lambda *args, **kwargs: b''}): encrypted_value = b'v11#\x81\x10>`w\x8f)\xc0\xb2\xc1\r\xf4\x1al\xdd\x93\xfd\xf8\xf8N\xf2\xa9\x83\xf1\xe9o\x0elVQd' value = 'tz=Europe.London' - decryptor = LinuxChromeCookieDecryptor('Chrome', YDLLogger()) + decryptor = LinuxChromeCookieDecryptor('Chrome', Logger()) self.assertEqual(decryptor.decrypt(encrypted_value), value) - @unittest.skipIf(not CRYPTO_AVAILABLE, 'cryptography library not available') def test_chrome_cookie_decryptor_windows_v10(self): with MonkeyPatch(cookies, { '_get_windows_v10_key': lambda *args, **kwargs: b'Y\xef\xad\xad\xeerp\xf0Y\xe6\x9b\x12\xc2<z\x16]\n\xbb\xb8\xcb\xd7\x9bA\xc3\x14e\x99{\xd6\xf4&' }): encrypted_value = b'v10T\xb8\xf3\xb8\x01\xa7TtcV\xfc\x88\xb8\xb8\xef\x05\xb5\xfd\x18\xc90\x009\xab\xb1\x893\x85)\x87\xe1\xa9-\xa3\xad=' value = '32101439' - decryptor = WindowsChromeCookieDecryptor('', YDLLogger()) + decryptor = WindowsChromeCookieDecryptor('', Logger()) self.assertEqual(decryptor.decrypt(encrypted_value), value) def test_chrome_cookie_decryptor_mac_v10(self): with MonkeyPatch(cookies, {'_get_mac_keyring_password': lambda *args, **kwargs: b'6eIDUdtKAacvlHwBVwvg/Q=='}): encrypted_value = b'v10\xb3\xbe\xad\xa1[\x9fC\xa1\x98\xe0\x9a\x01\xd9\xcf\xbfc' value = '2021-06-01-22' - decryptor = MacChromeCookieDecryptor('', YDLLogger()) + decryptor = MacChromeCookieDecryptor('', Logger()) 
self.assertEqual(decryptor.decrypt(encrypted_value), value) def test_safari_cookie_parsing(self): diff --git a/test/test_download.py b/test/test_download.py old mode 100644 new mode 100755 diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 8b2b60403c..e230b045fd 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -112,6 +112,71 @@ def test_call(self): ''') self.assertEqual(jsi.call_function('z'), 5) + def test_for_loop(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) {a++} a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_switch(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 1:f+=1; + case 2:f+=2; + case 3:f+=3;break; + case 4:f+=4; + default:f=0; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 7) + self.assertEqual(jsi.call_function('x', 3), 6) + self.assertEqual(jsi.call_function('x', 5), 0) + + def test_switch_default(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 2: f+=2; + default: f-=1; + case 5: + case 6: f+=6; + case 0: break; + case 1: f+=1; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 2) + self.assertEqual(jsi.call_function('x', 5), 11) + self.assertEqual(jsi.call_function('x', 9), 14) + + def test_try(self): + jsi = JSInterpreter(''' + function x() { try{return 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_for_loop_continue(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_for_loop_break(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { break; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_literal_list(self): + jsi = JSInterpreter(''' + function x() { [1, 2, "asdf", [5, 6, 7]][3] } + ''') + self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + + def test_comma(self): + jsi = 
JSInterpreter(''' + function x() { a=5; a -= 1, a+=3; return a } + ''') + self.assertEqual(jsi.call_function('x'), 7) + if __name__ == '__main__': unittest.main() diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index b15cbd28c8..bbe998993f 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -6,6 +6,7 @@ import os import sys import unittest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from yt_dlp import YoutubeDL @@ -15,6 +16,7 @@ FFmpegThumbnailsConvertorPP, MetadataFromFieldPP, MetadataParserPP, + ModifyChaptersPP ) @@ -68,3 +70,493 @@ def test_parse_cmd(self): self.assertEqual(pp.parse_cmd('echo', info), cmd) self.assertEqual(pp.parse_cmd('echo {}', info), cmd) self.assertEqual(pp.parse_cmd('echo %(filepath)q', info), cmd) + + +class TestModifyChaptersPP(unittest.TestCase): + def setUp(self): + self._pp = ModifyChaptersPP(YoutubeDL()) + + @staticmethod + def _sponsor_chapter(start, end, cat, remove=False): + c = {'start_time': start, 'end_time': end, '_categories': [(cat, start, end)]} + if remove: + c['remove'] = True + return c + + @staticmethod + def _chapter(start, end, title=None, remove=False): + c = {'start_time': start, 'end_time': end} + if title is not None: + c['title'] = title + if remove: + c['remove'] = True + return c + + def _chapters(self, ends, titles): + self.assertEqual(len(ends), len(titles)) + start = 0 + chapters = [] + for e, t in zip(ends, titles): + chapters.append(self._chapter(start, e, t)) + start = e + return chapters + + def _remove_marked_arrange_sponsors_test_impl( + self, chapters, expected_chapters, expected_removed): + actual_chapters, actual_removed = ( + self._pp._remove_marked_arrange_sponsors(chapters)) + for c in actual_removed: + c.pop('title', None) + c.pop('_categories', None) + actual_chapters = [{ + 'start_time': c['start_time'], + 'end_time': c['end_time'], + 'title': c['title'], + } for c in actual_chapters] + 
self.assertSequenceEqual(expected_chapters, actual_chapters) + self.assertSequenceEqual(expected_removed, actual_removed) + + def test_remove_marked_arrange_sponsors_CanGetThroughUnaltered(self): + chapters = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, chapters, []) + + def test_remove_marked_arrange_sponsors_ChapterWithSponsors(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(30, 40, 'preview'), + self._sponsor_chapter(50, 60, 'filler')] + expected = self._chapters( + [10, 20, 30, 40, 50, 60, 70], + ['c', '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Preview/Recap', + 'c', '[SponsorBlock]: Filler Tangent', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_UniqueNamesForOverlappingSponsors(self): + chapters = self._chapters([120], ['c']) + [ + self._sponsor_chapter(10, 45, 'sponsor'), self._sponsor_chapter(20, 40, 'selfpromo'), + self._sponsor_chapter(50, 70, 'sponsor'), self._sponsor_chapter(60, 85, 'selfpromo'), + self._sponsor_chapter(90, 120, 'selfpromo'), self._sponsor_chapter(100, 110, 'sponsor')] + expected = self._chapters( + [10, 20, 40, 45, 50, 60, 70, 85, 90, 100, 110, 120], + ['c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + '[SponsorBlock]: Sponsor', + 'c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion', + 'c', '[SponsorBlock]: Unpaid/Self Promotion', '[SponsorBlock]: Unpaid/Self Promotion, Sponsor', + '[SponsorBlock]: Unpaid/Self Promotion']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChapterWithCuts(self): + cuts = [self._chapter(10, 20, remove=True), + self._sponsor_chapter(30, 40, 'sponsor', remove=True), + self._chapter(50, 60, remove=True)] + chapters 
= self._chapters([70], ['c']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([40], ['c']), cuts) + + def test_remove_marked_arrange_sponsors_ChapterWithSponsorsAndCuts(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(30, 40, 'selfpromo', remove=True), + self._sponsor_chapter(50, 60, 'interaction')] + expected = self._chapters([10, 20, 40, 50, 60], + ['c', '[SponsorBlock]: Sponsor', 'c', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 40, remove=True)]) + + def test_remove_marked_arrange_sponsors_ChapterWithSponsorCutInTheMiddle(self): + cuts = [self._sponsor_chapter(20, 30, 'selfpromo', remove=True), + self._chapter(40, 50, remove=True)] + chapters = self._chapters([70], ['c']) + [self._sponsor_chapter(10, 60, 'sponsor')] + cuts + expected = self._chapters( + [10, 40, 50], ['c', '[SponsorBlock]: Sponsor', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_ChapterWithCutHidingSponsor(self): + cuts = [self._sponsor_chapter(20, 50, 'selpromo', remove=True)] + chapters = self._chapters([60], ['c']) + [ + self._sponsor_chapter(10, 20, 'intro'), + self._sponsor_chapter(30, 40, 'sponsor'), + self._sponsor_chapter(50, 60, 'outro'), + ] + cuts + expected = self._chapters( + [10, 20, 30], ['c', '[SponsorBlock]: Intermission/Intro Animation', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_ChapterWithAdjacentSponsors(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(20, 30, 'selfpromo'), + self._sponsor_chapter(30, 40, 'interaction')] + expected = self._chapters( + [10, 20, 30, 40, 70], + ['c', '[SponsorBlock]: Sponsor', 
'[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChapterWithAdjacentCuts(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 20, 'sponsor'), + self._sponsor_chapter(20, 30, 'interaction', remove=True), + self._chapter(30, 40, remove=True), + self._sponsor_chapter(40, 50, 'selpromo', remove=True), + self._sponsor_chapter(50, 60, 'interaction')] + expected = self._chapters([10, 20, 30, 40], + ['c', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(20, 50, remove=True)]) + + def test_remove_marked_arrange_sponsors_ChapterWithOverlappingSponsors(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 30, 'sponsor'), + self._sponsor_chapter(20, 50, 'selfpromo'), + self._sponsor_chapter(40, 60, 'interaction')] + expected = self._chapters( + [10, 20, 30, 40, 50, 60, 70], + ['c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion', '[SponsorBlock]: Unpaid/Self Promotion, Interaction Reminder', + '[SponsorBlock]: Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChapterWithOverlappingCuts(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 30, 'sponsor', remove=True), + self._sponsor_chapter(20, 50, 'selfpromo', remove=True), + self._sponsor_chapter(40, 60, 'interaction', remove=True)] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([20], ['c']), [self._chapter(10, 60, remove=True)]) + + def test_remove_marked_arrange_sponsors_ChapterWithRunsOfOverlappingSponsors(self): + chapters = self._chapters([170], ['c']) + [ + self._sponsor_chapter(0, 
30, 'intro'), + self._sponsor_chapter(20, 50, 'sponsor'), + self._sponsor_chapter(40, 60, 'selfpromo'), + self._sponsor_chapter(70, 90, 'sponsor'), + self._sponsor_chapter(80, 100, 'sponsor'), + self._sponsor_chapter(90, 110, 'sponsor'), + self._sponsor_chapter(120, 140, 'selfpromo'), + self._sponsor_chapter(130, 160, 'interaction'), + self._sponsor_chapter(150, 170, 'outro')] + expected = self._chapters( + [20, 30, 40, 50, 60, 70, 110, 120, 130, 140, 150, 160, 170], + ['[SponsorBlock]: Intermission/Intro Animation', '[SponsorBlock]: Intermission/Intro Animation, Sponsor', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', '[SponsorBlock]: Unpaid/Self Promotion', 'c', + '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion, Interaction Reminder', + '[SponsorBlock]: Interaction Reminder', + '[SponsorBlock]: Interaction Reminder, Endcards/Credits', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChapterWithRunsOfOverlappingCuts(self): + chapters = self._chapters([170], ['c']) + [ + self._chapter(0, 30, remove=True), + self._sponsor_chapter(20, 50, 'sponsor', remove=True), + self._chapter(40, 60, remove=True), + self._sponsor_chapter(70, 90, 'sponsor', remove=True), + self._chapter(80, 100, remove=True), + self._chapter(90, 110, remove=True), + self._sponsor_chapter(120, 140, 'sponsor', remove=True), + self._sponsor_chapter(130, 160, 'selfpromo', remove=True), + self._chapter(150, 170, remove=True)] + expected_cuts = [self._chapter(0, 60, remove=True), + self._chapter(70, 110, remove=True), + self._chapter(120, 170, remove=True)] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([20], ['c']), expected_cuts) + + def test_remove_marked_arrange_sponsors_OverlappingSponsorsDifferentTitlesAfterCut(self): + chapters = self._chapters([60], ['c']) + [ + 
self._sponsor_chapter(10, 60, 'sponsor'), + self._sponsor_chapter(10, 40, 'intro'), + self._sponsor_chapter(30, 50, 'interaction'), + self._sponsor_chapter(30, 50, 'selfpromo', remove=True), + self._sponsor_chapter(40, 50, 'interaction'), + self._sponsor_chapter(50, 60, 'outro')] + expected = self._chapters( + [10, 30, 40], ['c', '[SponsorBlock]: Sponsor, Intermission/Intro Animation', '[SponsorBlock]: Sponsor, Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 50, remove=True)]) + + def test_remove_marked_arrange_sponsors_SponsorsNoLongerOverlapAfterCut(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 30, 'sponsor'), + self._sponsor_chapter(20, 50, 'interaction'), + self._sponsor_chapter(30, 50, 'selpromo', remove=True), + self._sponsor_chapter(40, 60, 'sponsor'), + self._sponsor_chapter(50, 60, 'interaction')] + expected = self._chapters( + [10, 20, 40, 50], ['c', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Sponsor, Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 50, remove=True)]) + + def test_remove_marked_arrange_sponsors_SponsorsStillOverlapAfterCut(self): + chapters = self._chapters([70], ['c']) + [ + self._sponsor_chapter(10, 60, 'sponsor'), + self._sponsor_chapter(20, 60, 'interaction'), + self._sponsor_chapter(30, 50, 'selfpromo', remove=True)] + expected = self._chapters( + [10, 20, 40, 50], ['c', '[SponsorBlock]: Sponsor', + '[SponsorBlock]: Sponsor, Interaction Reminder', 'c']) + self._remove_marked_arrange_sponsors_test_impl( + chapters, expected, [self._chapter(30, 50, remove=True)]) + + def test_remove_marked_arrange_sponsors_ChapterWithRunsOfOverlappingSponsorsAndCuts(self): + chapters = self._chapters([200], ['c']) + [ + self._sponsor_chapter(10, 40, 'sponsor'), + self._sponsor_chapter(10, 30, 'intro'), + self._chapter(20, 30, remove=True), + self._sponsor_chapter(30, 40, 'selfpromo'), 
+ self._sponsor_chapter(50, 70, 'sponsor'), + self._sponsor_chapter(60, 80, 'interaction'), + self._chapter(70, 80, remove=True), + self._sponsor_chapter(70, 90, 'sponsor'), + self._sponsor_chapter(80, 100, 'interaction'), + self._sponsor_chapter(120, 170, 'selfpromo'), + self._sponsor_chapter(130, 180, 'outro'), + self._chapter(140, 150, remove=True), + self._chapter(150, 160, remove=True)] + expected = self._chapters( + [10, 20, 30, 40, 50, 70, 80, 100, 110, 130, 140, 160], + ['c', '[SponsorBlock]: Sponsor, Intermission/Intro Animation', '[SponsorBlock]: Sponsor, Unpaid/Self Promotion', + 'c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Sponsor, Interaction Reminder', + '[SponsorBlock]: Interaction Reminder', 'c', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Unpaid/Self Promotion, Endcards/Credits', '[SponsorBlock]: Endcards/Credits', 'c']) + expected_cuts = [self._chapter(20, 30, remove=True), + self._chapter(70, 80, remove=True), + self._chapter(140, 160, remove=True)] + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, expected_cuts) + + def test_remove_marked_arrange_sponsors_SponsorOverlapsMultipleChapters(self): + chapters = (self._chapters([20, 40, 60, 80, 100], ['c1', 'c2', 'c3', 'c4', 'c5']) + + [self._sponsor_chapter(10, 90, 'sponsor')]) + expected = self._chapters([10, 90, 100], ['c1', '[SponsorBlock]: Sponsor', 'c5']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutOverlapsMultipleChapters(self): + cuts = [self._chapter(10, 90, remove=True)] + chapters = self._chapters([20, 40, 60, 80, 100], ['c1', 'c2', 'c3', 'c4', 'c5']) + cuts + expected = self._chapters([10, 20], ['c1', 'c5']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorsWithinSomeChaptersAndOverlappingOthers(self): + chapters = (self._chapters([10, 40, 60, 80], ['c1', 'c2', 'c3', 'c4']) + + 
[self._sponsor_chapter(20, 30, 'sponsor'), + self._sponsor_chapter(50, 70, 'selfpromo')]) + expected = self._chapters([10, 20, 30, 40, 50, 70, 80], + ['c1', 'c2', '[SponsorBlock]: Sponsor', 'c2', 'c3', + '[SponsorBlock]: Unpaid/Self Promotion', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutsWithinSomeChaptersAndOverlappingOthers(self): + cuts = [self._chapter(20, 30, remove=True), self._chapter(50, 70, remove=True)] + chapters = self._chapters([10, 40, 60, 80], ['c1', 'c2', 'c3', 'c4']) + cuts + expected = self._chapters([10, 30, 40, 50], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_ChaptersAfterLastSponsor(self): + chapters = (self._chapters([20, 40, 50, 60], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(10, 30, 'music_offtopic')]) + expected = self._chapters( + [10, 30, 40, 50, 60], + ['c1', '[SponsorBlock]: Non-Music Section', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_ChaptersAfterLastCut(self): + cuts = [self._chapter(10, 30, remove=True)] + chapters = self._chapters([20, 40, 50, 60], ['c1', 'c2', 'c3', 'c4']) + cuts + expected = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorStartsAtChapterStart(self): + chapters = (self._chapters([10, 20, 40], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(20, 30, 'sponsor')]) + expected = self._chapters([10, 20, 30, 40], ['c1', 'c2', '[SponsorBlock]: Sponsor', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutStartsAtChapterStart(self): + cuts = [self._chapter(20, 30, remove=True)] + chapters = self._chapters([10, 20, 40], ['c1', 'c2', 
'c3']) + cuts + expected = self._chapters([10, 20, 30], ['c1', 'c2', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorEndsAtChapterEnd(self): + chapters = (self._chapters([10, 30, 40], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(20, 30, 'sponsor')]) + expected = self._chapters([10, 20, 30, 40], ['c1', 'c2', '[SponsorBlock]: Sponsor', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutEndsAtChapterEnd(self): + cuts = [self._chapter(20, 30, remove=True)] + chapters = self._chapters([10, 30, 40], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10, 20, 30], ['c1', 'c2', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorCoincidesWithChapters(self): + chapters = (self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(10, 30, 'sponsor')]) + expected = self._chapters([10, 30, 40], ['c1', '[SponsorBlock]: Sponsor', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutCoincidesWithChapters(self): + cuts = [self._chapter(10, 30, remove=True)] + chapters = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + cuts + expected = self._chapters([10, 20], ['c1', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorsAtVideoBoundaries(self): + chapters = (self._chapters([20, 40, 60], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(0, 10, 'intro'), self._sponsor_chapter(50, 60, 'outro')]) + expected = self._chapters( + [10, 20, 40, 50, 60], ['[SponsorBlock]: Intermission/Intro Animation', 'c1', 'c2', 'c3', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def 
test_remove_marked_arrange_sponsors_CutsAtVideoBoundaries(self): + cuts = [self._chapter(0, 10, remove=True), self._chapter(50, 60, remove=True)] + chapters = self._chapters([20, 40, 60], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10, 30, 40], ['c1', 'c2', 'c3']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_SponsorsOverlapChaptersAtVideoBoundaries(self): + chapters = (self._chapters([10, 40, 50], ['c1', 'c2', 'c3']) + + [self._sponsor_chapter(0, 20, 'intro'), self._sponsor_chapter(30, 50, 'outro')]) + expected = self._chapters( + [20, 30, 50], ['[SponsorBlock]: Intermission/Intro Animation', 'c2', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_CutsOverlapChaptersAtVideoBoundaries(self): + cuts = [self._chapter(0, 20, remove=True), self._chapter(30, 50, remove=True)] + chapters = self._chapters([10, 40, 50], ['c1', 'c2', 'c3']) + cuts + expected = self._chapters([10], ['c2']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, cuts) + + def test_remove_marked_arrange_sponsors_EverythingSponsored(self): + chapters = (self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + + [self._sponsor_chapter(0, 20, 'intro'), self._sponsor_chapter(20, 40, 'outro')]) + expected = self._chapters([20, 40], ['[SponsorBlock]: Intermission/Intro Animation', '[SponsorBlock]: Endcards/Credits']) + self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) + + def test_remove_marked_arrange_sponsors_EverythingCut(self): + cuts = [self._chapter(0, 20, remove=True), self._chapter(20, 40, remove=True)] + chapters = self._chapters([10, 20, 30, 40], ['c1', 'c2', 'c3', 'c4']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, [], [self._chapter(0, 40, remove=True)]) + + def test_remove_marked_arrange_sponsors_TinyChaptersInTheOriginalArePreserved(self): 
+ chapters = self._chapters([0.1, 0.2, 0.3, 0.4], ['c1', 'c2', 'c3', 'c4']) + self._remove_marked_arrange_sponsors_test_impl(chapters, chapters, []) + + def test_remove_marked_arrange_sponsors_TinySponsorsAreIgnored(self): + chapters = [self._sponsor_chapter(0, 0.1, 'intro'), self._chapter(0.1, 0.2, 'c1'), + self._sponsor_chapter(0.2, 0.3, 'sponsor'), self._chapter(0.3, 0.4, 'c2'), + self._sponsor_chapter(0.4, 0.5, 'outro')] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([0.3, 0.5], ['c1', 'c2']), []) + + def test_remove_marked_arrange_sponsors_TinyChaptersResultingFromCutsAreIgnored(self): + cuts = [self._chapter(1.5, 2.5, remove=True)] + chapters = self._chapters([2, 3, 3.5], ['c1', 'c2', 'c3']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([2, 2.5], ['c1', 'c3']), cuts) + + def test_remove_marked_arrange_sponsors_SingleTinyChapterIsPreserved(self): + cuts = [self._chapter(0.5, 2, remove=True)] + chapters = self._chapters([2], ['c']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([0.5], ['c']), cuts) + + def test_remove_marked_arrange_sponsors_TinyChapterAtTheStartPrependedToTheNext(self): + cuts = [self._chapter(0.5, 2, remove=True)] + chapters = self._chapters([2, 4], ['c1', 'c2']) + cuts + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([2.5], ['c2']), cuts) + + def test_remove_marked_arrange_sponsors_TinyChaptersResultingFromSponsorOverlapAreIgnored(self): + chapters = self._chapters([1, 3, 4], ['c1', 'c2', 'c3']) + [ + self._sponsor_chapter(1.5, 2.5, 'sponsor')] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([1.5, 2.5, 4], ['c1', '[SponsorBlock]: Sponsor', 'c3']), []) + + def test_remove_marked_arrange_sponsors_TinySponsorsOverlapsAreIgnored(self): + chapters = self._chapters([2, 3, 5], ['c1', 'c2', 'c3']) + [ + self._sponsor_chapter(1, 3, 'sponsor'), + self._sponsor_chapter(2.5, 4, 'selfpromo') + ] 
+ self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([1, 3, 4, 5], [ + 'c1', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Unpaid/Self Promotion', 'c3']), []) + + def test_remove_marked_arrange_sponsors_TinySponsorsPrependedToTheNextSponsor(self): + chapters = self._chapters([4], ['c']) + [ + self._sponsor_chapter(1.5, 2, 'sponsor'), + self._sponsor_chapter(2, 4, 'selfpromo') + ] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([1.5, 4], ['c', '[SponsorBlock]: Unpaid/Self Promotion']), []) + + def test_remove_marked_arrange_sponsors_SmallestSponsorInTheOverlapGetsNamed(self): + self._pp._sponsorblock_chapter_title = '[SponsorBlock]: %(name)s' + chapters = self._chapters([10], ['c']) + [ + self._sponsor_chapter(2, 8, 'sponsor'), + self._sponsor_chapter(4, 6, 'selfpromo') + ] + self._remove_marked_arrange_sponsors_test_impl( + chapters, self._chapters([2, 4, 6, 8, 10], [ + 'c', '[SponsorBlock]: Sponsor', '[SponsorBlock]: Unpaid/Self Promotion', + '[SponsorBlock]: Sponsor', 'c' + ]), []) + + def test_make_concat_opts_CommonCase(self): + sponsor_chapters = [self._chapter(1, 2, 's1'), self._chapter(10, 20, 's2')] + expected = '''ffconcat version 1.0 +file 'file:test' +outpoint 1.000000 +file 'file:test' +inpoint 2.000000 +outpoint 10.000000 +file 'file:test' +inpoint 20.000000 +''' + opts = self._pp._make_concat_opts(sponsor_chapters, 30) + self.assertEqual(expected, ''.join(self._pp._concat_spec(['test'] * len(opts), opts))) + + def test_make_concat_opts_NoZeroDurationChunkAtVideoStart(self): + sponsor_chapters = [self._chapter(0, 1, 's1'), self._chapter(10, 20, 's2')] + expected = '''ffconcat version 1.0 +file 'file:test' +inpoint 1.000000 +outpoint 10.000000 +file 'file:test' +inpoint 20.000000 +''' + opts = self._pp._make_concat_opts(sponsor_chapters, 30) + self.assertEqual(expected, ''.join(self._pp._concat_spec(['test'] * len(opts), opts))) + + def test_make_concat_opts_NoZeroDurationChunkAtVideoEnd(self): + 
sponsor_chapters = [self._chapter(1, 2, 's1'), self._chapter(10, 20, 's2')] + expected = '''ffconcat version 1.0 +file 'file:test' +outpoint 1.000000 +file 'file:test' +inpoint 2.000000 +outpoint 10.000000 +''' + opts = self._pp._make_concat_opts(sponsor_chapters, 20) + self.assertEqual(expected, ''.join(self._pp._concat_spec(['test'] * len(opts), opts))) + + def test_quote_for_concat_RunsOfQuotes(self): + self.assertEqual( + r"'special '\'' '\'\''characters'\'\'\''galore'", + self._pp._quote_for_ffmpeg("special ' ''characters'''galore")) + + def test_quote_for_concat_QuotesAtStart(self): + self.assertEqual( + r"\'\'\''special '\'' characters '\'' galore'", + self._pp._quote_for_ffmpeg("'''special ' characters ' galore")) + + def test_quote_for_concat_QuotesAtEnd(self): + self.assertEqual( + r"'special '\'' characters '\'' galore'\'\'\'", + self._pp._quote_for_ffmpeg("special ' characters ' galore'''")) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 0c5b49ee8c..9b39dbd39b 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -19,6 +19,7 @@ CeskaTelevizeIE, LyndaIE, NPOIE, + PBSIE, ComedyCentralIE, NRKTVIE, RaiPlayIE, @@ -372,5 +373,42 @@ def test_subtitles_in_page(self): self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') +@is_download_test +class TestPBSSubtitles(BaseTestSubtitles): + url = 'https://www.pbs.org/video/how-fantasy-reflects-our-world-picecq/' + IE = PBSIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + + def test_subtitles_dfxp_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'dfxp' + subtitles = self.getSubtitles() + self.assertIn(md5(subtitles['en']), ['643b034254cdc3768ff1e750b6b5873b']) + + def test_subtitles_vtt_format(self): + self.DL.params['writesubtitles'] = True + 
self.DL.params['subtitlesformat'] = 'vtt' + subtitles = self.getSubtitles() + self.assertIn( + md5(subtitles['en']), ['937a05711555b165d4c55a9667017045', 'f49ea998d6824d94959c8152a368ff73']) + + def test_subtitles_srt_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'srt' + subtitles = self.getSubtitles() + self.assertIn(md5(subtitles['en']), ['2082c21b43759d9bf172931b2f2ca371']) + + def test_subtitles_sami_format(self): + self.DL.params['writesubtitles'] = True + self.DL.params['subtitlesformat'] = 'sami' + subtitles = self.getSubtitles() + self.assertIn(md5(subtitles['en']), ['4256b16ac7da6a6780fafd04294e85cd']) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index d20bca7950..2e33308c75 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -37,6 +37,7 @@ ExtractorError, find_xpath_attr, fix_xml_ampersands, + format_bytes, float_or_none, get_element_by_class, get_element_by_attribute, @@ -848,30 +849,52 @@ def test_parse_codecs(self): self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { 'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', + 'dynamic_range': None, }) self.assertEqual(parse_codecs('mp4a.40.2'), { 'vcodec': 'none', 'acodec': 'mp4a.40.2', + 'dynamic_range': None, }) self.assertEqual(parse_codecs('mp4a.40.5,avc1.42001e'), { 'vcodec': 'avc1.42001e', 'acodec': 'mp4a.40.5', + 'dynamic_range': None, }) self.assertEqual(parse_codecs('avc3.640028'), { 'vcodec': 'avc3.640028', 'acodec': 'none', + 'dynamic_range': None, }) self.assertEqual(parse_codecs(', h264,,newcodec,aac'), { 'vcodec': 'h264', 'acodec': 'aac', + 'dynamic_range': None, }) self.assertEqual(parse_codecs('av01.0.05M.08'), { 'vcodec': 'av01.0.05M.08', 'acodec': 'none', + 'dynamic_range': None, + }) + self.assertEqual(parse_codecs('vp9.2'), { + 'vcodec': 'vp9.2', + 'acodec': 'none', + 'dynamic_range': 'HDR10', + }) + self.assertEqual(parse_codecs('av01.0.12M.10.0.110.09.16.09.0'), { + 'vcodec': 
'av01.0.12M.10', + 'acodec': 'none', + 'dynamic_range': 'HDR10', + }) + self.assertEqual(parse_codecs('dvhe'), { + 'vcodec': 'dvhe', + 'acodec': 'none', + 'dynamic_range': 'DV', }) self.assertEqual(parse_codecs('theora, vorbis'), { 'vcodec': 'theora', 'acodec': 'vorbis', + 'dynamic_range': None, }) self.assertEqual(parse_codecs('unknownvcodec, unknownacodec'), { 'vcodec': 'unknownvcodec', @@ -1134,19 +1157,29 @@ def test_parse_count(self): self.assertEqual(parse_count('1000'), 1000) self.assertEqual(parse_count('1.000'), 1000) self.assertEqual(parse_count('1.1k'), 1100) + self.assertEqual(parse_count('1.1 k'), 1100) + self.assertEqual(parse_count('1,1 k'), 1100) self.assertEqual(parse_count('1.1kk'), 1100000) self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1,1kk'), 1100000) + self.assertEqual(parse_count('100 views'), 100) + self.assertEqual(parse_count('1,100 views'), 1100) self.assertEqual(parse_count('1.1kk views'), 1100000) + self.assertEqual(parse_count('10M views'), 10000000) + self.assertEqual(parse_count('has 10M views'), 10000000) def test_parse_resolution(self): self.assertEqual(parse_resolution(None), {}) self.assertEqual(parse_resolution(''), {}) - self.assertEqual(parse_resolution('1920x1080'), {'width': 1920, 'height': 1080}) - self.assertEqual(parse_resolution('1920×1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution(' 1920x1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('1920×1080 '), {'width': 1920, 'height': 1080}) self.assertEqual(parse_resolution('1920 x 1080'), {'width': 1920, 'height': 1080}) self.assertEqual(parse_resolution('720p'), {'height': 720}) self.assertEqual(parse_resolution('4k'), {'height': 2160}) self.assertEqual(parse_resolution('8K'), {'height': 4320}) + self.assertEqual(parse_resolution('pre_1920x1080_post'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('ep1x2'), {}) + self.assertEqual(parse_resolution('1920, 1080'), 
{'width': 1920, 'height': 1080}) def test_parse_bitrate(self): self.assertEqual(parse_bitrate(None), None) @@ -1197,12 +1230,49 @@ def test_is_html(self): def test_render_table(self): self.assertEqual( render_table( - ['a', 'bcd'], - [[123, 4], [9999, 51]]), + ['a', 'empty', 'bcd'], + [[123, '', 4], [9999, '', 51]]), + 'a empty bcd\n' + '123 4\n' + '9999 51') + + self.assertEqual( + render_table( + ['a', 'empty', 'bcd'], + [[123, '', 4], [9999, '', 51]], + hide_empty=True), 'a bcd\n' '123 4\n' '9999 51') + self.assertEqual( + render_table( + ['\ta', 'bcd'], + [['1\t23', 4], ['\t9999', 51]]), + ' a bcd\n' + '1 23 4\n' + '9999 51') + + self.assertEqual( + render_table( + ['a', 'bcd'], + [[123, 4], [9999, 51]], + delim='-'), + 'a bcd\n' + '--------\n' + '123 4\n' + '9999 51') + + self.assertEqual( + render_table( + ['a', 'bcd'], + [[123, 4], [9999, 51]], + delim='-', extra_gap=2), + 'a bcd\n' + '----------\n' + '123 4\n' + '9999 51') + def test_match_str(self): # Unary self.assertFalse(match_str('xy', {'x': 1200})) @@ -1231,6 +1301,7 @@ def test_match_str(self): self.assertFalse(match_str('x>2K', {'x': 1200})) self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) + self.assertTrue(match_str('x > 1:0:0', {'x': 3700})) # String self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) @@ -1367,21 +1438,21 @@ def test_dfxp2srt(self): </body> </tt>'''.encode('utf-8') srt_data = '''1 -00:00:02,080 --> 00:00:05,839 +00:00:02,080 --> 00:00:05,840 <font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font> 2 -00:00:02,080 --> 00:00:05,839 +00:00:02,080 --> 00:00:05,840 <b><font color="cyan" face="sansSerif" size="16"><font color="lime">part 1 </font>part 2</font></b> 3 -00:00:05,839 --> 00:00:09,560 +00:00:05,840 --> 00:00:09,560 <u><font color="lime">line 3 part 3</font></u> 4 -00:00:09,560 --> 00:00:12,359 +00:00:09,560 --> 00:00:12,360 <i><u><font 
color="yellow"><font color="lime">inner </font>style</font></u></i> @@ -1594,9 +1665,9 @@ def test_LazyList(self): self.assertEqual(repr(LazyList(it)), repr(it)) self.assertEqual(str(LazyList(it)), str(it)) - self.assertEqual(list(LazyList(it).reverse()), it[::-1]) - self.assertEqual(list(LazyList(it).reverse()[1:3:7]), it[::-1][1:3:7]) - self.assertEqual(list(LazyList(it).reverse()[::-1]), it) + self.assertEqual(list(LazyList(it, reverse=True)), it[::-1]) + self.assertEqual(list(reversed(LazyList(it))[::-1]), it) + self.assertEqual(list(reversed(LazyList(it))[1:3:7]), it[::-1][1:3:7]) def test_LazyList_laziness(self): @@ -1609,15 +1680,27 @@ def test(ll, idx, val, cache): test(ll, 5, 5, range(6)) test(ll, -3, 7, range(10)) - ll = LazyList(range(10)).reverse() + ll = LazyList(range(10), reverse=True) test(ll, -1, 0, range(1)) test(ll, 3, 6, range(10)) ll = LazyList(itertools.count()) test(ll, 10, 10, range(11)) - ll.reverse() + ll = reversed(ll) test(ll, -15, 14, range(15)) + def test_format_bytes(self): + self.assertEqual(format_bytes(0), '0.00B') + self.assertEqual(format_bytes(1000), '1000.00B') + self.assertEqual(format_bytes(1024), '1.00KiB') + self.assertEqual(format_bytes(1024**2), '1.00MiB') + self.assertEqual(format_bytes(1024**3), '1.00GiB') + self.assertEqual(format_bytes(1024**4), '1.00TiB') + self.assertEqual(format_bytes(1024**5), '1.00PiB') + self.assertEqual(format_bytes(1024**6), '1.00EiB') + self.assertEqual(format_bytes(1024**7), '1.00ZiB') + self.assertEqual(format_bytes(1024**8), '1.00YiB') + if __name__ == '__main__': unittest.main() diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py.disabled similarity index 100% rename from test/test_write_annotations.py rename to test/test_write_annotations.py.disabled diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index e831393e49..d9638658dd 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -26,29 +26,31 @@ def 
assertIsPlaylist(self, info): def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True - ie = YoutubePlaylistIE(dl) + ie = YoutubeTabIE(dl) result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertEqual(result['_type'], 'url') - self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') + self.assertEqual(YoutubeIE.extract_id(result['url']), 'FXxLjLQi3Fg') def test_youtube_course(self): + print('Skipping: Course URLs no longer exists') + return dl = FakeYDL() ie = YoutubePlaylistIE(dl) # TODO find a > 100 (paginating?) videos course result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') entries = list(result['entries']) - self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') + self.assertEqual(YoutubeIE.extract_id(entries[0]['url']), 'j9WZyLZCBzs') self.assertEqual(len(entries), 25) - self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') + self.assertEqual(YoutubeIE.extract_id(entries[-1]['url']), 'rYefUsYuEp0') def test_youtube_mix(self): dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') - entries = result['entries'] + ie = YoutubeTabIE(dl) + result = ie.extract('https://www.youtube.com/watch?v=tyITL_exICo&list=RDCLAK5uy_kLWIr9gv1XLlPbaDS965-Db4TrBoUTxQ8') + entries = list(result['entries']) self.assertTrue(len(entries) >= 50) original_video = entries[0] - self.assertEqual(original_video['id'], 'OQpdSVF_k_w') + self.assertEqual(original_video['id'], 'tyITL_exICo') def test_youtube_toptracks(self): print('Skipping: The playlist page gives error 500') @@ -68,10 +70,10 @@ def test_youtube_flat_playlist_extraction(self): entries = list(result['entries']) self.assertTrue(len(entries) == 1) video = entries[0] - self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['_type'], 
'url') self.assertEqual(video['ie_key'], 'Youtube') self.assertEqual(video['id'], 'BaW_jenozKc') - self.assertEqual(video['url'], 'BaW_jenozKc') + self.assertEqual(video['url'], 'https://www.youtube.com/watch?v=BaW_jenozKc') self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐') self.assertEqual(video['duration'], 10) self.assertEqual(video['uploader'], 'Philipp Hagemeister') diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index dcf6ab60d6..5f8114a1ce 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -14,9 +14,10 @@ from test.helper import FakeYDL, is_download_test from yt_dlp.extractor import YoutubeIE +from yt_dlp.jsinterp import JSInterpreter from yt_dlp.compat import compat_str, compat_urlretrieve -_TESTS = [ +_SIG_TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', 86, @@ -64,6 +65,29 @@ ) ] +_NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', + 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', + ), + ( + 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN', + ), + ( + 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', '3DIBbn3qdQ', + ), + ( + 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', + 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', + ), + ( + 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js', + 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw', + ), +] + @is_download_test class TestPlayerInfo(unittest.TestCase): @@ -97,35 +121,49 @@ def setUp(self): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, sig_input, expected_sig): - m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) - assert m, '%r should follow URL format' % url - test_id = m.group(1) +def t_factory(name, sig_func, url_pattern): + def make_tfunc(url, sig_input, expected_sig): + m = 
url_pattern.match(url) + assert m, '%r should follow URL format' % url + test_id = m.group('id') - def test_func(self): - basename = 'player-%s.js' % test_id - fn = os.path.join(self.TESTDATA_DIR, basename) + def test_func(self): + basename = f'player-{name}-{test_id}.js' + fn = os.path.join(self.TESTDATA_DIR, basename) - if not os.path.exists(fn): - compat_urlretrieve(url, fn) + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + self.assertEqual(sig_func(jscode, sig_input), expected_sig) - ydl = FakeYDL() - ie = YoutubeIE(ydl) - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - src_sig = ( - compat_str(string.printable[:sig_input]) - if isinstance(sig_input, int) else sig_input) - got_sig = func(src_sig) - self.assertEqual(got_sig, expected_sig) - - test_func.__name__ = str('test_signature_js_' + test_id) - setattr(TestSignature, test_func.__name__, test_func) + test_func.__name__ = f'test_{name}_js_{test_id}' + setattr(TestSignature, test_func.__name__, test_func) + return make_tfunc -for test_spec in _TESTS: - make_tfunc(*test_spec) +def signature(jscode, sig_input): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) + src_sig = ( + compat_str(string.printable[:sig_input]) + if isinstance(sig_input, int) else sig_input) + return func(src_sig) + + +def n_sig(jscode, sig_input): + funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) + return JSInterpreter(jscode).call_function(funcname, sig_input) + + +make_sig_test = t_factory( + 'signature', signature, re.compile(r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) +for test_spec in _SIG_TESTS: + make_sig_test(*test_spec) + +make_nsig_test = t_factory( + 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) +for test_spec in _NSIG_TESTS: + make_nsig_test(*test_spec) if __name__ == '__main__': diff --git a/yt_dlp/YoutubeDL.py 
b/yt_dlp/YoutubeDL.py index 422b26ffe9..1d1429b5f2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -5,10 +5,10 @@ import collections import contextlib -import copy import datetime import errno import fileinput +import functools import io import itertools import json @@ -25,9 +25,10 @@ import tokenize import traceback import random +import unicodedata +from enum import Enum from string import ascii_letters -from zipimport import zipimporter from .compat import ( compat_basestring, @@ -35,12 +36,14 @@ compat_kwargs, compat_numeric_types, compat_os_name, + compat_pycrypto_AES, compat_shlex_quote, compat_str, compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, compat_urllib_request_DataHandler, + windows_enable_vt_mode, ) from .cookies import load_cookies from .utils import ( @@ -52,9 +55,7 @@ DEFAULT_OUTTMPL, determine_ext, determine_protocol, - DOT_DESKTOP_LINK_TEMPLATE, - DOT_URL_LINK_TEMPLATE, - DOT_WEBLOC_LINK_TEMPLATE, + DownloadCancelled, DownloadError, encode_compat_str, encodeFilename, @@ -66,32 +67,37 @@ float_or_none, format_bytes, format_field, - STR_FORMAT_RE_TMPL, - STR_FORMAT_TYPES, + format_decimal_suffix, formatSeconds, GeoRestrictedError, + get_domain, HEADRequest, int_or_none, iri_to_uri, ISO3166Utils, + join_nonempty, LazyList, + LINK_TEMPLATES, locked_file, make_dir, make_HTTPS_handler, MaxDownloadsReached, network_exceptions, + number_of_digits, orderedSet, OUTTMPL_TYPES, PagedList, parse_filesize, PerRequestProxyHandler, platform_name, + Popen, PostProcessingError, preferredencoding, prepend_extension, - process_communicate_or_kill, + ReExtractInfo, register_socks_protocols, RejectedVideoReached, + remove_terminal_sequences, render_table, replace_extension, SameFileError, @@ -100,10 +106,13 @@ sanitize_url, sanitized_Request, std_headers, + STR_FORMAT_RE_TMPL, + STR_FORMAT_TYPES, str_or_none, strftime_or_none, subtitles_filename, - ThrottledDownload, + supports_terminal_sequences, + timetuple_from_msec, 
to_high_limit_path, traverse_obj, try_get, @@ -118,11 +127,12 @@ YoutubeDLRedirectHandler, ) from .cache import Cache +from .minicurses import format_text from .extractor import ( gen_extractor_classes, get_info_extractor, _LAZY_LOADER, - _PLUGIN_CLASSES + _PLUGIN_CLASSES as plugin_extractors ) from .extractor.openload import PhantomJSwrapper from .downloader import ( @@ -133,6 +143,8 @@ from .downloader.rtmp import rtmpdump_version from .postprocessor import ( get_postprocessor, + EmbedThumbnailPP, + FFmpegFixupDuplicateMoovPP, FFmpegFixupDurationPP, FFmpegFixupM3u8PP, FFmpegFixupM4aPP, @@ -141,8 +153,10 @@ FFmpegMergerPP, FFmpegPostProcessor, MoveFilesAfterDownloadPP, + _PLUGIN_CLASSES as plugin_postprocessors ) -from .version import __version__ +from .update import detect_variant +from .version import __version__, RELEASE_GIT_HEAD if compat_os_name == 'nt': import ctypes @@ -201,12 +215,15 @@ class YoutubeDL(object): simulate: Do not download the video files. If unset (or None), simulate only if listsubtitles, listformats or list_thumbnails is used format: Video format code. see "FORMAT SELECTION" for more details. + You can also pass a function. The function takes 'ctx' as + argument and returns the formats to download. + See "build_format_selector" for an implementation allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded. ignore_no_formats_error: Ignore "No video formats" error. Usefull for extracting metadata even if the video is not actually available for download (experimental) - format_sort: How to sort the video formats. see "Sorting Formats" - for more details. + format_sort: A list of fields by which to sort the video formats. + See "Sorting Formats" for more details. format_sort_force: Force the given format_sort. see "Sorting Formats" for more details. 
allow_multiple_video_streams: Allow multiple video streams to be merged @@ -214,7 +231,8 @@ class YoutubeDL(object): allow_multiple_audio_streams: Allow multiple audio streams to be merged into a single file check_formats Whether to test if the formats are downloadable. - Can be True (check all), False (check none) + Can be True (check all), False (check none), + 'selected' (check selected formats), or None (check only if requested by extractor) paths: Dictionary of output paths. The allowed keys are 'home' 'temp' and the keys of OUTTMPL_TYPES (in utils.py) @@ -225,9 +243,9 @@ class YoutubeDL(object): restrictfilenames: Do not allow "&" and spaces in file names trim_file_name: Limit length of filename (extension excluded) windowsfilenames: Force the filenames to be windows compatible - ignoreerrors: Do not stop on download errors - (Default True when running yt-dlp, - but False when directly accessing YoutubeDL class) + ignoreerrors: Do not stop on download/postprocessing errors. + Can be 'only_download' to ignore only download errors. + Default is 'only_download' for CLI, but False for API skip_playlist_after_errors: Number of allowed failures until the rest of the playlist is skipped force_generic_extractor: Force downloader to use the generic extractor @@ -245,6 +263,7 @@ class YoutubeDL(object): rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. logtostderr: Log messages to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file clean_infojson: Remove private fields from the infojson @@ -295,11 +314,13 @@ class YoutubeDL(object): file that is in the archive. break_on_reject: Stop the download process when encountering a video that has been filtered out. 
+ break_per_url: Whether break_on_reject and break_on_existing + should act on each input URL as opposed to for the entire queue cookiefile: File name where cookies should be read from and dumped to - cookiesfrombrowser: A tuple containing the name of the browser and the profile - name/path from where cookies are loaded. - Eg: ('chrome', ) or (vivaldi, 'default') - nocheckcertificate:Do not verify SSL certificates + cookiesfrombrowser: A tuple containing the name of the browser, the profile + name/path from where cookies are loaded, and the name of the + keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + nocheckcertificate: Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use @@ -309,20 +330,24 @@ class YoutubeDL(object): bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well + include_ads: Download ads as well (deprecated) default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Do not resolve URLs, return the immediate result. Pass in 'in_playlist' to only show this behavior for playlist items. + wait_for_video: If given, wait for scheduled streams to become available. + The value should be a tuple containing the range + (min_secs, max_secs) to wait between retries postprocessors: A list of dictionaries, each with an entry * key: The name of the postprocessor. See yt_dlp/postprocessor/__init__.py for a list. * when: When to run the postprocessor. Can be one of pre_process|before_dl|post_process|after_move.
Assumed to be 'post_process' if not given - post_hooks: A list of functions that get called as the final step + post_hooks: Deprecated - Register a custom postprocessor instead + A list of functions that get called as the final step for each video file, after all postprocessors have been called. The filename will be passed as the only argument. progress_hooks: A list of functions that get called on download @@ -350,10 +375,18 @@ class YoutubeDL(object): Progress hooks are guaranteed to be called at least once (with status "finished") if the download is successful. + postprocessor_hooks: A list of functions that get called on postprocessing + progress, with a dictionary with the entries + * status: One of "started", "processing", or "finished". + Check this first and ignore unknown values. + * postprocessor: Name of the postprocessor + * info_dict: The extracted info_dict + + Progress hooks are guaranteed to be called at least twice + (with status "started" and "finished") if the processing is successful. merge_output_format: Extension to use when merging formats. final_ext: Expected final extension; used to detect when the file was - already downloaded and converted. "merge_output_format" is - replaced by this extension when given + already downloaded and converted fixup: Automatically correct known faults of the file. One of: - "never": do nothing @@ -406,15 +439,20 @@ class YoutubeDL(object): use downloader suggested by extractor if None. compat_opts: Compatibility options. See "Differences in default behavior". The following options do not work when used through the API: - filename, abort-on-error, multistreams, no-live-chat, - no-clean-infojson, no-playlist-metafiles, no-keep-subs. + filename, abort-on-error, multistreams, no-live-chat, format-sort, + no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json. Refer __init__.py for their implementation + progress_template: Dictionary of templates for progress outputs.
+ Allowed keys are 'download', 'postprocess', + 'download-title' (console title) and 'postprocess-title'. + The template is mapped on a dictionary with keys 'progress' and 'info' The following parameters are not used by YoutubeDL itself, they are used by the downloader (see yt_dlp/downloader/common.py): nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, - max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize, external_downloader_args, hls_use_mpegts, http_chunk_size. + max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, + continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, + external_downloader_args, concurrent_fragment_downloads. The following options are used by the post processors: prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, @@ -451,15 +489,20 @@ class YoutubeDL(object): _NUMERIC_FIELDS = set(( 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', - 'timestamp', 'upload_year', 'upload_month', 'upload_day', + 'timestamp', 'release_timestamp', 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', 'average_rating', 'comment_count', 'age_limit', 'start_time', 'end_time', 'chapter_number', 'season_number', 'episode_number', 'track_number', 'disc_number', 'release_year', - 'playlist_index', )) + _format_selection_exts = { + 'audio': {'m4a', 'mp3', 'ogg', 'aac'}, + 'video': {'mp4', 'flv', 'webm', '3gp'}, + 'storyboards': {'mhtml'}, + } + params = None _ies = {} _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []} @@ -472,7 +515,10 @@ class YoutubeDL(object): _screen_file = None def __init__(self, params=None, auto_init=True): - """Create a FileDownloader object with the given options.""" + """Create a FileDownloader object with the given options. 
+ @param auto_init Whether to load the default extractors and print header (if verbose). + Set to 'no_verbose_header' to not print the header + """ if params is None: params = {} self._ies = {} @@ -482,26 +528,30 @@ def __init__(self, params=None, auto_init=True): self._first_webpage_request = True self._post_hooks = [] self._progress_hooks = [] + self._postprocessor_hooks = [] self._download_retcode = 0 self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr - self.params = { - # Default parameters - 'nocheckcertificate': False, - } - self.params.update(params) + self.params = params self.cache = Cache(self) + windows_enable_vt_mode() + self._allow_colors = { + 'screen': not self.params.get('no_color') and supports_terminal_sequences(self._screen_file), + 'err': not self.params.get('no_color') and supports_terminal_sequences(self._err_file), + } + if sys.version_info < (3, 6): self.report_warning( 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2]) if self.params.get('allow_unplayable_formats'): self.report_warning( - 'You have asked for unplayable formats to be listed/downloaded. ' - 'This is a developer option intended for debugging. ' - 'If you experience any issues while using this option, DO NOT open a bug report') + f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. ' + 'This is a developer option intended for debugging. 
\n' + ' If you experience any issues while using this option, ' + f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report') def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: @@ -517,16 +567,21 @@ def check_deprecated(param, option, suggestion): check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"') - for msg in self.params.get('warnings', []): + for msg in self.params.get('_warnings', []): self.report_warning(msg) + for msg in self.params.get('_deprecation_warnings', []): + self.deprecation_warning(msg) - if self.params.get('overwrites') is None: - self.params.pop('overwrites', None) - elif self.params.get('nooverwrites') is not None: + if 'list-formats' in self.params.get('compat_opts', []): + self.params['listformats_table'] = False + + if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: # nooverwrites was unnecessarily changed to overwrites # in 0c3d0f51778b153f65c21906031c2e091fcfb641 # This ensures compatibility with both keys self.params['overwrites'] = not self.params['nooverwrites'] + elif self.params.get('overwrites') is None: + self.params.pop('overwrites', None) else: self.params['nooverwrites'] = not self.params['overwrites'] @@ -544,16 +599,15 @@ def check_deprecated(param, option, suggestion): stdout=slave, stderr=self._err_file) try: - self._output_process = subprocess.Popen( - ['bidiv'] + width_args, **sp_kwargs - ) + self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) except OSError: - self._output_process = subprocess.Popen( - ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) + self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) self._output_channel = os.fdopen(master, 'rb') except OSError as ose: if ose.errno == errno.ENOENT: - self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . 
Make sure that fribidi is an executable file in one of the directories in your $PATH.') + self.report_warning( + 'Could not find fribidi executable, ignoring --bidi-workaround. ' + 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') else: raise @@ -571,16 +625,40 @@ def check_deprecated(param, option, suggestion): # Creating format selector here allows us to catch syntax errors before the extraction self.format_selector = ( - None if self.params.get('format') is None + self.params.get('format') if self.params.get('format') in (None, '-') + else self.params['format'] if callable(self.params['format']) else self.build_format_selector(self.params['format'])) self._setup_opener() - """Preload the archive, if any is specified""" + if auto_init: + if auto_init != 'no_verbose_header': + self.print_debug_header() + self.add_default_info_extractors() + + hooks = { + 'post_hooks': self.add_post_hook, + 'progress_hooks': self.add_progress_hook, + 'postprocessor_hooks': self.add_postprocessor_hook, + } + for opt, fn in hooks.items(): + for ph in self.params.get(opt, []): + fn(ph) + + for pp_def_raw in self.params.get('postprocessors', []): + pp_def = dict(pp_def_raw) + when = pp_def.pop('when', 'post_process') + self.add_post_processor( + get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)), + when=when) + + register_socks_protocols() + def preload_download_archive(fn): + """Preload the archive, if any is specified""" if fn is None: return False - self.write_debug('Loading archive file %r\n' % fn) + self.write_debug(f'Loading archive file {fn!r}') try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: @@ -594,25 +672,6 @@ def preload_download_archive(fn): self.archive = set() preload_download_archive(self.params.get('download_archive')) - if auto_init: - self.print_debug_header() - self.add_default_info_extractors() - - for pp_def_raw in self.params.get('postprocessors', []): - pp_def = 
dict(pp_def_raw) - when = pp_def.pop('when', 'post_process') - pp_class = get_postprocessor(pp_def.pop('key')) - pp = pp_class(self, **compat_kwargs(pp_def)) - self.add_post_processor(pp, when=when) - - for ph in self.params.get('post_hooks', []): - self.add_post_hook(ph) - - for ph in self.params.get('progress_hooks', []): - self.add_progress_hook(ph) - - register_socks_protocols() - def warn_if_short_id(self, argv): # short YouTube ID starting with dash? idxs = [ @@ -626,7 +685,7 @@ def warn_if_short_id(self, argv): ) self.report_warning( 'Long argument string detected. ' - 'Use -- to separate parameters and URLs, like this:\n%s\n' % + 'Use -- to separate parameters and URLs, like this:\n%s' % args_to_str(correct_argv)) def add_info_extractor(self, ie): @@ -673,9 +732,16 @@ def add_post_hook(self, ph): self._post_hooks.append(ph) def add_progress_hook(self, ph): - """Add the progress hook (currently only for the file downloader)""" + """Add the download progress hook""" self._progress_hooks.append(ph) + def add_postprocessor_hook(self, ph): + """Add the postprocessing progress hook""" + self._postprocessor_hooks.append(ph) + for pps in self._pps.values(): + for pp in pps: + pp.add_progress_hook(ph) + def _bidi_workaround(self, message): if not hasattr(self, '_output_channel'): return message @@ -716,6 +782,7 @@ def to_stderr(self, message, only_once=False): def to_console_title(self, message): if not self.params.get('consoletitle', False): return + message = remove_terminal_sequences(message) if compat_os_name == 'nt': if ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is @@ -752,14 +819,15 @@ def __exit__(self, *args): if self.params.get('cookiefile') is not None: self.cookiejar.save(ignore_discard=True, ignore_expires=True) - def trouble(self, message=None, tb=None): + def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. 
Depending on if the downloader has been configured to ignore download errors or not, this method may throw an exception or not when errors are found, after printing the message. - tb, if given, is additional traceback information. + @param tb If given, is additional traceback information + @param is_error Whether to raise error according to ignorerrors """ if message is not None: self.to_stderr(message) @@ -775,7 +843,9 @@ def trouble(self, message=None, tb=None): tb = ''.join(tb_data) if tb: self.to_stderr(tb) - if not self.params.get('ignoreerrors', False): + if not is_error: + return + if not self.params.get('ignoreerrors'): if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: exc_info = sys.exc_info()[1].exc_info else: @@ -788,6 +858,34 @@ def to_screen(self, message, skip_eol=False): self.to_stdout( message, skip_eol, quiet=self.params.get('quiet', False)) + class Styles(Enum): + HEADERS = 'yellow' + EMPHASIS = 'light blue' + ID = 'green' + DELIM = 'blue' + ERROR = 'red' + WARNING = 'yellow' + SUPPRESS = 'light black' + + def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False): + if test_encoding: + original_text = text + encoding = self.params.get('encoding') or getattr(handle, 'encoding', 'ascii') + text = text.encode(encoding, 'ignore').decode(encoding) + if fallback is not None and text != original_text: + text = fallback + if isinstance(f, self.Styles): + f = f.value + return format_text(text, f) if allow_colors else text if fallback is None else fallback + + def _format_screen(self, *args, **kwargs): + return self._format_text( + self._screen_file, self._allow_colors['screen'], *args, **kwargs) + + def _format_err(self, *args, **kwargs): + return self._format_text( + self._err_file, self._allow_colors['err'], *args, **kwargs) + def report_warning(self, message, only_once=False): ''' Print the message to stderr, it will be prefixed with 'WARNING:' @@ -798,24 +896,20 @@ def 
report_warning(self, message, only_once=False): else: if self.params.get('no_warnings'): return - if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': - _msg_header = '\033[0;33mWARNING:\033[0m' - else: - _msg_header = 'WARNING:' - warning_message = '%s %s' % (_msg_header, message) - self.to_stderr(warning_message, only_once) + self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) - def report_error(self, message, tb=None): + def deprecation_warning(self, message): + if self.params.get('logger') is not None: + self.params['logger'].warning('DeprecationWarning: {message}') + else: + self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) + + def report_error(self, message, *args, **kwargs): ''' Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' - if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': - _msg_header = '\033[0;31mERROR:\033[0m' - else: - _msg_header = 'ERROR:' - error_message = '%s %s' % (_msg_header, message) - self.trouble(error_message, tb) + self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs) def write_debug(self, message, only_once=False): '''Log debug message or Print message to stderr''' @@ -855,9 +949,14 @@ def parse_outtmpl(self): outtmpl_dict = self.params.get('outtmpl', {}) if not isinstance(outtmpl_dict, dict): outtmpl_dict = {'default': outtmpl_dict} + # Remove spaces in the default template + if self.params.get('restrictfilenames'): + sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-') + else: + sanitize = lambda x: x outtmpl_dict.update({ - k: v for k, v in DEFAULT_OUTTMPL.items() - if not outtmpl_dict.get(k)}) + k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() + if outtmpl_dict.get(k) is None}) for key, val in outtmpl_dict.items(): if isinstance(val, bytes): self.report_warning( @@ 
-907,7 +1006,7 @@ def escape_outtmpl(outtmpl): def validate_outtmpl(cls, outtmpl): ''' @return None or Exception object ''' outtmpl = re.sub( - STR_FORMAT_RE_TMPL.format('[^)]*', '[ljq]'), + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'), lambda mobj: f'{mobj.group(0)[:-1]}s', cls._outtmpl_expandpath(outtmpl)) try: @@ -916,13 +1015,22 @@ def validate_outtmpl(cls, outtmpl): except ValueError as err: return err - def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): - """ Make the template and info_dict suitable for substitution : ydl.outtmpl_escape(outtmpl) % info_dict """ - info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set - - info_dict = dict(info_dict) # Do not sanitize so as not to consume LazyList + @staticmethod + def _copy_infodict(info_dict): + info_dict = dict(info_dict) for key in ('__original_infodict', '__postprocessors'): info_dict.pop(key, None) + return info_dict + + def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): + """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict + @param sanitize Whether to sanitize the output as a filename. 
+ For backward compatibility, a function can also be passed + """ + + info_dict.setdefault('epoch', int(time.time())) # keep epoch consistent once set + + info_dict = self._copy_infodict(info_dict) info_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs formatSeconds(info_dict['duration'], '-' if sanitize else ':') if info_dict.get('duration', None) is not None @@ -931,15 +1039,16 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): if info_dict.get('resolution') is None: info_dict['resolution'] = self.format_resolution(info_dict, default=None) - # For fields playlist_index and autonumber convert all occurrences + # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences # of %(field)s to %(field)0Nd for backward compatibility field_size_compat_map = { - 'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')), + 'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0), + 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0), 'autonumber': self.params.get('autonumber_size') or 5, } TMPL_DICT = {} - EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljq]')) + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]')) MATH_FUNCTIONS = { '+': float.__add__, '-': float.__sub__, @@ -947,13 +1056,15 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None): # Field is of the form key1.key2... # where keys (except first) can be string, int or slice FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') - MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') + MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) INTERNAL_FORMAT_RE = re.compile(r'''(?x) (?P<negate>-)? 
(?P<fields>{field}) (?P<maths>(?:{math_op}{math_field})*) (?:>(?P<strf_format>.+?))? + (?P<alternate>(?<!\\),[^|&)]+)? + (?:&(?P<replacement>.*?))? (?:\|(?P<default>.*?))? $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE)) @@ -995,48 +1106,74 @@ def get_value(mdict): operator = None # Datetime formatting if mdict['strf_format']: - value = strftime_or_none(value, mdict['strf_format']) + value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ',')) return value na = self.params.get('outtmpl_na_placeholder', 'NA') + def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + return sanitize_filename(str(value), restricted=restricted, + is_id=re.search(r'(^|[_.])id(\.|$)', key)) + + sanitizer = sanitize if callable(sanitize) else filename_sanitizer + sanitize = bool(sanitize) + def _dumpjson_default(obj): if isinstance(obj, (set, LazyList)): return list(obj) - raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable') + return repr(obj) def create_key(outer_mobj): if not outer_mobj.group('has_key'): - return f'%{outer_mobj.group(0)}' + return outer_mobj.group(0) key = outer_mobj.group('key') mobj = re.match(INTERNAL_FORMAT_RE, key) - if mobj is None: - value, default, mobj = None, na, {'fields': ''} - else: + initial_field = mobj.group('fields') if mobj else '' + value, replacement, default = None, None, na + while mobj: mobj = mobj.groupdict() - default = mobj['default'] if mobj['default'] is not None else na + default = mobj['default'] if mobj['default'] is not None else default value = get_value(mobj) + replacement = mobj['replacement'] + if value is None and mobj['alternate']: + mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:]) + else: + break fmt = outer_mobj.group('format') if fmt == 's' and value is not None and key in field_size_compat_map.keys(): fmt = '0{:d}d'.format(field_size_compat_map[key]) - value = default if value is None else value + value = default if 
value is None else value if replacement is None else replacement + flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' - if fmt[-1] == 'l': - value, fmt = ', '.join(variadic(value)), str_fmt - elif fmt[-1] == 'j': - value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt - elif fmt[-1] == 'q': - value, fmt = compat_shlex_quote(str(value)), str_fmt + if fmt[-1] == 'l': # list + delim = '\n' if '#' in flags else ', ' + value, fmt = delim.join(variadic(value, allowed_types=(str, bytes))), str_fmt + elif fmt[-1] == 'j': # json + value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt + elif fmt[-1] == 'q': # quoted + value = map(str, variadic(value) if '#' in flags else [value]) + value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt + elif fmt[-1] == 'B': # bytes + value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') + value, fmt = value.decode('utf-8', 'ignore'), 's' + elif fmt[-1] == 'U': # unicode normalized + value, fmt = unicodedata.normalize( + # "+" = compatibility equivalence, "#" = NFD + 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'), + value), str_fmt + elif fmt[-1] == 'D': # decimal suffix + value, fmt = format_decimal_suffix(value, f'%{fmt[:-1]}f%s' if fmt[:-1] else '%d%s'), 's' + elif fmt[-1] == 'S': # filename sanitization + value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt elif fmt[-1] == 'c': - value = str(value) - if value is None: - value, fmt = default, 's' + if value: + value = str(value)[0] else: - value = value[0] + fmt = str_fmt elif fmt[-1] not in 'rs': # numeric value = float_or_none(value) if value is None: @@ -1048,7 +1185,7 @@ def create_key(outer_mobj): # So we convert it to repr first value, fmt = repr(value), str_fmt if fmt[-1] in 'csr': - value = sanitize(mobj['fields'].split('.')[-1], value) + value = sanitizer(initial_field, value) key = '%s\0%s' % (key.replace('%', '%\0'), 
outer_mobj.group('format')) TMPL_DICT[key] = value @@ -1056,30 +1193,24 @@ def create_key(outer_mobj): return EXTERNAL_FORMAT_RE.sub(create_key, outtmpl), TMPL_DICT + def evaluate_outtmpl(self, outtmpl, info_dict, *args, **kwargs): + outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs) + return self.escape_outtmpl(outtmpl) % info_dict + def _prepare_filename(self, info_dict, tmpl_type='default'): try: - sanitize = lambda k, v: sanitize_filename( - compat_str(v), - restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id' or k.endswith('_id'))) - outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']) - outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize) - outtmpl = self.escape_outtmpl(self._outtmpl_expandpath(outtmpl)) - filename = outtmpl % template_dict + outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])) + filename = self.evaluate_outtmpl(outtmpl, info_dict, True) force_ext = OUTTMPL_TYPES.get(tmpl_type) - if force_ext is not None: + if filename and force_ext is not None: filename = replace_extension(filename, force_ext, info_dict.get('ext')) # https://github.com/blackjack4494/youtube-dlc/issues/85 trim_file_name = self.params.get('trim_file_name', False) if trim_file_name: - fn_groups = filename.rsplit('.') - ext = fn_groups[-1] - sub_ext = '' - if len(fn_groups) > 2: - sub_ext = fn_groups[-2] - filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext])) + no_ext, *ext = filename.rsplit('.', 2) + filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.') return filename except ValueError as err: @@ -1090,6 +1221,8 @@ def prepare_filename(self, info_dict, dir_type='', warn=False): """Generate the output filename.""" filename = self._prepare_filename(info_dict, dir_type or 'default') + if not filename and dir_type not in ('', 'temp'): + return '' if warn: if not self.params.get('paths'): @@ -1166,7 +1299,7 @@ 
def add_extra_info(info_dict, extra_info): for key, value in extra_info.items(): info_dict.setdefault(key, value) - def extract_info(self, url, download=True, ie_key=None, extra_info={}, + def extract_info(self, url, download=True, ie_key=None, extra_info=None, process=True, force_generic_extractor=False): """ Return a list with a dictionary for each video extracted. @@ -1183,6 +1316,9 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, force_generic_extractor -- force using the generic extractor """ + if extra_info is None: + extra_info = {} + if not ie_key and force_generic_extractor: ie_key = 'Generic' @@ -1201,39 +1337,87 @@ def extract_info(self, url, download=True, ie_key=None, extra_info={}, temp_id = ie.get_temp_id(url) if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen("[%s] %s: has already been recorded in archive" % ( - ie_key, temp_id)) + self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive') + if self.params.get('break_on_existing', False): + raise ExistingVideoReached() break return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) else: self.report_error('no suitable InfoExtractor for URL %s' % url) - def __handle_extraction_exceptions(func, handle_all_errors=True): + def __handle_extraction_exceptions(func): + @functools.wraps(func) def wrapper(self, *args, **kwargs): - try: - return func(self, *args, **kwargs) - except GeoRestrictedError as e: - msg = e.msg - if e.countries: - msg += '\nThis video is available in %s.' % ', '.join( - map(ISO3166Utils.short2full, e.countries)) - msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' 
- self.report_error(msg) - except ExtractorError as e: # An error we somewhat expected - self.report_error(compat_str(e), e.format_traceback()) - except ThrottledDownload: - self.to_stderr('\r') - self.report_warning('The download speed is below throttle limit. Re-extracting data') - return wrapper(self, *args, **kwargs) - except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached): - raise - except Exception as e: - if handle_all_errors and self.params.get('ignoreerrors', False): - self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - else: + while True: + try: + return func(self, *args, **kwargs) + except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): raise + except ReExtractInfo as e: + if e.expected: + self.to_screen(f'{e}; Re-extracting data') + else: + self.to_stderr('\r') + self.report_warning(f'{e}; Re-extracting data') + continue + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' 
+ self.report_error(msg) + except ExtractorError as e: # An error we somewhat expected + self.report_error(str(e), e.format_traceback()) + except Exception as e: + if self.params.get('ignoreerrors'): + self.report_error(str(e), tb=encode_compat_str(traceback.format_exc())) + else: + raise + break return wrapper + def _wait_for_video(self, ie_result): + if (not self.params.get('wait_for_video') + or ie_result.get('_type', 'video') != 'video' + or ie_result.get('formats') or ie_result.get('url')): + return + + format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1] + last_msg = '' + + def progress(msg): + nonlocal last_msg + self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True) + last_msg = msg + + min_wait, max_wait = self.params.get('wait_for_video') + diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time()) + if diff is None and ie_result.get('live_status') == 'is_upcoming': + diff = random.randrange(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait) + self.report_warning('Release time of video is not known') + elif (diff or 0) <= 0: + self.report_warning('Video should already be available according to extracted info') + diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf')) + self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now') + + wait_till = time.time() + diff + try: + while True: + diff = wait_till - time.time() + if diff <= 0: + progress('') + raise ReExtractInfo('[wait] Wait period ended', expected=True) + progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}') + time.sleep(1) + except KeyboardInterrupt: + progress('') + raise ReExtractInfo('[wait] Interrupted by user', expected=True) + except BaseException as e: + if not isinstance(e, ReExtractInfo): + self.to_screen('') + raise + @__handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, 
process): ie_result = ie.extract(url) @@ -1249,6 +1433,7 @@ def __extract_info(self, url, ie, download, extra_info, process): ie_result.setdefault('original_url', extra_info['original_url']) self.add_default_extra_info(ie_result, ie, url) if process: + self._wait_for_video(ie_result) return self.process_ie_result(ie_result, download, extra_info) else: return ie_result @@ -1259,6 +1444,7 @@ def add_default_extra_info(self, ie_result, ie, url): 'webpage_url': url, 'original_url': url, 'webpage_url_basename': url_basename(url), + 'webpage_url_domain': get_domain(url), }) if ie is not None: self.add_extra_info(ie_result, { @@ -1287,10 +1473,15 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or extract_flat is True): info_copy = ie_result.copy() - self.add_extra_info(info_copy, extra_info) ie = try_get(ie_result.get('ie_key'), self.get_info_extractor) + if ie and not ie_result.get('id'): + info_copy['id'] = ie.get_temp_id(ie_result['url']) self.add_default_extra_info(info_copy, ie, ie_result['url']) + self.add_extra_info(info_copy, extra_info) + info_copy, _ = self.pre_process(info_copy) self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + if self.params.get('force_write_download_archive', False): + self.record_download_archive(info_copy) return ie_result if result_type == 'video': @@ -1306,7 +1497,7 @@ def process_ie_result(self, ie_result, download=True, extra_info=None): self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls)) ie_result['additional_entries'] = [ self.extract_info( - url, download, extra_info, + url, download, extra_info=extra_info, force_generic_extractor=self.params.get('force_generic_extractor')) for url in additional_urls ] @@ -1378,6 +1569,7 @@ def _fixup(r): 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 
'webpage_url_domain': get_domain(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], }) return r @@ -1398,12 +1590,14 @@ def __process_playlist(self, ie_result, download): self.to_screen('[download] Downloading playlist: %s' % playlist) if 'entries' not in ie_result: - raise EntryNotInPlaylist() + raise EntryNotInPlaylist('There are no entries') + + MissingEntry = object() incomplete_entries = bool(ie_result.get('requested_entries')) if incomplete_entries: - def fill_missing_entries(entries, indexes): - ret = [None] * max(*indexes) - for i, entry in zip(indexes, entries): + def fill_missing_entries(entries, indices): + ret = [MissingEntry] * max(indices) + for i, entry in zip(indices, entries): ret[i - 1] = entry return ret ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) @@ -1433,27 +1627,34 @@ def iter_playlistitems(format): msg = ( 'Downloading %d videos' if not isinstance(ie_entries, list) else 'Collected %d videos; downloading %%d of them' % len(ie_entries)) - if not isinstance(ie_entries, (list, PagedList)): - ie_entries = LazyList(ie_entries) - def get_entry(i): - return YoutubeDL.__handle_extraction_exceptions( - lambda self, i: ie_entries[i - 1], - False - )(self, i) + if isinstance(ie_entries, list): + def get_entry(i): + return ie_entries[i - 1] + else: + if not isinstance(ie_entries, (PagedList, LazyList)): + ie_entries = LazyList(ie_entries) + + def get_entry(i): + return YoutubeDL.__handle_extraction_exceptions( + lambda self, i: ie_entries[i - 1] + )(self, i) entries = [] - for i in playlistitems or itertools.count(playliststart): + items = playlistitems if playlistitems is not None else itertools.count(playliststart) + for i in items: + if i == 0: + continue if playlistitems is None and playlistend is not None and playlistend < i: break entry = None try: entry = get_entry(i) - if entry is None: + if entry is MissingEntry: raise EntryNotInPlaylist() except (IndexError, 
EntryNotInPlaylist): if incomplete_entries: - raise EntryNotInPlaylist() + raise EntryNotInPlaylist(f'Entry {i} cannot be found') elif not playlistitems: break entries.append(entry) @@ -1471,11 +1672,12 @@ def get_entry(i): if entry is not None] n_entries = len(entries) - if not playlistitems and (playliststart or playlistend): + if not playlistitems and (playliststart != 1 or playlistend): playlistitems = list(range(playliststart, playliststart + n_entries)) ie_result['requested_entries'] = playlistitems - if self.params.get('allow_playlist_files', True): + _infojson_written = False + if not self.params.get('simulate') and self.params.get('allow_playlist_files', True): ie_copy = { 'playlist': playlist, 'playlist_id': ie_result.get('id'), @@ -1483,41 +1685,19 @@ def get_entry(i): 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), 'playlist_index': 0, + 'n_entries': n_entries, } ie_copy.update(dict(ie_result)) - if self.params.get('writeinfojson', False): - infofn = self.prepare_filename(ie_copy, 'pl_infojson') - if not self._ensure_dir_exists(encodeFilename(infofn)): - return - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)): - self.to_screen('[info] Playlist metadata is already present') - else: - self.to_screen('[info] Writing playlist metadata as JSON to: ' + infofn) - try: - write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) - except (OSError, IOError): - self.report_error('Cannot write playlist metadata to JSON file ' + infofn) - + _infojson_written = self._write_info_json( + 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) + if _infojson_written is None: + return + if self._write_description('playlist', ie_result, + self.prepare_filename(ie_copy, 'pl_description')) is None: + return # TODO: This should be passed to ThumbnailsConvertor if necessary - self._write_thumbnails(ie_copy, 
self.prepare_filename(ie_copy, 'pl_thumbnail')) - - if self.params.get('writedescription', False): - descfn = self.prepare_filename(ie_copy, 'pl_description') - if not self._ensure_dir_exists(encodeFilename(descfn)): - return - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)): - self.to_screen('[info] Playlist description is already present') - elif ie_result.get('description') is None: - self.report_warning('There\'s no playlist description to write.') - else: - try: - self.to_screen('[info] Writing playlist description to: ' + descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: - descfile.write(ie_result['description']) - except (OSError, IOError): - self.report_error('Cannot write playlist description file ' + descfn) - return + self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) if self.params.get('playlistreverse', False): entries = entries[::-1] @@ -1531,8 +1711,8 @@ def get_entry(i): max_failures = self.params.get('skip_playlist_after_errors') or float('inf') for i, entry_tuple in enumerate(entries, 1): playlist_index, entry = entry_tuple - if 'playlist-index' in self.params.get('compat_options', []): - playlist_index = playlistitems[i - 1] if playlistitems else i + if 'playlist-index' in self.params.get('compat_opts', []): + playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1 self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) # This __x_forwarded_for_ip thing is a bit ugly but requires # minimal changes @@ -1551,6 +1731,7 @@ def get_entry(i): 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'webpage_url_domain': get_domain(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], } @@ -1564,9 +1745,14 @@ def get_entry(i): self.report_error( 'Skipping the remaining entries in playlist "%s" 
since %d items failed extraction' % (playlist, failures)) break - # TODO: skip failed (empty) entries? playlist_results.append(entry_result) ie_result['entries'] = playlist_results + + # Write the updated info to json + if _infojson_written and self._write_info_json( + 'updated playlist', ie_result, + self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None: + return self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result @@ -1636,6 +1822,29 @@ def _filter(f): return op(actual_value, comparison_value) return _filter + def _check_formats(self, formats): + for f in formats: + self.to_screen('[info] Testing format %s' % f['format_id']) + path = self.get_output_path('temp') + if not self._ensure_dir_exists(f'{path}/'): + continue + temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None) + temp_file.close() + try: + success, _ = self.dl(temp_file.name, f, test=True) + except (DownloadError, IOError, OSError, ValueError) + network_exceptions: + success = False + finally: + if os.path.exists(temp_file.name): + try: + os.remove(temp_file.name) + except OSError: + self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) + if success: + yield f + else: + self.to_screen('[info] Unable to download format %s. Skipping...' 
% f['format_id']) + def _default_format_spec(self, info_dict, download=True): def can_merge(): @@ -1675,7 +1884,7 @@ def syntax_error(note, start): allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False), 'video': self.params.get('allow_multiple_video_streams', False)} - check_formats = self.params.get('check_formats') + check_formats = self.params.get('check_formats') == 'selected' def _parse_filter(tokens): filter_parts = [] @@ -1812,11 +2021,18 @@ def _merge(formats_pair): else: output_ext = 'mkv' + filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info)) + new_dict = { 'requested_formats': formats_info, - 'format': '+'.join(fmt_info.get('format') for fmt_info in formats_info), - 'format_id': '+'.join(fmt_info.get('format_id') for fmt_info in formats_info), + 'format': '+'.join(filtered('format')), + 'format_id': '+'.join(filtered('format_id')), 'ext': output_ext, + 'protocol': '+'.join(map(determine_protocol, formats_info)), + 'language': '+'.join(orderedSet(filtered('language'))) or None, + 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None, + 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None, + 'tbr': sum(filtered('tbr', 'vbr', 'abr')), } if the_only_video: @@ -1825,6 +2041,7 @@ def _merge(formats_pair): 'height': the_only_video.get('height'), 'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video), 'fps': the_only_video.get('fps'), + 'dynamic_range': the_only_video.get('dynamic_range'), 'vcodec': the_only_video.get('vcodec'), 'vbr': the_only_video.get('vbr'), 'stretched_ratio': the_only_video.get('stretched_ratio'), @@ -1834,6 +2051,7 @@ def _merge(formats_pair): new_dict.update({ 'acodec': the_only_audio.get('acodec'), 'abr': the_only_audio.get('abr'), + 'asr': the_only_audio.get('asr'), }) return new_dict @@ -1842,26 +2060,7 @@ def _check_formats(formats): if not check_formats: yield from formats return - for f in 
formats: - self.to_screen('[info] Testing format %s' % f['format_id']) - temp_file = tempfile.NamedTemporaryFile( - suffix='.tmp', delete=False, - dir=self.get_output_path('temp') or None) - temp_file.close() - try: - success, _ = self.dl(temp_file.name, f, test=True) - except (DownloadError, IOError, OSError, ValueError) + network_exceptions: - success = False - finally: - if os.path.exists(temp_file.name): - try: - os.remove(temp_file.name) - except OSError: - self.report_warning('Unable to delete temporary file "%s"' % temp_file.name) - if success: - yield f - else: - self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id']) + yield from self._check_formats(formats) def _build_selector_function(selector): if isinstance(selector, list): # , @@ -1889,8 +2088,7 @@ def selector_function(ctx): selector_1, selector_2 = map(_build_selector_function, selector.selector) def selector_function(ctx): - for pair in itertools.product( - selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))): + for pair in itertools.product(selector_1(ctx), selector_2(ctx)): yield _merge(pair) elif selector.type == SINGLE: # atom @@ -1899,7 +2097,7 @@ def selector_function(ctx): # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector if format_spec == 'all': def selector_function(ctx): - yield from _check_formats(ctx['formats']) + yield from _check_formats(ctx['formats'][::-1]) elif format_spec == 'mergeall': def selector_function(ctx): formats = list(_check_formats(ctx['formats'])) @@ -1934,9 +2132,14 @@ def selector_function(ctx): filter_f = lambda f: _filter_f(f) and ( f.get('vcodec') != 'none' or f.get('acodec') != 'none') else: - filter_f = ((lambda f: f.get('ext') == format_spec) - if format_spec in ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] # extension - else (lambda f: f.get('format_id') == format_spec)) # id + if format_spec in self._format_selection_exts['audio']: + filter_f = lambda f: f.get('ext') 
== format_spec and f.get('acodec') != 'none' + elif format_spec in self._format_selection_exts['video']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none' + elif format_spec in self._format_selection_exts['storyboards']: + filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none' + else: + filter_f = lambda f: f.get('format_id') == format_spec # id def selector_function(ctx): formats = list(ctx['formats']) @@ -1955,7 +2158,7 @@ def selector_function(ctx): filters = [self._build_format_filter(f) for f in selector.filters] def final_selector(ctx): - ctx_copy = copy.deepcopy(ctx) + ctx_copy = dict(ctx) for _filter in filters: ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) return selector_function(ctx_copy) @@ -2013,53 +2216,45 @@ def _calc_cookies(self, info_dict): self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') + def _sort_thumbnails(self, thumbnails): + thumbnails.sort(key=lambda t: ( + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', + t.get('url'))) + def _sanitize_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if thumbnails is None: thumbnail = info_dict.get('thumbnail') if thumbnail: info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] - if thumbnails: - thumbnails.sort(key=lambda t: ( - t.get('preference') if t.get('preference') is not None else -1, - t.get('width') if t.get('width') is not None else -1, - t.get('height') if t.get('height') is not None else -1, - t.get('id') if t.get('id') is not None else '', - t.get('url'))) + if not thumbnails: + return - def thumbnail_tester(): - if self.params.get('check_formats'): - test_all = True - to_screen = lambda msg: self.to_screen(f'[info] {msg}') - else: - 
test_all = False - to_screen = self.write_debug + def check_thumbnails(thumbnails): + for t in thumbnails: + self.to_screen(f'[info] Testing thumbnail {t["id"]}') + try: + self.urlopen(HEADRequest(t['url'])) + except network_exceptions as err: + self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...') + continue + yield t - def test_thumbnail(t): - if not test_all and not t.get('_test_url'): - return True - to_screen('Testing thumbnail %s' % t['id']) - try: - self.urlopen(HEADRequest(t['url'])) - except network_exceptions as err: - to_screen('Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % ( - t['id'], t['url'], error_to_compat_str(err))) - return False - return True + self._sort_thumbnails(thumbnails) + for i, t in enumerate(thumbnails): + if t.get('id') is None: + t['id'] = '%d' % i + if t.get('width') and t.get('height'): + t['resolution'] = '%dx%d' % (t['width'], t['height']) + t['url'] = sanitize_url(t['url']) - return test_thumbnail - - for i, t in enumerate(thumbnails): - if t.get('id') is None: - t['id'] = '%d' % i - if t.get('width') and t.get('height'): - t['resolution'] = '%dx%d' % (t['width'], t['height']) - t['url'] = sanitize_url(t['url']) - - if self.params.get('check_formats') is not False: - info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse() - else: - info_dict['thumbnails'] = thumbnails + if self.params.get('check_formats') is True: + info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True) + else: + info_dict['thumbnails'] = thumbnails def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' @@ -2110,6 +2305,9 @@ def sanitize_numeric_fields(info): if info_dict.get('display_id') is None and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] + if info_dict.get('duration') is not None: + info_dict['duration_string'] = formatSeconds(info_dict['duration']) + for 
ts_key, date_key in ( ('timestamp', 'upload_date'), ('release_timestamp', 'release_date'), @@ -2162,16 +2360,19 @@ def sanitize_numeric_fields(info): info_dict['requested_subtitles'] = self.process_subtitles( info_dict['id'], subtitles, automatic_captions) - # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available formats = [info_dict] else: formats = info_dict['formats'] + info_dict['__has_drm'] = any(f.get('has_drm') for f in formats) if not self.params.get('allow_unplayable_formats'): formats = [f for f in formats if not f.get('has_drm')] - info_dict['__has_drm'] = len(info_dict.get('formats') or ['']) > len(formats) + + if info_dict.get('is_live'): + get_from_start = bool(self.params.get('live_from_start')) + formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] if not formats: self.raise_no_formats(info_dict) @@ -2208,10 +2409,18 @@ def is_wellformed(f): formats_dict[format_id].append(format) # Make sure all formats have unique format_id + common_exts = set(itertools.chain(*self._format_selection_exts.values())) for format_id, ambiguous_formats in formats_dict.items(): - if len(ambiguous_formats) > 1: - for i, format in enumerate(ambiguous_formats): + ambigious_id = len(ambiguous_formats) > 1 + for i, format in enumerate(ambiguous_formats): + if ambigious_id: format['format_id'] = '%s-%d' % (format_id, i) + if format.get('ext') is None: + format['ext'] = determine_ext(format['url']).lower() + # Ensure there is no conflict between id and ext in format selection + # See https://github.com/yt-dlp/yt-dlp/issues/1282 + if format['format_id'] != format['ext'] and format['format_id'] in common_exts: + format['format_id'] = 'f%s' % format['format_id'] for i, format in enumerate(formats): if format.get('format') is None: @@ -2220,13 +2429,16 @@ def is_wellformed(f): res=self.format_resolution(format), note=format_field(format, 'format_note', ' (%s)'), ) - # Automatically 
determine file extension if missing - if format.get('ext') is None: - format['ext'] = determine_ext(format['url']).lower() - # Automatically determine protocol if missing (useful for format - # selection purposes) if format.get('protocol') is None: format['protocol'] = determine_protocol(format) + if format.get('resolution') is None: + format['resolution'] = self.format_resolution(format, default=None) + if format.get('dynamic_range') is None and format.get('vcodec') != 'none': + format['dynamic_range'] = 'SDR' + if (info_dict.get('duration') and format.get('tbr') + and not format.get('filesize') and not format.get('filesize_approx')): + format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) + # Add HTTP headers, so that external programs can use them from the # json output full_format_info = info_dict.copy() @@ -2238,6 +2450,9 @@ def is_wellformed(f): # TODO Central sorting goes here + if self.params.get('check_formats') is True: + formats = LazyList(self._check_formats(formats[::-1]), reverse=True) + if not formats or formats[0] is not info_dict: # only set the 'formats' fields if the original info_dict list them # otherwise we end up with a circular reference, the first (and unique) @@ -2247,20 +2462,21 @@ def is_wellformed(f): info_dict, _ = self.pre_process(info_dict) + # The pre-processors may have modified the formats + formats = info_dict.get('formats', [info_dict]) + + list_only = self.params.get('simulate') is None and ( + self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + interactive_format_selection = not list_only and self.format_selector == '-' if self.params.get('list_thumbnails'): self.list_thumbnails(info_dict) - if self.params.get('listformats'): - if not info_dict.get('formats') and not info_dict.get('url'): - self.to_screen('%s has no formats' % info_dict['id']) - else: - self.list_formats(info_dict) if self.params.get('listsubtitles'): if 'automatic_captions' in 
info_dict: self.list_subtitles( info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') - list_only = self.params.get('simulate') is None and ( - self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + if self.params.get('listformats') or interactive_format_selection: + self.list_formats(info_dict) if list_only: # Without this printing, -F --print-json will not work self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) @@ -2272,33 +2488,48 @@ def is_wellformed(f): self.write_debug('Default format spec: %s' % req_format) format_selector = self.build_format_selector(req_format) - # While in format selection we may need to have an access to the original - # format set in order to calculate some metrics or do some processing. - # For now we need to be able to guess whether original formats provided - # by extractor are incomplete or not (i.e. whether extractor provides only - # video-only or audio-only formats) for proper formats selection for - # extractors with such incomplete formats (see - # https://github.com/ytdl-org/youtube-dl/pull/5556). - # Since formats may be filtered during format selection and may not match - # the original formats the results may be incorrect. Thus original formats - # or pre-calculated metrics should be passed to format selection routines - # as well. - # We will pass a context object containing all necessary additional data - # instead of just formats. - # This fixes incorrect format selection issue (see - # https://github.com/ytdl-org/youtube-dl/issues/10083). 
- incomplete_formats = ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + while True: + if interactive_format_selection: + req_format = input( + self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS)) + try: + format_selector = self.build_format_selector(req_format) + except SyntaxError as err: + self.report_error(err, tb=False, is_error=False) + continue - ctx = { - 'formats': formats, - 'incomplete_formats': incomplete_formats, - } + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/ytdl-org/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. + # This fixes incorrect format selection issue (see + # https://github.com/ytdl-org/youtube-dl/issues/10083). 
+ incomplete_formats = ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) + if interactive_format_selection and not formats_to_download: + self.report_error('Requested format is not available', tb=False, is_error=False) + continue + break - formats_to_download = list(format_selector(ctx)) if not formats_to_download: if not self.params.get('ignore_no_formats_error'): raise ExtractorError('Requested format is not available', expected=True, @@ -2318,7 +2549,7 @@ def is_wellformed(f): new_info['__original_infodict'] = info_dict new_info.update(fmt) self.process_info(new_info) - # We update the info dict with the best quality format (backwards compatibility) + # We update the info dict with the selected best quality format (backwards compatibility) if formats_to_download: info_dict.update(formats_to_download[-1]) return info_dict @@ -2342,20 +2573,24 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions): if self.params.get('allsubtitles', False): requested_langs = all_sub_langs elif self.params.get('subtitleslangs', False): - requested_langs = set() - for lang in self.params.get('subtitleslangs'): - if lang == 'all': - requested_langs.update(all_sub_langs) + # A list is used so that the order of languages will be the same as + # given in subtitleslangs. 
See https://github.com/yt-dlp/yt-dlp/issues/1041 + requested_langs = [] + for lang_re in self.params.get('subtitleslangs'): + if lang_re == 'all': + requested_langs.extend(all_sub_langs) continue - discard = lang[0] == '-' + discard = lang_re[0] == '-' if discard: - lang = lang[1:] - current_langs = filter(re.compile(lang + '$').match, all_sub_langs) + lang_re = lang_re[1:] + current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs) if discard: for lang in current_langs: - requested_langs.discard(lang) + while lang in requested_langs: + requested_langs.remove(lang) else: - requested_langs.update(current_langs) + requested_langs.extend(current_langs) + requested_langs = orderedSet(requested_langs) elif 'en' in available_subs: requested_langs = ['en'] else: @@ -2412,10 +2647,12 @@ def print_optional(field): if self.params.get('forceprint') or self.params.get('forcejson'): self.post_extract(info_dict) for tmpl in self.params.get('forceprint', []): - if re.match(r'\w+$', tmpl): + mobj = re.match(r'\w+(=?)$', tmpl) + if mobj and mobj.group(1): + tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s' + elif mobj: tmpl = '%({})s'.format(tmpl) - tmpl, info_copy = self.prepare_outtmpl(tmpl, info_dict) - self.to_stdout(self.escape_outtmpl(tmpl) % info_copy) + self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict)) print_mandatory('title') print_mandatory('id') @@ -2438,7 +2675,7 @@ def dl(self, name, info, subtitle=False, test=False): verbose = self.params.get('verbose') params = { 'test': True, - 'quiet': not verbose, + 'quiet': self.params.get('quiet') or not verbose, 'verbose': verbose, 'noprogress': not verbose, 'nopart': True, @@ -2455,7 +2692,10 @@ def dl(self, name, info, subtitle=False, test=False): fd.add_progress_hook(ph) urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']]) self.write_debug('Invoking downloader on "%s"' % urls) - new_info = dict(info) + + # Note: Ideally info should be a deep-copied so that hooks cannot modify it. 
+ # But it may contain objects that are not deep-copyable + new_info = self._copy_infodict(info) if new_info.get('http_headers') is None: new_info['http_headers'] = self._calc_headers(new_info) return fd.download(name, new_info, subtitle) @@ -2470,6 +2710,9 @@ def process_info(self, info_dict): if self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() + if info_dict.get('is_live') and not self.params.get('live_from_start'): + info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + # TODO: backward compatibility, to be removed info_dict['fulltitle'] = info_dict['title'] @@ -2493,37 +2736,45 @@ def process_info(self, info_dict): if self.params.get('simulate'): if self.params.get('force_write_download_archive', False): self.record_download_archive(info_dict) - # Do nothing else if in simulate mode return if full_filename is None: return - if not self._ensure_dir_exists(encodeFilename(full_filename)): return if not self._ensure_dir_exists(encodeFilename(temp_filename)): return - if self.params.get('writedescription', False): - descfn = self.prepare_filename(info_dict, 'description') - if not self._ensure_dir_exists(encodeFilename(descfn)): - return - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(descfn)): - self.to_screen('[info] Video description is already present') - elif info_dict.get('description') is None: - self.report_warning('There\'s no description to write.') - else: - try: - self.to_screen('[info] Writing video description to: ' + descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: - descfile.write(info_dict['description']) - except (OSError, IOError): - self.report_error('Cannot write description file ' + descfn) - return + if self._write_description('video', info_dict, + self.prepare_filename(info_dict, 'description')) is None: + return + sub_files = self._write_subtitles(info_dict, temp_filename) + if sub_files is None: + return + 
files_to_move.update(dict(sub_files)) + + thumb_files = self._write_thumbnails( + 'video', info_dict, temp_filename, self.prepare_filename(info_dict, 'thumbnail')) + if thumb_files is None: + return + files_to_move.update(dict(thumb_files)) + + infofn = self.prepare_filename(info_dict, 'infojson') + _infojson_written = self._write_info_json('video', info_dict, infofn) + if _infojson_written: + info_dict['infojson_filename'] = infofn + # For backward compatibility, even though it was a private field + info_dict['__infojson_filename'] = infofn + elif _infojson_written is None: + return + + # Note: Annotations are deprecated + annofn = None if self.params.get('writeannotations', False): annofn = self.prepare_filename(info_dict, 'annotation') + if annofn: if not self._ensure_dir_exists(encodeFilename(annofn)): return if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(annofn)): @@ -2541,115 +2792,44 @@ def process_info(self, info_dict): self.report_error('Cannot write annotations file: ' + annofn) return - subtitles_are_requested = any([self.params.get('writesubtitles', False), - self.params.get('writeautomaticsub')]) - - if subtitles_are_requested and info_dict.get('requested_subtitles'): - # subtitles download errors are already managed as troubles in relevant IE - # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['requested_subtitles'] - # ie = self.get_info_extractor(info_dict['extractor_key']) - for sub_lang, sub_info in subtitles.items(): - sub_format = sub_info['ext'] - sub_filename = subtitles_filename(temp_filename, sub_lang, sub_format, info_dict.get('ext')) - sub_filename_final = subtitles_filename( - self.prepare_filename(info_dict, 'subtitle'), sub_lang, sub_format, info_dict.get('ext')) - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) - sub_info['filepath'] 
= sub_filename - files_to_move[sub_filename] = sub_filename_final - else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - if sub_info.get('data') is not None: - try: - # Use newline='' to prevent conversion of newline characters - # See https://github.com/ytdl-org/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_info['data']) - sub_info['filepath'] = sub_filename - files_to_move[sub_filename] = sub_filename_final - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return - else: - try: - self.dl(sub_filename, sub_info.copy(), subtitle=True) - sub_info['filepath'] = sub_filename - files_to_move[sub_filename] = sub_filename_final - except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err))) - continue - - if self.params.get('writeinfojson', False): - infofn = self.prepare_filename(info_dict, 'infojson') - if not self._ensure_dir_exists(encodeFilename(infofn)): - return - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(infofn)): - self.to_screen('[info] Video metadata is already present') - else: - self.to_screen('[info] Writing video metadata as JSON to: ' + infofn) - try: - write_json_file(self.sanitize_info(info_dict, self.params.get('clean_infojson', True)), infofn) - except (OSError, IOError): - self.report_error('Cannot write video metadata to JSON file ' + infofn) - return - info_dict['__infojson_filename'] = infofn - - for thumb_ext in self._write_thumbnails(info_dict, temp_filename): - thumb_filename_temp = replace_extension(temp_filename, thumb_ext, info_dict.get('ext')) - thumb_filename = replace_extension( - self.prepare_filename(info_dict, 'thumbnail'), thumb_ext, info_dict.get('ext')) - files_to_move[thumb_filename_temp] = thumb_filename - # 
Write internet shortcut files - url_link = webloc_link = desktop_link = False - if self.params.get('writelink', False): - if sys.platform == "darwin": # macOS. - webloc_link = True - elif sys.platform.startswith("linux"): - desktop_link = True - else: # if sys.platform in ['win32', 'cygwin']: - url_link = True - if self.params.get('writeurllink', False): - url_link = True - if self.params.get('writewebloclink', False): - webloc_link = True - if self.params.get('writedesktoplink', False): - desktop_link = True - - if url_link or webloc_link or desktop_link: + def _write_link_file(link_type): if 'webpage_url' not in info_dict: self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information') - return - ascii_url = iri_to_uri(info_dict['webpage_url']) - - def _write_link_file(extension, template, newline, embed_filename): - linkfn = replace_extension(full_filename, extension, info_dict.get('ext')) + return False + linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) + if not self._ensure_dir_exists(encodeFilename(linkfn)): + return False if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): - self.to_screen('[info] Internet shortcut is already present') - else: - try: - self.to_screen('[info] Writing internet shortcut to: ' + linkfn) - with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile: - template_vars = {'url': ascii_url} - if embed_filename: - template_vars['filename'] = linkfn[:-(len(extension) + 1)] - linkfile.write(template % template_vars) - except (OSError, IOError): - self.report_error('Cannot write internet shortcut ' + linkfn) - return False + self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present') + return True + try: + self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') + with 
io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + newline='\r\n' if link_type == 'url' else '\n') as linkfile: + template_vars = {'url': iri_to_uri(info_dict['webpage_url'])} + if link_type == 'desktop': + template_vars['filename'] = linkfn[:-(len(link_type) + 1)] + linkfile.write(LINK_TEMPLATES[link_type] % template_vars) + except (OSError, IOError): + self.report_error(f'Cannot write internet shortcut {linkfn}') + return False return True - if url_link: - if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False): - return - if webloc_link: - if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False): - return - if desktop_link: - if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True): - return + write_links = { + 'url': self.params.get('writeurllink'), + 'webloc': self.params.get('writewebloclink'), + 'desktop': self.params.get('writedesktoplink'), + } + if self.params.get('writelink'): + link_type = ('webloc' if sys.platform == 'darwin' + else 'desktop' if sys.platform.startswith('linux') + else 'url') + write_links[link_type] = True + + if any(should_write and not _write_link_file(link_type) + for link_type, should_write in write_links.items()): + return try: info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) @@ -2713,10 +2893,19 @@ def compatible_formats(formats): requested_formats = info_dict['requested_formats'] old_ext = info_dict['ext'] - if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): - info_dict['ext'] = 'mkv' - self.report_warning( - 'Requested formats are incompatible for merge and will be merged into mkv.') + if self.params.get('merge_output_format') is None: + if not compatible_formats(requested_formats): + info_dict['ext'] = 'mkv' + self.report_warning( + 'Requested formats are incompatible for merge and will be merged into mkv') + if (info_dict['ext'] == 
'webm' + and info_dict.get('thumbnails') + # check with type instead of pp_key, __name__, or isinstance + # since we dont want any custom PPs to trigger this + and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): + info_dict['ext'] = 'mkv' + self.report_warning( + 'webm doesn\'t support embedding a thumbnail, mkv will be used') new_ext = info_dict['ext'] def correct_ext(filename, ext=new_ext): @@ -2735,20 +2924,22 @@ def correct_ext(filename, ext=new_ext): dl_filename = existing_file(full_filename, temp_filename) info_dict['__real_download'] = False - _protocols = set(determine_protocol(f) for f in requested_formats) - if len(_protocols) == 1: # All requested formats have same protocol - info_dict['protocol'] = _protocols.pop() - directly_mergable = FFmpegFD.can_merge_formats(info_dict) + downloaded = [] + merger = FFmpegMergerPP(self) + + fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') if dl_filename is not None: self.report_file_already_downloaded(dl_filename) - elif (directly_mergable and get_suitable_downloader( - info_dict, self.params, to_stdout=(temp_filename == '-')) == FFmpegFD): + elif fd: + for f in requested_formats if fd != FFmpegFD else []: + f['filepath'] = fname = prepend_extension( + correct_ext(temp_filename, info_dict['ext']), + 'f%s' % f['format_id'], info_dict['ext']) + downloaded.append(fname) info_dict['url'] = '\n'.join(f['url'] for f in requested_formats) success, real_download = self.dl(temp_filename, info_dict) info_dict['__real_download'] = real_download else: - downloaded = [] - merger = FFmpegMergerPP(self) if self.params.get('allow_unplayable_formats'): self.report_warning( 'You have requested merging of multiple formats ' @@ -2760,7 +2951,7 @@ def correct_ext(filename, ext=new_ext): 'The formats won\'t be merged.') if temp_filename == '-': - reason = ('using a downloader other than ffmpeg' if directly_mergable + reason = ('using a downloader other than ffmpeg' if 
FFmpegFD.can_merge_formats(info_dict, self.params) else 'but the formats are incompatible for simultaneous download' if merger.available else 'but ffmpeg is not installed') self.report_warning( @@ -2777,18 +2968,20 @@ def correct_ext(filename, ext=new_ext): 'f%s' % f['format_id'], new_info['ext']) if not self._ensure_dir_exists(fname): return + f['filepath'] = fname downloaded.append(fname) partial_success, real_download = self.dl(fname, new_info) info_dict['__real_download'] = info_dict['__real_download'] or real_download success = success and partial_success - if merger.available and not self.params.get('allow_unplayable_formats'): - info_dict['__postprocessors'].append(merger) - info_dict['__files_to_merge'] = downloaded - # Even if there were no downloads, it is being merged only now - info_dict['__real_download'] = True - else: - for file in downloaded: - files_to_move[file] = None + + if downloaded and merger.available and not self.params.get('allow_unplayable_formats'): + info_dict['__postprocessors'].append(merger) + info_dict['__files_to_merge'] = downloaded + # Even if there were no downloads, it is being merged only now + info_dict['__real_download'] = True + else: + for file in downloaded: + files_to_move[file] = None else: # Just a single file dl_filename = existing_file(full_filename, temp_filename) @@ -2853,11 +3046,18 @@ def ffmpeg_fixup(cndn, msg, cls): 'writing DASH m4a. 
Only some players support this container', FFmpegFixupM4aPP) - downloader = (get_suitable_downloader(info_dict, self.params).__name__ - if 'protocol' in info_dict else None) - ffmpeg_fixup(downloader == 'HlsFD', 'malformed AAC bitstream detected', FFmpegFixupM3u8PP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP) + downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None + downloader = downloader.__name__ if downloader else None + + if info_dict.get('requested_formats') is None: # Not necessary if doing merger + ffmpeg_fixup(downloader == 'HlsFD', + 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', + FFmpegFixupM3u8PP) + ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', + 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) + + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP) fixup() try: @@ -2879,8 +3079,29 @@ def ffmpeg_fixup(cndn, msg, cls): if max_downloads is not None and self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() + def __download_wrapper(self, func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + res = func(*args, **kwargs) + except UnavailableVideoError as e: + self.report_error(e) + except MaxDownloadsReached as e: + self.to_screen(f'[info] {e}') + raise + except DownloadCancelled as e: + self.to_screen(f'[info] {e}') + if not self.params.get('break_per_url'): + raise + else: + if self.params.get('dump_single_json', False): + self.post_extract(res) + self.to_stdout(json.dumps(self.sanitize_info(res))) + return wrapper + def download(self, url_list): """Download a given list of URLs.""" + url_list = 
variadic(url_list) # Passing a single URL is a common mistake outtmpl = self.outtmpl_dict['default'] if (len(url_list) > 1 and outtmpl != '-' @@ -2889,25 +3110,8 @@ def download(self, url_list): raise SameFileError(outtmpl) for url in url_list: - try: - # It also downloads the videos - res = self.extract_info( - url, force_generic_extractor=self.params.get('force_generic_extractor', False)) - except UnavailableVideoError: - self.report_error('unable to download video') - except MaxDownloadsReached: - self.to_screen('[info] Maximum number of downloads reached') - raise - except ExistingVideoReached: - self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing') - raise - except RejectedVideoReached: - self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject') - raise - else: - if self.params.get('dump_single_json', False): - self.post_extract(res) - self.to_stdout(json.dumps(self.sanitize_info(res))) + self.__download_wrapper(self.extract_info)( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) return self._download_retcode @@ -2918,11 +3122,13 @@ def download_with_info_file(self, info_filename): # FileInput doesn't have a read method, we can't call json.load info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True)) try: - self.process_ie_result(info, download=True) - except (DownloadError, EntryNotInPlaylist, ThrottledDownload): + self.__download_wrapper(self.process_ie_result)(info, download=True) + except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: + if not isinstance(e, EntryNotInPlaylist): + self.to_stderr('\r') webpage_url = info.get('webpage_url') if webpage_url is not None: - self.report_warning('The info failed to download, trying with "%s"' % webpage_url) + self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}') return self.download([webpage_url]) 
else: raise @@ -2935,21 +3141,27 @@ def sanitize_info(info_dict, remove_private_keys=False): return info_dict info_dict.setdefault('epoch', int(time.time())) remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict - keep_keys = ['_type'], # Always keep this to facilitate load-info-json + keep_keys = ['_type'] # Always keep this to facilitate load-info-json if remove_private_keys: remove_keys |= { - 'requested_formats', 'requested_subtitles', 'requested_entries', - 'filepath', 'entries', 'original_url', 'playlist_autonumber', + 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries', + 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', } - empty_values = (None, {}, [], set(), tuple()) reject = lambda k, v: k not in keep_keys and ( - k.startswith('_') or k in remove_keys or v in empty_values) + k.startswith('_') or k in remove_keys or v is None) else: reject = lambda k, v: k in remove_keys - filter_fn = lambda obj: ( - list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set)) - else obj if not isinstance(obj, dict) - else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v))) + + def filter_fn(obj): + if isinstance(obj, dict): + return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)} + elif isinstance(obj, (list, tuple, set, LazyList)): + return list(map(filter_fn, obj)) + elif obj is None or isinstance(obj, (str, int, float, bool)): + return obj + else: + return repr(obj) + return filter_fn(info_dict) @staticmethod @@ -2961,10 +3173,17 @@ def run_pp(self, pp, infodict): files_to_delete = [] if '__files_to_move' not in infodict: infodict['__files_to_move'] = {} - files_to_delete, infodict = pp.run(infodict) + try: + files_to_delete, infodict = pp.run(infodict) + except PostProcessingError as e: + # Must be True and not 'only_download' + if self.params.get('ignoreerrors') is True: + self.report_error(e) + return infodict + raise + if 
not files_to_delete: return infodict - if self.params.get('keepvideo', False): for f in files_to_delete: infodict['__files_to_move'].setdefault(f, '') @@ -3062,34 +3281,34 @@ def record_download_archive(self, info_dict): @staticmethod def format_resolution(format, default='unknown'): - if format.get('vcodec') == 'none': - if format.get('acodec') == 'none': - return 'images' + if format.get('vcodec') == 'none' and format.get('acodec') != 'none': return 'audio only' if format.get('resolution') is not None: return format['resolution'] if format.get('width') and format.get('height'): - res = '%dx%d' % (format['width'], format['height']) + return '%dx%d' % (format['width'], format['height']) elif format.get('height'): - res = '%sp' % format['height'] + return '%sp' % format['height'] elif format.get('width'): - res = '%dx?' % format['width'] - else: - res = default - return res + return '%dx?' % format['width'] + return default def _format_note(self, fdict): res = '' if fdict.get('ext') in ['f4f', 'f4m']: - res += '(unsupported) ' + res += '(unsupported)' if fdict.get('language'): if res: res += ' ' - res += '[%s] ' % fdict['language'] + res += '[%s]' % fdict['language'] if fdict.get('format_note') is not None: - res += fdict['format_note'] + ' ' + if res: + res += ' ' + res += fdict['format_note'] if fdict.get('tbr') is not None: - res += '%4dk ' % fdict['tbr'] + if res: + res += ', ' + res += '%4dk' % fdict['tbr'] if fdict.get('container') is not None: if res: res += ', ' @@ -3134,37 +3353,56 @@ def _format_note(self, fdict): res += '~' + format_bytes(fdict['filesize_approx']) return res + def _list_format_headers(self, *headers): + if self.params.get('listformats_table', True) is not False: + return [self._format_screen(header, self.Styles.HEADERS) for header in headers] + return headers + def list_formats(self, info_dict): + if not info_dict.get('formats') and not info_dict.get('url'): + self.to_screen('%s has no formats' % info_dict['id']) + return + 
self.to_screen('[info] Available formats for %s:' % info_dict['id']) + formats = info_dict.get('formats', [info_dict]) - new_format = ( - 'list-formats' not in self.params.get('compat_opts', []) - and self.params.get('listformats_table', True) is not False) + new_format = self.params.get('listformats_table', True) is not False if new_format: + delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) table = [ [ - format_field(f, 'format_id'), + self._format_screen(format_field(f, 'format_id'), self.Styles.ID), format_field(f, 'ext'), - self.format_resolution(f), - format_field(f, 'fps', '%d'), - '|', - format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes), - format_field(f, 'tbr', '%4dk'), - shorten_protocol_name(f.get('protocol', '').replace("native", "n")), - '|', - format_field(f, 'vcodec', default='unknown').replace('none', ''), - format_field(f, 'vbr', '%4dk'), - format_field(f, 'acodec', default='unknown').replace('none', ''), - format_field(f, 'abr', '%3dk'), - format_field(f, 'asr', '%5dHz'), - ', '.join(filter(None, ( - 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '', + format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), + format_field(f, 'fps', '\t%d'), + format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), + delim, + format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), + format_field(f, 'tbr', '\t%dk'), + shorten_protocol_name(f.get('protocol', '')), + delim, + format_field(f, 'vcodec', default='unknown').replace( + 'none', + 'images' if f.get('acodec') == 'none' + else self._format_screen('audio only', self.Styles.SUPPRESS)), + format_field(f, 'vbr', '\t%dk'), + format_field(f, 'acodec', default='unknown').replace( + 'none', + '' if f.get('vcodec') == 'none' + else self._format_screen('video only', self.Styles.SUPPRESS)), + 
format_field(f, 'abr', '\t%dk'), + format_field(f, 'asr', '\t%dHz'), + join_nonempty( + self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, format_field(f, 'language', '[%s]'), - format_field(f, 'format_note'), - format_field(f, 'container', ignore=(None, f.get('ext'))), - ))), + join_nonempty( + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + delim=', '), + delim=' '), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] - header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', '|', ' FILESIZE', ' TBR', 'PROTO', - '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO'] + header_line = self._list_format_headers( + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO', + delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') else: table = [ [ @@ -3176,10 +3414,11 @@ def list_formats(self, info_dict): if f.get('preference') is None or f['preference'] >= -1000] header_line = ['format code', 'extension', 'resolution', 'note'] - self.to_screen( - '[info] Available formats for %s:' % info_dict['id']) self.to_stdout(render_table( - header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format)) + header_line, table, + extra_gap=(0 if new_format else 1), + hide_empty=new_format, + delim=new_format and self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True))) def list_thumbnails(self, info_dict): thumbnails = list(info_dict.get('thumbnails')) @@ -3190,7 +3429,7 @@ def list_thumbnails(self, info_dict): self.to_screen( '[info] Thumbnails for %s:' % info_dict['id']) self.to_stdout(render_table( - ['ID', 'width', 'height', 'URL'], + self._list_format_headers('ID', 'Width', 'Height', 'URL'), [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) def list_subtitles(self, video_id, subtitles, name='subtitles'): @@ -3207,9 +3446,9 @@ def 
_row(lang, formats): return [lang, ', '.join(names), ', '.join(exts)] self.to_stdout(render_table( - ['Language', 'Name', 'Formats'], + self._list_format_headers('Language', 'Name', 'Formats'), [_row(lang, formats) for lang, formats in subtitles.items()], - hideEmpty=True)) + hide_empty=True)) def urlopen(self, req): """ Start an HTTP download """ @@ -3221,44 +3460,60 @@ def print_debug_header(self): if not self.params.get('verbose'): return - stdout_encoding = getattr( - sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__) - encoding_str = ( - '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( - locale.getpreferredencoding(), - sys.getfilesystemencoding(), - stdout_encoding, - self.get_encoding())) - write_string(encoding_str, encoding=None) + def get_encoding(stream): + ret = getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__) + if not supports_terminal_sequences(stream): + from .compat import WINDOWS_VT_MODE + ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' + return ret - source = ( - '(exe)' if hasattr(sys, 'frozen') - else '(zip)' if isinstance(globals().get('__loader__'), zipimporter) - else '(source)' if os.path.basename(sys.argv[0]) == '__main__.py' - else '') - self._write_string('[debug] yt-dlp version %s %s\n' % (__version__, source)) - if _LAZY_LOADER: - self._write_string('[debug] Lazy loading extractors enabled\n') - if _PLUGIN_CLASSES: - self._write_string( - '[debug] Plugin Extractors: %s\n' % [ie.ie_key() for ie in _PLUGIN_CLASSES]) + encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + get_encoding(self._screen_file), get_encoding(self._err_file), + self.get_encoding()) + + logger = self.params.get('logger') + if logger: + write_debug = lambda msg: logger.debug(f'[debug] {msg}') + write_debug(encoding_str) + else: + write_string(f'[debug] {encoding_str}\n', encoding=None) + write_debug = lambda msg: 
self._write_string(f'[debug] {msg}\n') + + source = detect_variant() + write_debug(join_nonempty( + 'yt-dlp version', __version__, + f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', + '' if source == 'unknown' else f'({source})', + delim=' ')) + if not _LAZY_LOADER: + if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + write_debug('Lazy loading extractors is forcibly disabled') + else: + write_debug('Lazy loading extractors is disabled') + if plugin_extractors or plugin_postprocessors: + write_debug('Plugins: %s' % [ + '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') + for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) if self.params.get('compat_opts'): - self._write_string( - '[debug] Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts'))) - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = process_communicate_or_kill(sp) - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - self._write_string('[debug] Git HEAD: %s\n' % out) - except Exception: + write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts'))) + + if source == 'source': try: - sys.exc_clear() + sp = Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate_or_kill() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_debug('Git HEAD: %s' % out) except Exception: - pass + try: + sys.exc_clear() + except Exception: + pass def python_implementation(): impl_name = platform.python_implementation() @@ -3266,44 +3521,47 @@ def python_implementation(): return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] return impl_name - self._write_string('[debug] Python version %s (%s %s) - %s\n' % ( + 
write_debug('Python version %s (%s %s) - %s' % ( platform.python_version(), python_implementation(), platform.architecture()[0], platform_name())) - exe_versions = FFmpegPostProcessor.get_versions(self) + exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) + ffmpeg_features = {key for key, val in ffmpeg_features.items() if val} + if ffmpeg_features: + exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features) + exe_versions['rtmpdump'] = rtmpdump_version() exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v ) or 'none' - self._write_string('[debug] exe versions: %s\n' % exe_str) + write_debug('exe versions: %s' % exe_str) - from .downloader.fragment import can_decrypt_frag from .downloader.websocket import has_websockets from .postprocessor.embedthumbnail import has_mutagen - from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE + from .cookies import SQLITE_AVAILABLE, SECRETSTORAGE_AVAILABLE - lib_str = ', '.join(sorted(filter(None, ( - can_decrypt_frag and 'pycryptodome', - has_websockets and 'websockets', + lib_str = join_nonempty( + compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], + SECRETSTORAGE_AVAILABLE and 'secretstorage', has_mutagen and 'mutagen', SQLITE_AVAILABLE and 'sqlite', - KEYRING_AVAILABLE and 'keyring', - )))) or 'none' - self._write_string('[debug] Optional libraries: %s\n' % lib_str) + has_websockets and 'websockets', + delim=', ') or 'none' + write_debug('Optional libraries: %s' % lib_str) proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') + write_debug(f'Proxy map: {proxy_map}') - if self.params.get('call_home', False): + # Not implemented + if False and self.params.get('call_home'): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - 
self._write_string('[debug] Public IP address: %s\n' % ipaddr) - return + write_debug('Public IP address: %s' % ipaddr) latest_version = self.urlopen( 'https://yt-dl.org/latest/version').read().decode('utf-8') if version_tuple(latest_version) > version_tuple(__version__): @@ -3314,7 +3572,7 @@ def python_implementation(): def _setup_opener(self): timeout_val = self.params.get('socket_timeout') - self._socket_timeout = 600 if timeout_val is None else float(timeout_val) + self._socket_timeout = 20 if timeout_val is None else float(timeout_val) opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser') opts_cookiefile = self.params.get('cookiefile') @@ -3376,39 +3634,137 @@ def get_encoding(self): encoding = preferredencoding() return encoding - def _write_thumbnails(self, info_dict, filename): # return the extensions + def _write_info_json(self, label, ie_result, infofn, overwrite=None): + ''' Write infojson and returns True = written, False = skip, None = error ''' + if overwrite is None: + overwrite = self.params.get('overwrites', True) + if not self.params.get('writeinfojson'): + return False + elif not infofn: + self.write_debug(f'Skipping writing {label} infojson') + return False + elif not self._ensure_dir_exists(infofn): + return None + elif not overwrite and os.path.exists(infofn): + self.to_screen(f'[info] {label.title()} metadata is already present') + else: + self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}') + try: + write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) + except (OSError, IOError): + self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') + return None + return True + + def _write_description(self, label, ie_result, descfn): + ''' Write description and returns True = written, False = skip, None = error ''' + if not self.params.get('writedescription'): + return False + elif not descfn: + self.write_debug(f'Skipping writing {label} description') + return 
False + elif not self._ensure_dir_exists(descfn): + return None + elif not self.params.get('overwrites', True) and os.path.exists(descfn): + self.to_screen(f'[info] {label.title()} description is already present') + elif ie_result.get('description') is None: + self.report_warning(f'There\'s no {label} description to write') + return False + else: + try: + self.to_screen(f'[info] Writing {label} description to: {descfn}') + with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + descfile.write(ie_result['description']) + except (OSError, IOError): + self.report_error(f'Cannot write {label} description file {descfn}') + return None + return True + + def _write_subtitles(self, info_dict, filename): + ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error''' + ret = [] + subtitles = info_dict.get('requested_subtitles') + if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')): + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + return ret + + sub_filename_base = self.prepare_filename(info_dict, 'subtitle') + if not sub_filename_base: + self.to_screen('[info] Skipping writing video subtitles') + return ret + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) + sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext')) + if not self.params.get('overwrites', True) and os.path.exists(sub_filename): + self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present') + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + continue + + self.to_screen(f'[info] Writing video subtitles to: {sub_filename}') + if sub_info.get('data') is not None: + try: + # Use newline='' to 
prevent conversion of newline characters + # See https://github.com/ytdl-org/youtube-dl/issues/10268 + with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + continue + except (OSError, IOError): + self.report_error(f'Cannot write video subtitles file {sub_filename}') + return None + + try: + sub_copy = sub_info.copy() + sub_copy.setdefault('http_headers', info_dict.get('http_headers')) + self.dl(sub_filename, sub_copy, subtitle=True) + sub_info['filepath'] = sub_filename + ret.append((sub_filename, sub_filename_final)) + except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err: + self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}') + continue + return ret + + def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): + ''' Write thumbnails to file and return list of (thumb_filename, final_thumb_filename) ''' write_all = self.params.get('write_all_thumbnails', False) - thumbnails = [] + thumbnails, ret = [], [] if write_all or self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') or [] multiple = write_all and len(thumbnails) > 1 - ret = [] - for t in thumbnails[::-1]: - thumb_ext = determine_ext(t['url'], 'jpg') - suffix = '%s.' % t['id'] if multiple else '' - thumb_display_id = '%s ' % t['id'] if multiple else '' - thumb_filename = replace_extension(filename, suffix + thumb_ext, info_dict.get('ext')) + if thumb_filename_base is None: + thumb_filename_base = filename + if thumbnails and not thumb_filename_base: + self.write_debug(f'Skipping writing {label} thumbnail') + return ret - if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)): - ret.append(suffix + thumb_ext) + for idx, t in list(enumerate(thumbnails))[::-1]: + thumb_ext = (f'{t["id"]}.' 
if multiple else '') + determine_ext(t['url'], 'jpg') + thumb_display_id = f'{label} thumbnail {t["id"]}' + thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) + thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) + + if not self.params.get('overwrites', True) and os.path.exists(thumb_filename): + ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename - self.to_screen('[%s] %s: Thumbnail %sis already present' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) + self.to_screen('[info] %s is already present' % ( + thumb_display_id if multiple else f'{label} thumbnail').capitalize()) else: - self.to_screen('[%s] %s: Downloading thumbnail %s ...' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) + self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: uf = self.urlopen(t['url']) + self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) - ret.append(suffix + thumb_ext) - self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % - (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) + ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: - self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], error_to_compat_str(err))) + thumbnails.pop(idx) + self.report_warning(f'Unable to download {thumb_display_id}: {err}') if ret and not write_all: break return ret diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 58e8ea5d93..a03961c1b2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # coding: utf-8 -from __future__ import unicode_literals +f'You are using an unsupported version of Python. 
Only Python versions 3.6 and above are supported by yt-dlp' # noqa: F541 __license__ = 'Public Domain' @@ -13,28 +13,30 @@ import re import sys - from .options import ( parseOpts, ) from .compat import ( compat_getpass, + compat_os_name, compat_shlex_quote, workaround_optparse_bug9161, ) -from .cookies import SUPPORTED_BROWSERS +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .utils import ( DateRange, decodeOption, + DownloadCancelled, DownloadError, error_to_compat_str, - ExistingVideoReached, expand_path, + GeoUtils, + float_or_none, + int_or_none, match_filter_func, - MaxDownloadsReached, + parse_duration, preferredencoding, read_batch_urls, - RejectedVideoReached, render_table, SameFileError, setproctitle, @@ -71,7 +73,7 @@ def _real_main(argv=None): setproctitle('yt-dlp') parser, opts, args = parseOpts(argv) - warnings = [] + warnings, deprecation_warnings = [], [] # Set user agent if opts.user_agent is not None: @@ -94,6 +96,8 @@ def _real_main(argv=None): if opts.batchfile is not None: try: if opts.batchfile == '-': + write_string('Reading URLs from stdin - EOF (%s) to end:\n' % ( + 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D')) batchfd = sys.stdin else: batchfd = io.open( @@ -122,10 +126,10 @@ def _real_main(argv=None): desc = getattr(ie, 'IE_DESC', ie.IE_NAME) if desc is False: continue - if hasattr(ie, 'SEARCH_KEY'): + if getattr(ie, 'SEARCH_KEY', None) is not None: _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') _COUNTS = ('', '5', '10', 'all') - desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) + desc += f'; "{ie.SEARCH_KEY}:" prefix (Example: "{ie.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(_SEARCHES)}")' write_string(desc + '\n', out=sys.stdout) sys.exit(0) if opts.ap_list_mso: @@ -134,6 +138,11 @@ def _real_main(argv=None): sys.exit(0) # Conflicting, missing and erroneous 
options + if opts.format == 'best': + warnings.append('.\n '.join(( + '"-f best" selects the best pre-merged format which is often not the best option', + 'To let yt-dlp download and merge the best available formats, simply do not pass any format selection', + 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error('using .netrc conflicts with giving username/password') if opts.password is not None and opts.username is None: @@ -193,7 +202,14 @@ def _real_main(argv=None): if opts.overwrites: # --yes-overwrites implies --no-continue opts.continue_dl = False if opts.concurrent_fragment_downloads <= 0: - raise ValueError('Concurrent fragments must be positive') + parser.error('Concurrent fragments must be positive') + if opts.wait_for_video is not None: + min_wait, max_wait, *_ = map(parse_duration, opts.wait_for_video.split('-', 1) + [None]) + if min_wait is None or (max_wait is None and '-' in opts.wait_for_video): + parser.error('Invalid time range to wait') + elif max_wait is not None and max_wait < min_wait: + parser.error('Minimum time range to wait must not be longer than the maximum') + opts.wait_for_video = (min_wait, max_wait) def parse_retries(retries, name=''): if retries in ('inf', 'infinite'): @@ -206,6 +222,8 @@ def parse_retries(retries, name=''): return parsed_retries if opts.retries is not None: opts.retries = parse_retries(opts.retries) + if opts.file_access_retries is not None: + opts.file_access_retries = parse_retries(opts.file_access_retries, 'file access ') if opts.fragment_retries is not None: opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ') if opts.extractor_retries is not None: @@ -221,15 +239,17 @@ def parse_retries(retries, name=''): parser.error('invalid http chunk size specified') opts.http_chunk_size = numeric_chunksize if opts.playliststart <= 0: - raise 
ValueError('Playlist start must be positive') + raise parser.error('Playlist start must be positive') if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: - raise ValueError('Playlist end must be greater than playlist start') + raise parser.error('Playlist end must be greater than playlist start') if opts.extractaudio: + opts.audioformat = opts.audioformat.lower() if opts.audioformat not in ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS): parser.error('invalid audio format specified') if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') - if not opts.audioquality.isdigit(): + audioquality = int_or_none(float_or_none(opts.audioquality)) # int_or_none prevents inf, nan + if audioquality is None or audioquality < 0: parser.error('invalid audio quality specified') if opts.recodevideo is not None: opts.recodevideo = opts.recodevideo.replace(' ', '') @@ -245,12 +265,27 @@ def parse_retries(retries, name=''): if opts.convertthumbnails is not None: if opts.convertthumbnails not in FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS: parser.error('invalid thumbnail format specified') - if opts.cookiesfrombrowser is not None: - opts.cookiesfrombrowser = [ - part.strip() or None for part in opts.cookiesfrombrowser.split(':', 1)] - if opts.cookiesfrombrowser[0] not in SUPPORTED_BROWSERS: - parser.error('unsupported browser specified for cookies') + mobj = re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser) + if mobj is None: + parser.error(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}') + browser_name, keyring, profile = mobj.group('name', 'keyring', 'profile') + browser_name = browser_name.lower() + if browser_name not in SUPPORTED_BROWSERS: + parser.error(f'unsupported browser specified for cookies: "{browser_name}". 
' + f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}') + if keyring is not None: + keyring = keyring.upper() + if keyring not in SUPPORTED_KEYRINGS: + parser.error(f'unsupported keyring specified for cookies: "{keyring}". ' + f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') + opts.cookiesfrombrowser = (browser_name, profile, keyring) + geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country + if geo_bypass_code is not None: + try: + GeoUtils.random_ipv4(geo_bypass_code) + except Exception: + parser.error('unsupported geo-bypass country or ip-block') if opts.date is not None: date = DateRange.day(opts.date) @@ -259,6 +294,9 @@ def parse_retries(retries, name=''): compat_opts = opts.compat_opts + def report_conflict(arg1, arg2): + warnings.append(f'{arg2} is ignored since {arg1} was given') + def _unused_compat_opt(name): if name not in compat_opts: return False @@ -280,9 +318,14 @@ def set_default_compat(compat_name, opt_name, default=True, remove_compat=True): setattr(opts, opt_name, default) return None - set_default_compat('abort-on-error', 'ignoreerrors') + set_default_compat('abort-on-error', 'ignoreerrors', 'only_download') set_default_compat('no-playlist-metafiles', 'allow_playlist_files') set_default_compat('no-clean-infojson', 'clean_infojson') + if 'no-attach-info-json' in compat_opts: + if opts.embed_infojson: + _unused_compat_opt('no-attach-info-json') + else: + opts.embed_infojson = False if 'format-sort' in compat_opts: opts.format_sort.extend(InfoExtractor.FormatSort.ytdl_default) _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False) @@ -290,10 +333,14 @@ def set_default_compat(compat_name, opt_name, default=True, remove_compat=True): if _video_multistreams_set is False and _audio_multistreams_set is False: _unused_compat_opt('multistreams') outtmpl_default = opts.outtmpl.get('default') + if opts.useid: + if outtmpl_default is None: + 
outtmpl_default = opts.outtmpl['default'] = '%(id)s.%(ext)s' + else: + report_conflict('--output', '--id') if 'filename' in compat_opts: if outtmpl_default is None: - outtmpl_default = '%(title)s-%(id)s.%(ext)s' - opts.outtmpl.update({'default': outtmpl_default}) + outtmpl_default = opts.outtmpl['default'] = '%(title)s-%(id)s.%(ext)s' else: _unused_compat_opt('filename') @@ -303,10 +350,14 @@ def validate_outtmpl(tmpl, msg): parser.error('invalid %s %r: %s' % (msg, tmpl, error_to_compat_str(err))) for k, tmpl in opts.outtmpl.items(): - validate_outtmpl(tmpl, '%s output template' % k) + validate_outtmpl(tmpl, f'{k} output template') opts.forceprint = opts.forceprint or [] for tmpl in opts.forceprint or []: validate_outtmpl(tmpl, 'print template') + validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title') + for k, tmpl in opts.progress_template.items(): + k = f'{k[:-6]} console title' if '-title' in k else f'{k} progress' + validate_outtmpl(tmpl, f'{k} template') if opts.extractaudio and not opts.keepvideo and opts.format is None: opts.format = 'bestaudio/best' @@ -353,47 +404,67 @@ def metadataparser_actions(f): if opts.getcomments and not printing_json: opts.writeinfojson = True - def report_conflict(arg1, arg2): - warnings.append('%s is ignored since %s was given' % (arg2, arg1)) + if opts.no_sponsorblock: + opts.sponsorblock_mark = set() + opts.sponsorblock_remove = set() + sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove - if opts.remuxvideo and opts.recodevideo: - report_conflict('--recode-video', '--remux-video') - opts.remuxvideo = False + opts.remove_chapters = opts.remove_chapters or [] + + if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False: + if opts.sponskrub: + if opts.remove_chapters: + report_conflict('--remove-chapters', '--sponskrub') + if opts.sponsorblock_mark: + report_conflict('--sponsorblock-mark', '--sponskrub') + if opts.sponsorblock_remove: + 
report_conflict('--sponsorblock-remove', '--sponskrub') + opts.sponskrub = False if opts.sponskrub_cut and opts.split_chapters and opts.sponskrub is not False: report_conflict('--split-chapter', '--sponskrub-cut') opts.sponskrub_cut = False + if opts.remuxvideo and opts.recodevideo: + report_conflict('--recode-video', '--remux-video') + opts.remuxvideo = False + if opts.allow_unplayable_formats: - if opts.extractaudio: - report_conflict('--allow-unplayable-formats', '--extract-audio') - opts.extractaudio = False - if opts.remuxvideo: - report_conflict('--allow-unplayable-formats', '--remux-video') - opts.remuxvideo = False - if opts.recodevideo: - report_conflict('--allow-unplayable-formats', '--recode-video') - opts.recodevideo = False - if opts.addmetadata: - report_conflict('--allow-unplayable-formats', '--add-metadata') - opts.addmetadata = False - if opts.embedsubtitles: - report_conflict('--allow-unplayable-formats', '--embed-subs') - opts.embedsubtitles = False - if opts.embedthumbnail: - report_conflict('--allow-unplayable-formats', '--embed-thumbnail') - opts.embedthumbnail = False - if opts.xattrs: - report_conflict('--allow-unplayable-formats', '--xattrs') - opts.xattrs = False - if opts.fixup and opts.fixup.lower() not in ('never', 'ignore'): - report_conflict('--allow-unplayable-formats', '--fixup') + def report_unplayable_conflict(opt_name, arg, default=False, allowed=None): + val = getattr(opts, opt_name) + if (not allowed and val) or (allowed and not allowed(val)): + report_conflict('--allow-unplayable-formats', arg) + setattr(opts, opt_name, default) + + report_unplayable_conflict('extractaudio', '--extract-audio') + report_unplayable_conflict('remuxvideo', '--remux-video') + report_unplayable_conflict('recodevideo', '--recode-video') + report_unplayable_conflict('addmetadata', '--embed-metadata') + report_unplayable_conflict('addchapters', '--embed-chapters') + report_unplayable_conflict('embed_infojson', '--embed-info-json') + opts.embed_infojson 
= False + report_unplayable_conflict('embedsubtitles', '--embed-subs') + report_unplayable_conflict('embedthumbnail', '--embed-thumbnail') + report_unplayable_conflict('xattrs', '--xattrs') + report_unplayable_conflict('fixup', '--fixup', default='never', allowed=lambda x: x in (None, 'never', 'ignore')) opts.fixup = 'never' - if opts.sponskrub: - report_conflict('--allow-unplayable-formats', '--sponskrub') + report_unplayable_conflict('remove_chapters', '--remove-chapters', default=[]) + report_unplayable_conflict('sponsorblock_remove', '--sponsorblock-remove', default=set()) + report_unplayable_conflict('sponskrub', '--sponskrub', default=set()) opts.sponskrub = False + if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: + opts.addchapters = True + # PostProcessors - postprocessors = [] + postprocessors = list(opts.add_postprocessors) + if sponsorblock_query: + postprocessors.append({ + 'key': 'SponsorBlock', + 'categories': sponsorblock_query, + 'api': opts.sponsorblock_api, + # Run this immediately after extraction is complete + 'when': 'pre_process' + }) if opts.parse_metadata: postprocessors.append({ 'key': 'MetadataParser', @@ -439,16 +510,7 @@ def report_conflict(arg1, arg2): 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, }) - # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and - # FFmpegExtractAudioPP as containers before conversion may not support - # metadata (3gp, webm, etc.) - # And this post-processor should be placed before other metadata - # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of - # extra metadata. By default ffmpeg preserves metadata applicable for both - # source and target containers. From this point the container won't change, - # so metadata can be added here. - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) + # If ModifyChapters is going to remove chapters, subtitles must already be in the container. 
if opts.embedsubtitles: already_have_subtitle = opts.writesubtitles and 'no-keep-subs' not in compat_opts postprocessors.append({ @@ -462,6 +524,44 @@ def report_conflict(arg1, arg2): # this was the old behaviour if only --all-sub was given. if opts.allsubtitles and not opts.writeautomaticsub: opts.writesubtitles = True + # ModifyChapters must run before FFmpegMetadataPP + remove_chapters_patterns, remove_ranges = [], [] + for regex in opts.remove_chapters: + if regex.startswith('*'): + dur = list(map(parse_duration, regex[1:].split('-'))) + if len(dur) == 2 and all(t is not None for t in dur): + remove_ranges.append(tuple(dur)) + continue + parser.error(f'invalid --remove-chapters time range {regex!r}. Must be of the form *start-end') + try: + remove_chapters_patterns.append(re.compile(regex)) + except re.error as err: + parser.error(f'invalid --remove-chapters regex {regex!r} - {err}') + if opts.remove_chapters or sponsorblock_query: + postprocessors.append({ + 'key': 'ModifyChapters', + 'remove_chapters_patterns': remove_chapters_patterns, + 'remove_sponsor_segments': opts.sponsorblock_remove, + 'remove_ranges': remove_ranges, + 'sponsorblock_chapter_title': opts.sponsorblock_chapter_title, + 'force_keyframes': opts.force_keyframes_at_cuts + }) + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. 
+ if opts.addmetadata or opts.addchapters or opts.embed_infojson: + if opts.embed_infojson is None: + opts.embed_infojson = 'if_exists' + postprocessors.append({ + 'key': 'FFmpegMetadata', + 'add_chapters': opts.addchapters, + 'add_metadata': opts.addmetadata, + 'add_infojson': opts.embed_infojson, + }) + # Deprecated # This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment # but must be below EmbedSubtitle and FFmpegMetadata # See https://github.com/yt-dlp/yt-dlp/issues/204 , https://github.com/faissaloo/SponSkrub/issues/29 @@ -474,18 +574,22 @@ def report_conflict(arg1, arg2): 'cut': opts.sponskrub_cut, 'force': opts.sponskrub_force, 'ignoreerror': opts.sponskrub is None, + '_from_cli': True, }) if opts.embedthumbnail: - already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails postprocessors.append({ 'key': 'EmbedThumbnail', # already_have_thumbnail = True prevents the file from being deleted after embedding - 'already_have_thumbnail': already_have_thumbnail + 'already_have_thumbnail': opts.writethumbnail }) - if not already_have_thumbnail: + if not opts.writethumbnail: opts.writethumbnail = True + opts.outtmpl['pl_thumbnail'] = '' if opts.split_chapters: - postprocessors.append({'key': 'FFmpegSplitChapters'}) + postprocessors.append({ + 'key': 'FFmpegSplitChapters', + 'force_keyframes': opts.force_keyframes_at_cuts, + }) # XAttrMetadataPP should be run after post-processors that may change file contents if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) @@ -509,6 +613,19 @@ def report_args_compat(arg, name): opts.postprocessor_args.setdefault('sponskrub', []) opts.postprocessor_args['default'] = opts.postprocessor_args['default-compat'] + def report_deprecation(val, old, new=None): + if not val: + return + deprecation_warnings.append( + f'{old} is deprecated and may be removed in a future version. 
Use {new} instead' if new + else f'{old} is deprecated and may not work as expected') + + report_deprecation(opts.sponskrub, '--sponskrub', '--sponsorblock-mark or --sponsorblock-remove') + report_deprecation(not opts.prefer_ffmpeg, '--prefer-avconv', 'ffmpeg') + report_deprecation(opts.include_ads, '--include-ads') + # report_deprecation(opts.call_home, '--call-home') # We may re-implement this in future + # report_deprecation(opts.writeannotations, '--write-annotations') # It's just that no website has it + final_ext = ( opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS else opts.remuxvideo if opts.remuxvideo in FFmpegVideoRemuxerPP.SUPPORTED_EXTS @@ -521,6 +638,7 @@ def report_args_compat(arg, name): ydl_opts = { 'usenetrc': opts.usenetrc, + 'netrc_location': opts.netrc_location, 'username': opts.username, 'password': opts.password, 'twofactor': opts.twofactor, @@ -567,6 +685,7 @@ def report_args_compat(arg, name): 'throttledratelimit': opts.throttledratelimit, 'overwrites': opts.overwrites, 'retries': opts.retries, + 'file_access_retries': opts.file_access_retries, 'fragment_retries': opts.fragment_retries, 'extractor_retries': opts.extractor_retries, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, @@ -576,8 +695,9 @@ def report_args_compat(arg, name): 'noresizebuffer': opts.noresizebuffer, 'http_chunk_size': opts.http_chunk_size, 'continuedl': opts.continue_dl, - 'noprogress': opts.noprogress, + 'noprogress': opts.quiet if opts.noprogress is None else opts.noprogress, 'progress_with_newline': opts.progress_with_newline, + 'progress_template': opts.progress_template, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, @@ -593,8 +713,8 @@ def report_args_compat(arg, name): 'allow_playlist_files': opts.allow_playlist_files, 'clean_infojson': opts.clean_infojson, 'getcomments': opts.getcomments, - 'writethumbnail': opts.writethumbnail, - 'write_all_thumbnails': 
opts.write_all_thumbnails, + 'writethumbnail': opts.writethumbnail is True, + 'write_all_thumbnails': opts.writethumbnail == 'all', 'writelink': opts.writelink, 'writeurllink': opts.writeurllink, 'writewebloclink': opts.writewebloclink, @@ -626,6 +746,7 @@ def report_args_compat(arg, name): 'download_archive': download_archive_fn, 'break_on_existing': opts.break_on_existing, 'break_on_reject': opts.break_on_reject, + 'break_per_url': opts.break_per_url, 'skip_playlist_after_errors': opts.skip_playlist_after_errors, 'cookiefile': opts.cookiefile, 'cookiesfrombrowser': opts.cookiesfrombrowser, @@ -644,6 +765,8 @@ def report_args_compat(arg, name): 'youtube_include_hls_manifest': opts.youtube_include_hls_manifest, 'encoding': opts.encoding, 'extract_flat': opts.extract_flat, + 'live_from_start': opts.live_from_start, + 'wait_for_video': opts.wait_for_video, 'mark_watched': opts.mark_watched, 'merge_output_format': opts.merge_output_format, 'final_ext': final_ext, @@ -672,16 +795,13 @@ def report_args_compat(arg, name): 'geo_bypass': opts.geo_bypass, 'geo_bypass_country': opts.geo_bypass_country, 'geo_bypass_ip_block': opts.geo_bypass_ip_block, - 'warnings': warnings, + '_warnings': warnings, + '_deprecation_warnings': deprecation_warnings, 'compat_opts': compat_opts, - # just for deprecation check - 'autonumber': opts.autonumber or None, - 'usetitle': opts.usetitle or None, - 'useid': opts.useid or None, } with YoutubeDL(ydl_opts) as ydl: - actual_use = len(all_urls) or opts.load_info_filename + actual_use = all_urls or opts.load_info_filename # Remove cache dir if opts.rm_cachedir: @@ -710,7 +830,7 @@ def report_args_compat(arg, name): retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: retcode = ydl.download(all_urls) - except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached): + except DownloadCancelled: ydl.to_screen('Aborting remaining downloads') retcode = 101 @@ -722,15 +842,15 @@ def main(argv=None): 
_real_main(argv) except DownloadError: sys.exit(1) - except SameFileError: - sys.exit('ERROR: fixed output name but more than one file to download') + except SameFileError as e: + sys.exit(f'ERROR: {e}') except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') - except BrokenPipeError: + except BrokenPipeError as e: # https://docs.python.org/3/library/signal.html#note-on-sigpipe devnull = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull, sys.stdout.fileno()) - sys.exit(r'\nERROR: {err}') + sys.exit(f'\nERROR: {e}') __all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index 461bb6d413..8503e3dfd6 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -2,36 +2,110 @@ from math import ceil -from .compat import compat_b64decode +from .compat import compat_b64decode, compat_pycrypto_AES from .utils import bytes_to_intlist, intlist_to_bytes + +if compat_pycrypto_AES: + def aes_cbc_decrypt_bytes(data, key, iv): + """ Decrypt bytes with AES-CBC using pycryptodome """ + return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_CBC, iv).decrypt(data) + + def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): + """ Decrypt bytes with AES-GCM using pycryptodome """ + return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) + +else: + def aes_cbc_decrypt_bytes(data, key, iv): + """ Decrypt bytes with AES-CBC using native implementation since pycryptodome is unavailable """ + return intlist_to_bytes(aes_cbc_decrypt(*map(bytes_to_intlist, (data, key, iv)))) + + def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): + """ Decrypt bytes with AES-GCM using native implementation since pycryptodome is unavailable """ + return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce)))) + + BLOCK_SIZE_BYTES = 16 -def aes_ctr_decrypt(data, key, counter): +def aes_ecb_encrypt(data, key, iv=None): """ - Decrypt with aes in counter 
mode + Encrypt with aes in ECB mode - @param {int[]} data cipher + @param {int[]} data cleartext @param {int[]} key 16/24/32-Byte cipher key - @param {instance} counter Instance whose next_value function (@returns {int[]} 16-Byte block) - returns the next counter block + @param {int[]} iv Unused for this mode + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_encrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_ecb_decrypt(data, key, iv=None): + """ + Decrypt with aes in ECB mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode @returns {int[]} decrypted data """ expanded_key = key_expansion(key) block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - decrypted_data = [] + encrypted_data = [] for i in range(block_count): - counter_block = counter.next_value() + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_decrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_ctr_decrypt(data, key, iv): + """ + Decrypt with aes in counter mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} decrypted data + """ + return aes_ctr_encrypt(data, key, iv) + + +def aes_ctr_encrypt(data, key, iv): + """ + Encrypt with aes in counter mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte initialization vector + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + counter = iter_vector(iv) 
+ + encrypted_data = [] + for i in range(block_count): + counter_block = next(counter) block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] block += [0] * (BLOCK_SIZE_BYTES - len(block)) cipher_counter_block = aes_encrypt(counter_block, expanded_key) - decrypted_data += xor(block, cipher_counter_block) - decrypted_data = decrypted_data[:len(data)] + encrypted_data += xor(block, cipher_counter_block) + encrypted_data = encrypted_data[:len(data)] - return decrypted_data + return encrypted_data def aes_cbc_decrypt(data, key, iv): @@ -88,39 +162,47 @@ def aes_cbc_encrypt(data, key, iv): return encrypted_data -def key_expansion(data): +def aes_gcm_decrypt_and_verify(data, key, tag, nonce): """ - Generate key schedule + Decrypt with aes in GBM mode and checks authenticity using tag - @param {int[]} data 16/24/32-Byte cipher key - @returns {int[]} 176/208/240-Byte expanded key + @param {int[]} data cipher + @param {int[]} key 16-Byte cipher key + @param {int[]} tag authentication tag + @param {int[]} nonce IV (recommended 12-Byte) + @returns {int[]} decrypted data """ - data = data[:] # copy - rcon_iteration = 1 - key_size_bytes = len(data) - expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES - while len(data) < expanded_key_size_bytes: - temp = data[-4:] - temp = key_schedule_core(temp, rcon_iteration) - rcon_iteration += 1 - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + # XXX: check aes, gcm param - for _ in range(3): - temp = data[-4:] - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + hash_subkey = aes_encrypt([0] * BLOCK_SIZE_BYTES, key_expansion(key)) - if key_size_bytes == 32: - temp = data[-4:] - temp = sub_bytes(temp) - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + if len(nonce) == 12: + j0 = nonce + [0, 0, 0, 1] + else: + fill = (BLOCK_SIZE_BYTES - (len(nonce) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES + 8 + ghash_in = nonce + [0] * fill + bytes_to_intlist((8 * len(nonce)).to_bytes(8, 
'big')) + j0 = ghash(hash_subkey, ghash_in) - for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): - temp = data[-4:] - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - data = data[:expanded_key_size_bytes] + # TODO: add nonce support to aes_ctr_decrypt - return data + # nonce_ctr = j0[:12] + iv_ctr = inc(j0) + + decrypted_data = aes_ctr_decrypt(data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr))) + pad_len = len(data) // 16 * 16 + s_tag = ghash( + hash_subkey, + data + + [0] * (BLOCK_SIZE_BYTES - len(data) + pad_len) # pad + + bytes_to_intlist((0 * 8).to_bytes(8, 'big') # length of associated data + + ((len(data) * 8).to_bytes(8, 'big'))) # length of data + ) + + if tag != aes_ctr_encrypt(s_tag, key, j0): + raise ValueError("Mismatching authentication tag") + + return decrypted_data def aes_encrypt(data, expanded_key): @@ -138,7 +220,7 @@ def aes_encrypt(data, expanded_key): data = sub_bytes(data) data = shift_rows(data) if i != rounds: - data = mix_columns(data) + data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX)) data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) return data @@ -157,7 +239,7 @@ def aes_decrypt(data, expanded_key): for i in range(rounds, 0, -1): data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) if i != rounds: - data = mix_columns_inv(data) + data = list(iter_mix_columns(data, MIX_COLUMN_MATRIX_INV)) data = shift_rows_inv(data) data = sub_bytes_inv(data) data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) @@ -189,15 +271,7 @@ def aes_decrypt_text(data, password, key_size_bytes): nonce = data[:NONCE_LENGTH_BYTES] cipher = data[NONCE_LENGTH_BYTES:] - class Counter(object): - __value = nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) - - def next_value(self): - temp = self.__value - self.__value = inc(self.__value) - return temp - - decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) + decrypted_data = aes_ctr_decrypt(cipher, 
key, nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)) plaintext = intlist_to_bytes(decrypted_data) return plaintext @@ -278,6 +352,47 @@ def next_value(self): 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + + +def iter_vector(iv): + while True: + yield iv + iv = inc(iv) + + def sub_bytes(data): return [SBOX[x] for x in data] @@ -302,48 +417,36 @@ def xor(data1, data2): return [x ^ y for x, y in zip(data1, data2)] -def rijndael_mul(a, b): - if(a == 0 or b == 0): - return 0 - return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] - - -def mix_column(data, matrix): - data_mixed = [] - for row in range(4): - mixed = 0 - for column in range(4): - # xor is (+) and (-) - mixed ^= rijndael_mul(data[column], matrix[row][column]) - data_mixed.append(mixed) - return data_mixed - - -def mix_columns(data, matrix=MIX_COLUMN_MATRIX): - data_mixed = [] - for i in range(4): - column = data[i * 4: (i + 1) * 4] - data_mixed += mix_column(column, 
matrix) - return data_mixed - - -def mix_columns_inv(data): - return mix_columns(data, MIX_COLUMN_MATRIX_INV) +def iter_mix_columns(data, matrix): + for i in (0, 4, 8, 12): + for row in matrix: + mixed = 0 + for j in range(4): + # xor is (+) and (-) + mixed ^= (0 if data[i:i + 4][j] == 0 or row[j] == 0 else + RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[data[i + j]] + RIJNDAEL_LOG_TABLE[row[j]]) % 0xFF]) + yield mixed def shift_rows(data): - data_shifted = [] - for column in range(4): - for row in range(4): - data_shifted.append(data[((column + row) & 0b11) * 4 + row]) - return data_shifted + return [data[((column + row) & 0b11) * 4 + row] for column in range(4) for row in range(4)] def shift_rows_inv(data): + return [data[((column - row) & 0b11) * 4 + row] for column in range(4) for row in range(4)] + + +def shift_block(data): data_shifted = [] - for column in range(4): - for row in range(4): - data_shifted.append(data[((column - row) & 0b11) * 4 + row]) + + bit = 0 + for n in data: + if bit: + n |= 0x100 + bit = n & 1 + n >>= 1 + data_shifted.append(n) + return data_shifted @@ -358,4 +461,50 @@ def inc(data): return data -__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] +def block_product(block_x, block_y): + # NIST SP 800-38D, Algorithm 1 + + if len(block_x) != BLOCK_SIZE_BYTES or len(block_y) != BLOCK_SIZE_BYTES: + raise ValueError("Length of blocks need to be %d bytes" % BLOCK_SIZE_BYTES) + + block_r = [0xE1] + [0] * (BLOCK_SIZE_BYTES - 1) + block_v = block_y[:] + block_z = [0] * BLOCK_SIZE_BYTES + + for i in block_x: + for bit in range(7, -1, -1): + if i & (1 << bit): + block_z = xor(block_z, block_v) + + do_xor = block_v[-1] & 1 + block_v = shift_block(block_v) + if do_xor: + block_v = xor(block_v, block_r) + + return block_z + + +def ghash(subkey, data): + # NIST SP 800-38D, Algorithm 2 + + if len(data) % BLOCK_SIZE_BYTES: + raise ValueError("Length of data should be %d bytes" % BLOCK_SIZE_BYTES) + + last_y = 
[0] * BLOCK_SIZE_BYTES + for i in range(0, len(data), BLOCK_SIZE_BYTES): + block = data[i : i + BLOCK_SIZE_BYTES] # noqa: E203 + last_y = block_product(xor(last_y, block), subkey) + + return last_y + + +__all__ = [ + 'aes_ctr_decrypt', + 'aes_cbc_decrypt', + 'aes_cbc_decrypt_bytes', + 'aes_decrypt_text', + 'aes_encrypt', + 'aes_gcm_decrypt_and_verify', + 'aes_gcm_decrypt_and_verify_bytes', + 'key_expansion' +] diff --git a/yt_dlp/cache.py b/yt_dlp/cache.py index dde9cca646..e5cb193bce 100644 --- a/yt_dlp/cache.py +++ b/yt_dlp/cache.py @@ -50,6 +50,7 @@ def store(self, section, key, data, dtype='json'): except OSError as ose: if ose.errno != errno.EEXIST: raise + self._ydl.write_debug(f'Saving {section}.{key} to cache') write_json_file(data, fn) except Exception: tb = traceback.format_exc() @@ -66,6 +67,7 @@ def load(self, section, key, dtype='json', default=None): try: try: with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + self._ydl.write_debug(f'Loading {section}.{key} from cache') return json.load(cachef) except ValueError: try: diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py index ab1a3ba44c..79c8e34946 100644 --- a/yt_dlp/compat.py +++ b/yt_dlp/compat.py @@ -19,6 +19,7 @@ import shutil import socket import struct +import subprocess import sys import tokenize import urllib @@ -33,6 +34,8 @@ class compat_HTMLParseError(Exception): pass +# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE +# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines def compat_ctypes_WINFUNCTYPE(*args, **kwargs): return ctypes.WINFUNCTYPE(*args, **kwargs) @@ -130,6 +133,49 @@ def compat_asyncio_run(coro): asyncio.run = compat_asyncio_run +# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl +# See https://github.com/yt-dlp/yt-dlp/issues/792 +# https://docs.python.org/3/library/os.path.html#os.path.expanduser +if compat_os_name in ('nt', 'ce') and 'HOME' in os.environ: + _userhome = os.environ['HOME'] + + def 
compat_expanduser(path): + if not path.startswith('~'): + return path + i = path.replace('\\', '/', 1).find('/') # ~user + if i < 0: + i = len(path) + userhome = os.path.join(os.path.dirname(_userhome), path[1:i]) if i > 1 else _userhome + return userhome + path[i:] +else: + compat_expanduser = os.path.expanduser + + +try: + from Cryptodome.Cipher import AES as compat_pycrypto_AES +except ImportError: + try: + from Crypto.Cipher import AES as compat_pycrypto_AES + except ImportError: + compat_pycrypto_AES = None + + +WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None + + +def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 + if compat_os_name != 'nt': + return + global WINDOWS_VT_MODE + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + try: + subprocess.Popen('', shell=True, startupinfo=startupinfo) + WINDOWS_VT_MODE = True + except Exception: + pass + + # Deprecated compat_basestring = str @@ -152,7 +198,6 @@ def compat_asyncio_run(coro): compat_cookies_SimpleCookie = compat_cookies.SimpleCookie compat_etree_Element = etree.Element compat_etree_register_namespace = etree.register_namespace -compat_expanduser = os.path.expanduser compat_get_terminal_size = shutil.get_terminal_size compat_getenv = os.getenv compat_getpass = getpass.getpass @@ -189,6 +234,7 @@ def compat_asyncio_run(coro): # Set public objects __all__ = [ + 'WINDOWS_VT_MODE', 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', @@ -224,6 +270,7 @@ def compat_asyncio_run(coro): 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_pycrypto_AES', 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', @@ -252,5 +299,6 @@ def compat_asyncio_run(coro): 'compat_xml_parse_error', 'compat_xpath', 'compat_zip', + 'windows_enable_vt_mode', 'workaround_optparse_bug9161', ] diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index c28833159a..74e133bc96 100644 --- a/yt_dlp/cookies.py +++ 
b/yt_dlp/cookies.py @@ -1,3 +1,4 @@ +import contextlib import ctypes import json import os @@ -7,19 +8,17 @@ import sys import tempfile from datetime import datetime, timedelta, timezone +from enum import Enum, auto from hashlib import pbkdf2_hmac -from yt_dlp.aes import aes_cbc_decrypt -from yt_dlp.compat import ( +from .aes import aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes +from .compat import ( compat_b64decode, compat_cookiejar_Cookie, ) -from yt_dlp.utils import ( - bug_reports_message, - bytes_to_intlist, +from .utils import ( expand_path, - intlist_to_bytes, - process_communicate_or_kill, + Popen, YoutubeDLCookieJar, ) @@ -33,25 +32,16 @@ try: - from Crypto.Cipher import AES - CRYPTO_AVAILABLE = True + import secretstorage + SECRETSTORAGE_AVAILABLE = True except ImportError: - CRYPTO_AVAILABLE = False - -try: - import keyring - KEYRING_AVAILABLE = True - KEYRING_UNAVAILABLE_REASON = f'due to unknown reasons{bug_reports_message()}' -except ImportError: - KEYRING_AVAILABLE = False - KEYRING_UNAVAILABLE_REASON = ( - 'as the `keyring` module is not installed. ' - 'Please install by running `python3 -m pip install keyring`. ' - 'Depending on your platform, additional packages may be required ' - 'to access the keyring; see https://pypi.org/project/keyring') + SECRETSTORAGE_AVAILABLE = False + SECRETSTORAGE_UNAVAILABLE_REASON = ( + 'as the `secretstorage` module is not installed. ' + 'Please install by running `python3 -m pip install secretstorage`.') except Exception as _err: - KEYRING_AVAILABLE = False - KEYRING_UNAVAILABLE_REASON = 'as the `keyring` module could not be initialized: %s' % _err + SECRETSTORAGE_AVAILABLE = False + SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. 
{_err}' CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} @@ -82,8 +72,8 @@ def error(self, message): def load_cookies(cookie_file, browser_specification, ydl): cookie_jars = [] if browser_specification is not None: - browser_name, profile = _parse_browser_specification(*browser_specification) - cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl))) + browser_name, profile, keyring = _parse_browser_specification(*browser_specification) + cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring)) if cookie_file is not None: cookie_file = expand_path(cookie_file) @@ -95,13 +85,13 @@ def load_cookies(cookie_file, browser_specification, ydl): return _merge_cookie_jars(cookie_jars) -def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger()): +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None): if browser_name == 'firefox': return _extract_firefox_cookies(profile, logger) elif browser_name == 'safari': return _extract_safari_cookies(profile, logger) elif browser_name in CHROMIUM_BASED_BROWSERS: - return _extract_chrome_cookies(browser_name, profile, logger) + return _extract_chrome_cookies(browser_name, profile, keyring, logger) else: raise ValueError('unknown browser: {}'.format(browser_name)) @@ -123,9 +113,9 @@ def _extract_firefox_cookies(profile, logger): cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite') if cookie_database_path is None: raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root)) - logger.debug('extracting from: "{}"'.format(cookie_database_path)) + logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) - with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None try: cursor = 
_open_database_copy(cookie_database_path, tmpdir) @@ -215,7 +205,7 @@ def _get_chromium_based_browser_settings(browser_name): } -def _extract_chrome_cookies(browser_name, profile, logger): +def _extract_chrome_cookies(browser_name, profile, keyring, logger): logger.info('Extracting cookies from {}'.format(browser_name)) if not SQLITE_AVAILABLE: @@ -240,11 +230,11 @@ def _extract_chrome_cookies(browser_name, profile, logger): cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies') if cookie_database_path is None: raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root)) - logger.debug('extracting from: "{}"'.format(cookie_database_path)) + logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) - decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger) + decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) - with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) @@ -255,6 +245,7 @@ def _extract_chrome_cookies(browser_name, profile, logger): 'expires_utc, {} FROM cookies'.format(secure_column)) jar = YoutubeDLCookieJar() failed_cookies = 0 + unencrypted_cookies = 0 for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall(): host_key = host_key.decode('utf-8') name = name.decode('utf-8') @@ -266,6 +257,8 @@ def _extract_chrome_cookies(browser_name, profile, logger): if value is None: failed_cookies += 1 continue + else: + unencrypted_cookies += 1 cookie = compat_cookiejar_Cookie( version=0, name=name, value=value, port=None, port_specified=False, @@ -278,6 +271,9 @@ def _extract_chrome_cookies(browser_name, profile, logger): else: failed_message = '' logger.info('Extracted {} cookies from {}{}'.format(len(jar), 
browser_name, failed_message)) + counts = decryptor.cookie_counts.copy() + counts['unencrypted'] = unencrypted_cookies + logger.debug('cookie version breakdown: {}'.format(counts)) return jar finally: if cursor is not None: @@ -313,10 +309,14 @@ class ChromeCookieDecryptor: def decrypt(self, encrypted_value): raise NotImplementedError + @property + def cookie_counts(self): + raise NotImplementedError -def get_cookie_decryptor(browser_root, browser_keyring_name, logger): + +def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): if sys.platform in ('linux', 'linux2'): - return LinuxChromeCookieDecryptor(browser_keyring_name, logger) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) elif sys.platform == 'darwin': return MacChromeCookieDecryptor(browser_keyring_name, logger) elif sys.platform == 'win32': @@ -327,13 +327,12 @@ def get_cookie_decryptor(browser_root, browser_keyring_name, logger): class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, logger): + def __init__(self, browser_keyring_name, logger, *, keyring=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') - if KEYRING_AVAILABLE: - self._v11_key = self.derive_key(_get_linux_keyring_password(browser_keyring_name)) - else: - self._v11_key = None + password = _get_linux_keyring_password(browser_keyring_name, keyring, logger) + self._v11_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} @staticmethod def derive_key(password): @@ -341,28 +340,36 @@ def derive_key(password): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) + @property + def cookie_counts(self): + return self._cookie_counts + def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = 
encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) elif version == b'v11': + self._cookie_counts['v11'] += 1 if self._v11_key is None: - self._logger.warning(f'cannot decrypt cookie {KEYRING_UNAVAILABLE_REASON}', only_once=True) + self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger) else: + self._cookie_counts['other'] += 1 return None class MacChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, browser_keyring_name, logger): self._logger = logger - password = _get_mac_keyring_password(browser_keyring_name) + password = _get_mac_keyring_password(browser_keyring_name, logger) self._v10_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'other': 0} @staticmethod def derive_key(password): @@ -370,11 +377,16 @@ def derive_key(password): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) + @property + def cookie_counts(self): + return self._cookie_counts + def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 if self._v10_key is None: self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None @@ -382,6 +394,7 @@ def decrypt(self, encrypted_value): return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) else: + self._cookie_counts['other'] += 1 # other prefixes are considered 'old data' which were stored as plaintext # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return encrypted_value @@ -391,20 +404,21 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, 
browser_root, logger): self._logger = logger self._v10_key = _get_windows_v10_key(browser_root, logger) + self._cookie_counts = {'v10': 0, 'other': 0} + + @property + def cookie_counts(self): + return self._cookie_counts def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 if self._v10_key is None: self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None - elif not CRYPTO_AVAILABLE: - self._logger.warning('cannot decrypt cookie as the `pycryptodome` module is not installed. ' - 'Please install by running `python3 -m pip install pycryptodome`', - only_once=True) - return None # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc # kNonceLength @@ -421,6 +435,7 @@ def decrypt(self, encrypted_value): return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger) else: + self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8') @@ -559,7 +574,7 @@ def _parse_safari_cookies_record(data, jar, logger): p.skip_to(value_offset) value = p.read_cstring() except UnicodeDecodeError: - logger.warning('failed to parse cookie because UTF-8 decoding failed') + logger.warning('failed to parse Safari cookie because UTF-8 decoding failed', only_once=True) return record_size p.skip_to(record_size, 'space at the end of the record') @@ -590,37 +605,221 @@ def parse_safari_cookies(data, jar=None, logger=YDLLogger()): return jar -def _get_linux_keyring_password(browser_keyring_name): - password = keyring.get_password('{} Keys'.format(browser_keyring_name), - '{} Safe Storage'.format(browser_keyring_name)) - if password is None: - # this sometimes occurs in KDE because 
chrome does not check hasEntry and instead - # just tries to read the value (which kwallet returns "") whereas keyring checks hasEntry - # to verify this: - # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" - # while starting chrome. - # this may be a bug as the intended behaviour is to generate a random password and store - # it, but that doesn't matter here. - password = '' - return password.encode('utf-8') +class _LinuxDesktopEnvironment(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.h + DesktopEnvironment + """ + OTHER = auto() + CINNAMON = auto() + GNOME = auto() + KDE = auto() + PANTHEON = auto() + UNITY = auto() + XFCE = auto() -def _get_mac_keyring_password(browser_keyring_name): - if KEYRING_AVAILABLE: - password = keyring.get_password('{} Safe Storage'.format(browser_keyring_name), browser_keyring_name) - return password.encode('utf-8') +class _LinuxKeyring(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h + SelectedLinuxBackend + """ + KWALLET = auto() + GNOMEKEYRING = auto() + BASICTEXT = auto() + + +SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() + + +def _get_linux_desktop_environment(env): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc + GetDesktopEnvironment + """ + xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) + desktop_session = env.get('DESKTOP_SESSION', None) + if xdg_current_desktop is not None: + xdg_current_desktop = xdg_current_desktop.split(':')[0].strip() + + if xdg_current_desktop == 'Unity': + if desktop_session is not None and 'gnome-fallback' in desktop_session: + return _LinuxDesktopEnvironment.GNOME + else: + return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'GNOME': + return _LinuxDesktopEnvironment.GNOME + elif xdg_current_desktop == 'X-Cinnamon': + return _LinuxDesktopEnvironment.CINNAMON + elif 
xdg_current_desktop == 'KDE': + return _LinuxDesktopEnvironment.KDE + elif xdg_current_desktop == 'Pantheon': + return _LinuxDesktopEnvironment.PANTHEON + elif xdg_current_desktop == 'XFCE': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session is not None: + if desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif 'kde' in desktop_session: + return _LinuxDesktopEnvironment.KDE + elif 'xfce' in desktop_session: + return _LinuxDesktopEnvironment.XFCE else: - proc = subprocess.Popen(['security', 'find-generic-password', - '-w', # write password to stdout - '-a', browser_keyring_name, # match 'account' - '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) - try: - stdout, stderr = process_communicate_or_kill(proc) - return stdout - except BaseException: - return None + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + return _LinuxDesktopEnvironment.KDE + else: + return _LinuxDesktopEnvironment.OTHER + + +def _choose_linux_keyring(logger): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc + SelectBackend + """ + desktop_environment = _get_linux_desktop_environment(os.environ) + logger.debug('detected desktop environment: {}'.format(desktop_environment.name)) + if desktop_environment == _LinuxDesktopEnvironment.KDE: + linux_keyring = _LinuxKeyring.KWALLET + elif desktop_environment == _LinuxDesktopEnvironment.OTHER: + linux_keyring = _LinuxKeyring.BASICTEXT + else: + linux_keyring = _LinuxKeyring.GNOMEKEYRING + return linux_keyring + + +def _get_kwallet_network_wallet(logger): + """ The name of the wallet used to store network passwords. 
+ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc + KWalletDBus::NetworkWallet + which does a dbus call to the following function: + https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html + Wallet::NetworkWallet + """ + default_wallet = 'kdewallet' + try: + proc = Popen([ + 'dbus-send', '--session', '--print-reply=literal', + '--dest=org.kde.kwalletd5', + '/modules/kwalletd5', + 'org.kde.KWallet.networkWallet' + ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if proc.returncode != 0: + logger.warning('failed to read NetworkWallet') + return default_wallet + else: + network_wallet = stdout.decode('utf-8').strip() + logger.debug('NetworkWallet = "{}"'.format(network_wallet)) + return network_wallet + except BaseException as e: + logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) + return default_wallet + + +def _get_kwallet_password(browser_keyring_name, logger): + logger.debug('using kwallet-query to obtain password from kwallet') + + if shutil.which('kwallet-query') is None: + logger.error('kwallet-query command not found. KWallet and kwallet-query ' + 'must be installed to read from KWallet. kwallet-query should be' + 'included in the kwallet package for your distribution') + return b'' + + network_wallet = _get_kwallet_network_wallet(logger) + + try: + proc = Popen([ + 'kwallet-query', + '--read-password', '{} Safe Storage'.format(browser_keyring_name), + '--folder', '{} Keys'.format(browser_keyring_name), + network_wallet + ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if proc.returncode != 0: + logger.error('kwallet-query failed with return code {}. Please consult ' + 'the kwallet-query man page for details'.format(proc.returncode)) + return b'' + else: + if stdout.lower().startswith(b'failed to read'): + logger.debug('failed to read password from kwallet. 
Using empty string instead') + # this sometimes occurs in KDE because chrome does not check hasEntry and instead + # just tries to read the value (which kwallet returns "") whereas kwallet-query + # checks hasEntry. To verify this: + # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" + # while starting chrome. + # this may be a bug as the intended behaviour is to generate a random password and store + # it, but that doesn't matter here. + return b'' + else: + logger.debug('password found') + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except BaseException as e: + logger.warning(f'exception running kwallet-query: {type(e).__name__}({e})') + return b'' + + +def _get_gnome_keyring_password(browser_keyring_name, logger): + if not SECRETSTORAGE_AVAILABLE: + logger.error('secretstorage not available {}'.format(SECRETSTORAGE_UNAVAILABLE_REASON)) + return b'' + # the Gnome keyring does not seem to organise keys in the same way as KWallet, + # using `dbus-monitor` during startup, it can be observed that chromium lists all keys + # and presumably searches for its key in the list. It appears that we must do the same. + # https://github.com/jaraco/keyring/issues/556 + with contextlib.closing(secretstorage.dbus_init()) as con: + col = secretstorage.get_default_collection(con) + for item in col.get_all_items(): + if item.get_label() == '{} Safe Storage'.format(browser_keyring_name): + return item.get_secret() + else: + logger.error('failed to read from keyring') + return b'' + + +def _get_linux_keyring_password(browser_keyring_name, keyring, logger): + # note: chrome/chromium can be run with the following flags to determine which keyring backend + # it has chosen to use + # chromium --enable-logging=stderr --v=1 2>&1 | grep key_storage_ + # Chromium supports a flag: --password-store=<basic|gnome|kwallet> so the automatic detection + # will not be sufficient in all cases. 
+ + keyring = _LinuxKeyring[keyring] or _choose_linux_keyring(logger) + logger.debug(f'Chosen keyring: {keyring.name}') + + if keyring == _LinuxKeyring.KWALLET: + return _get_kwallet_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.GNOMEKEYRING: + return _get_gnome_keyring_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.BASICTEXT: + # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required) + return None + assert False, f'Unknown keyring {keyring}' + + +def _get_mac_keyring_password(browser_keyring_name, logger): + logger.debug('using find-generic-password to obtain password from OSX keychain') + try: + proc = Popen( + ['security', 'find-generic-password', + '-w', # write password to stdout + '-a', browser_keyring_name, # match 'account' + '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' + stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except BaseException as e: + logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') + return None def _get_windows_v10_key(browser_root, logger): @@ -628,7 +827,7 @@ def _get_windows_v10_key(browser_root, logger): if path is None: logger.error('could not find local state file') return None - with open(path, 'r') as f: + with open(path, 'r', encoding='utf8') as f: data = json.load(f) try: base64_key = data['os_crypt']['encrypted_key'] @@ -648,29 +847,26 @@ def pbkdf2_sha1(password, salt, iterations, key_length): def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): - plaintext = aes_cbc_decrypt(bytes_to_intlist(ciphertext), - bytes_to_intlist(key), - bytes_to_intlist(initialization_vector)) + plaintext = aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector) padding_length = plaintext[-1] try: - return 
intlist_to_bytes(plaintext[:-padding_length]).decode('utf-8') + return plaintext[:-padding_length].decode('utf-8') except UnicodeDecodeError: - logger.warning('failed to decrypt cookie because UTF-8 decoding failed. Possibly the key is wrong?') + logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) return None def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): - cipher = AES.new(key, AES.MODE_GCM, nonce) try: - plaintext = cipher.decrypt_and_verify(ciphertext, authentication_tag) + plaintext = aes_gcm_decrypt_and_verify_bytes(ciphertext, key, authentication_tag, nonce) except ValueError: - logger.warning('failed to decrypt cookie because the MAC check failed. Possibly the key is wrong?') + logger.warning('failed to decrypt cookie (AES-GCM) because the MAC check failed. Possibly the key is wrong?', only_once=True) return None try: return plaintext.decode('utf-8') except UnicodeDecodeError: - logger.warning('failed to decrypt cookie because UTF-8 decoding failed. Possibly the key is wrong?') + logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. 
Possibly the key is wrong?', only_once=True) return None @@ -698,7 +894,7 @@ class DATA_BLOB(ctypes.Structure): ctypes.byref(blob_out) # pDataOut ) if not ret: - logger.warning('failed to decrypt with DPAPI') + logger.warning('failed to decrypt with DPAPI', only_once=True) return None result = ctypes.string_at(blob_out.pbData, blob_out.cbData) @@ -747,9 +943,11 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser_name, profile=None): +def _parse_browser_specification(browser_name, profile=None, keyring=None): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser: "{browser_name}"') + if keyring not in (None, *SUPPORTED_KEYRINGS): + raise ValueError(f'unsupported keyring: "{keyring}"') if profile is not None and _is_path(profile): profile = os.path.expanduser(profile) - return browser_name, profile + return browser_name, profile, keyring diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 739d98c2b6..acc19f43a6 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -10,10 +10,20 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None, to_stdout=False): info_dict['protocol'] = determine_protocol(info_dict) info_copy = info_dict.copy() - if protocol: - info_copy['protocol'] = protocol info_copy['to_stdout'] = to_stdout - return _get_suitable_downloader(info_copy, params, default) + + protocols = (protocol or info_copy['protocol']).split('+') + downloaders = [_get_suitable_downloader(info_copy, proto, params, default) for proto in protocols] + + if set(downloaders) == {FFmpegFD} and FFmpegFD.can_merge_formats(info_copy, params): + return FFmpegFD + elif (set(downloaders) == {DashSegmentsFD} + and not (to_stdout and len(protocols) > 1) + and set(protocols) == {'http_dash_segments_generator'}): + return DashSegmentsFD + elif len(downloaders) == 1: + return downloaders[0] + return None # Some of these require 
get_suitable_downloader @@ -36,6 +46,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N PROTOCOL_MAP = { 'rtmp': RtmpFD, + 'rtmpe': RtmpFD, 'rtmp_ffmpeg': FFmpegFD, 'm3u8_native': HlsFD, 'm3u8': FFmpegFD, @@ -43,6 +54,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N 'rtsp': RtspFD, 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, + 'http_dash_segments_generator': DashSegmentsFD, 'ism': IsmFD, 'mhtml': MhtmlFD, 'niconico_dmc': NiconicoDmcFD, @@ -57,6 +69,7 @@ def shorten_protocol_name(proto, simplify=False): 'm3u8_native': 'm3u8_n', 'rtmp_ffmpeg': 'rtmp_f', 'http_dash_segments': 'dash', + 'http_dash_segments_generator': 'dash_g', 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } @@ -65,6 +78,7 @@ def shorten_protocol_name(proto, simplify=False): 'https': 'http', 'ftps': 'ftp', 'm3u8_native': 'm3u8', + 'http_dash_segments_generator': 'dash', 'rtmp_ffmpeg': 'rtmp', 'm3u8_frag_urls': 'm3u8', 'dash_frag_urls': 'dash', @@ -72,7 +86,7 @@ def shorten_protocol_name(proto, simplify=False): return short_protocol_names.get(proto, proto) -def _get_suitable_downloader(info_dict, params, default): +def _get_suitable_downloader(info_dict, protocol, params, default): """Get the downloader class that can handle the info dict.""" if default is NO_DEFAULT: default = HttpFD @@ -80,7 +94,7 @@ def _get_suitable_downloader(info_dict, params, default): # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD - protocol = info_dict['protocol'] + info_dict['protocol'] = protocol downloaders = params.get('external_downloader') external_downloader = ( downloaders if isinstance(downloaders, compat_str) or downloaders is None diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index f5f6393a62..37321e34b1 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -1,20 +1,26 @@ 
from __future__ import division, unicode_literals -import copy import os import re -import sys import time import random +import errno -from ..compat import compat_os_name from ..utils import ( decodeArgument, encodeFilename, error_to_compat_str, format_bytes, + sanitize_open, shell_quote, timeconvert, + timetuple_from_msec, +) +from ..minicurses import ( + MultilineLogger, + MultilinePrinter, + QuietMultilinePrinter, + BreaklineStatusPrinter ) @@ -35,12 +41,11 @@ class FileDownloader(object): ratelimit: Download speed limit, in bytes/sec. throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for HTTP error 5xx + file_access_retries: Number of times to retry on file access error buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. noprogress: Do not print the progress bar. - logtostderr: Log messages to stderr instead of stdout. - consoletitle: Display progress in console window's titlebar. nopart: Do not use temporary .part files. updatetime: Use the Last-modified header to set output file timestamps. test: Download only first bytes to test the downloader. @@ -56,6 +61,7 @@ class FileDownloader(object): http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be useful for bypassing bandwidth throttling imposed by a webserver (experimental) + progress_template: See YoutubeDL.py Subclasses of this one must re-define the real_download method. 
""" @@ -68,18 +74,17 @@ def __init__(self, ydl, params): self.ydl = ydl self._progress_hooks = [] self.params = params + self._prepare_multiline_status() self.add_progress_hook(self.report_progress) @staticmethod def format_seconds(seconds): - (mins, secs) = divmod(seconds, 60) - (hours, mins) = divmod(mins, 60) - if hours > 99: + time = timetuple_from_msec(seconds * 1000) + if time.hours > 99: return '--:--:--' - if hours == 0: - return '%02d:%02d' % (mins, secs) - else: - return '%02d:%02d:%02d' % (hours, mins, secs) + if not time.hours: + return '%02d:%02d' % time[1:-1] + return '%02d:%02d:%02d' % time[:-1] @staticmethod def calc_percent(byte_counter, data_len): @@ -91,6 +96,8 @@ def calc_percent(byte_counter, data_len): def format_percent(percent): if percent is None: return '---.-%' + elif percent == 100: + return '100%' return '%6s' % ('%3.1f%%' % percent) @staticmethod @@ -203,16 +210,28 @@ def undo_temp_name(self, filename): def ytdl_filename(self, filename): return filename + '.ytdl' + def sanitize_open(self, filename, open_mode): + file_access_retries = self.params.get('file_access_retries', 10) + retry = 0 + while True: + try: + return sanitize_open(filename, open_mode) + except (IOError, OSError) as err: + retry = retry + 1 + if retry > file_access_retries or err.errno not in (errno.EACCES,): + raise + self.to_screen( + '[download] Got file access error. Retrying (attempt %d of %s) ...' 
+ % (retry, self.format_retries(file_access_retries))) + time.sleep(0.01) + def try_rename(self, old_filename, new_filename): if old_filename == new_filename: return try: - if self.params.get('overwrites', False): - if os.path.isfile(encodeFilename(new_filename)): - os.remove(encodeFilename(new_filename)) - os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) + os.replace(old_filename, new_filename) except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % error_to_compat_str(err)) + self.report_error(f'unable to rename file: {err}') def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" @@ -239,39 +258,67 @@ def report_destination(self, filename): """Report destination filename.""" self.to_screen('[download] Destination: ' + filename) - def _report_progress_status(self, msg, is_last_line=False): - fullmsg = '[download] ' + msg - if self.params.get('progress_with_newline', False): - self.to_screen(fullmsg) + def _prepare_multiline_status(self, lines=1): + if self.params.get('noprogress'): + self._multiline = QuietMultilinePrinter() + elif self.ydl.params.get('logger'): + self._multiline = MultilineLogger(self.ydl.params['logger'], lines) + elif self.params.get('progress_with_newline'): + self._multiline = BreaklineStatusPrinter(self.ydl._screen_file, lines) else: - if compat_os_name == 'nt': - prev_len = getattr(self, '_report_progress_prev_line_length', - 0) - if prev_len > len(fullmsg): - fullmsg += ' ' * (prev_len - len(fullmsg)) - self._report_progress_prev_line_length = len(fullmsg) - clear_line = '\r' - else: - clear_line = ('\r\x1b[K' if sys.stderr.isatty() else '\r') - self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) - self.to_console_title('yt-dlp ' + msg) + self._multiline = MultilinePrinter(self.ydl._screen_file, lines, not self.params.get('quiet')) + self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not 
self.params.get('no_color') + + def _finish_multiline_status(self): + self._multiline.end() + + _progress_styles = { + 'downloaded_bytes': 'light blue', + 'percent': 'light blue', + 'eta': 'yellow', + 'speed': 'green', + 'elapsed': 'bold white', + 'total_bytes': '', + 'total_bytes_estimate': '', + } + + def _report_progress_status(self, s, default_template): + for name, style in self._progress_styles.items(): + name = f'_{name}_str' + if name not in s: + continue + s[name] = self._format_progress(s[name], style) + s['_default_template'] = default_template % s + + progress_dict = s.copy() + progress_dict.pop('info_dict') + progress_dict = {'info': s['info_dict'], 'progress': progress_dict} + + progress_template = self.params.get('progress_template', {}) + self._multiline.print_at_line(self.ydl.evaluate_outtmpl( + progress_template.get('download') or '[download] %(progress._default_template)s', + progress_dict), s.get('progress_idx') or 0) + self.to_console_title(self.ydl.evaluate_outtmpl( + progress_template.get('download-title') or 'yt-dlp %(progress._default_template)s', + progress_dict)) + + def _format_progress(self, *args, **kwargs): + return self.ydl._format_text( + self._multiline.stream, self._multiline.allow_colors, *args, **kwargs) def report_progress(self, s): if s['status'] == 'finished': - if self.params.get('noprogress', False): + if self.params.get('noprogress'): self.to_screen('[download] Download completed') - else: - msg_template = '100%%' - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template += ' of %(_total_bytes_str)s' - if s.get('elapsed') is not None: - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template += ' in %(_elapsed_str)s' - self._report_progress_status( - msg_template % s, is_last_line=True) - - if self.params.get('noprogress'): + msg_template = '100%%' + if s.get('total_bytes') is not None: + s['_total_bytes_str'] = format_bytes(s['total_bytes']) + msg_template 
+= ' of %(_total_bytes_str)s' + if s.get('elapsed') is not None: + s['_elapsed_str'] = self.format_seconds(s['elapsed']) + msg_template += ' in %(_elapsed_str)s' + s['_percent_str'] = self.format_percent(100) + self._report_progress_status(s, msg_template) return if s['status'] != 'downloading': @@ -280,7 +327,7 @@ def report_progress(self, s): if s.get('eta') is not None: s['_eta_str'] = self.format_eta(s['eta']) else: - s['_eta_str'] = 'Unknown ETA' + s['_eta_str'] = 'Unknown' if s.get('total_bytes') and s.get('downloaded_bytes') is not None: s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) @@ -312,9 +359,12 @@ def report_progress(self, s): else: msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' else: - msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s' - - self._report_progress_status(msg_template % s) + msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s' + if s.get('fragment_index') and s.get('fragment_count'): + msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)' + elif s.get('fragment_index'): + msg_template += ' (frag %(fragment_index)s)' + self._report_progress_status(s, msg_template) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" @@ -365,6 +415,7 @@ def download(self, filename, info_dict, subtitle=False): 'status': 'finished', 'total_bytes': os.path.getsize(encodeFilename(filename)), }, info_dict) + self._finish_multiline_status() return True, False if subtitle is False: @@ -386,7 +437,9 @@ def download(self, filename, info_dict, subtitle=False): '[download] Sleeping %s seconds ...' % ( sleep_interval_sub)) time.sleep(sleep_interval_sub) - return self.real_download(filename, info_dict), True + ret = self.real_download(filename, info_dict) + self._finish_multiline_status() + return ret, True def real_download(self, filename, info_dict): """Real download process. 
Redefine in subclasses.""" @@ -395,13 +448,10 @@ def real_download(self, filename, info_dict): def _hook_progress(self, status, info_dict): if not self._progress_hooks: return - info_dict = dict(info_dict) - for key in ('__original_infodict', '__postprocessors'): - info_dict.pop(key, None) + status['info_dict'] = info_dict # youtube-dl passes the same status object to all the hooks. # Some third party scripts seems to be relying on this. # So keep this behavior if possible - status['info_dict'] = copy.deepcopy(info_dict) for ph in self._progress_hooks: ph(status) diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index 734eab3ef2..a845ee7d3d 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import time from ..downloader import get_suitable_downloader from .fragment import FragmentFD @@ -15,27 +16,53 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - if info_dict.get('is_live'): + if info_dict.get('is_live') and set(info_dict['protocol'].split('+')) != {'http_dash_segments_generator'}: self.report_error('Live DASH videos are not supported') - fragment_base_url = info_dict.get('fragment_base_url') - fragments = info_dict['fragments'][:1] if self.params.get( - 'test', False) else info_dict['fragments'] - + real_start = time.time() real_downloader = get_suitable_downloader( info_dict, self.params, None, protocol='dash_frag_urls', to_stdout=(filename == '-')) - ctx = { - 'filename': filename, - 'total_frags': len(fragments), - } + requested_formats = [{**info_dict, **fmt} for fmt in info_dict.get('requested_formats', [])] + args = [] + for fmt in requested_formats or [info_dict]: + try: + fragment_count = 1 if self.params.get('test') else len(fmt['fragments']) + except TypeError: + fragment_count = None + ctx = { + 'filename': fmt.get('filepath') or filename, + 'live': 'is_from_start' if fmt.get('is_from_start') else 
fmt.get('is_live'), + 'total_frags': fragment_count, + } - if real_downloader: - self._prepare_external_frag_download(ctx) - else: - self._prepare_and_start_frag_download(ctx, info_dict) + if real_downloader: + self._prepare_external_frag_download(ctx) + else: + self._prepare_and_start_frag_download(ctx, fmt) + ctx['start'] = real_start + + fragments_to_download = self._get_fragments(fmt, ctx) + + if real_downloader: + self.to_screen( + '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + info_dict['fragments'] = list(fragments_to_download) + fd = real_downloader(self.ydl, self.params) + return fd.real_download(filename, info_dict) + + args.append([ctx, fragments_to_download, fmt]) + + return self.download_and_append_fragments_multiple(*args) + + def _resolve_fragments(self, fragments, ctx): + fragments = fragments(ctx) if callable(fragments) else fragments + return [next(iter(fragments))] if self.params.get('test') else fragments + + def _get_fragments(self, fmt, ctx): + fragment_base_url = fmt.get('fragment_base_url') + fragments = self._resolve_fragments(fmt['fragments'], ctx) - fragments_to_download = [] frag_index = 0 for i, fragment in enumerate(fragments): frag_index += 1 @@ -46,18 +73,8 @@ def real_download(self, filename, info_dict): assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) - fragments_to_download.append({ + yield { 'frag_index': frag_index, 'index': i, 'url': fragment_url, - }) - - if real_downloader: - self.to_screen( - '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) - info_copy = info_dict.copy() - info_copy['fragments'] = fragments_to_download - fd = real_downloader(self.ydl, self.params) - return fd.real_download(filename, info_copy) - - return self.download_and_append_fragments(ctx, fragments_to_download, info_dict) + } diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 
fdfabb38da..17be3c46f5 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -6,13 +6,7 @@ import sys import time -try: - from Crypto.Cipher import AES - can_decrypt_frag = True -except ImportError: - can_decrypt_frag = False - -from .common import FileDownloader +from .fragment import FragmentFD from ..compat import ( compat_setenv, compat_str, @@ -27,14 +21,11 @@ encodeArgument, handle_youtubedl_headers, check_executable, - is_outdated_version, - process_communicate_or_kill, - sanitized_Request, - sanitize_open, + Popen, ) -class ExternalFD(FileDownloader): +class ExternalFD(FragmentFD): SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps') can_download_to_stdout = False @@ -122,73 +113,54 @@ def _call_downloader(self, tmpfilename, info_dict): self._debug_cmd(cmd) - if 'fragments' in info_dict: - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - - count = 0 - while count <= fragment_retries: - p = subprocess.Popen( - cmd, stderr=subprocess.PIPE) - _, stderr = process_communicate_or_kill(p) - if p.returncode == 0: - break - # TODO: Decide whether to retry based on error code - # https://aria2.github.io/manual/en/html/aria2c.html#exit-status - self.to_stderr(stderr.decode('utf-8', 'replace')) - count += 1 - if count <= fragment_retries: - self.to_screen( - '[%s] Got error. Retrying fragments (attempt %d of %s)...' 
- % (self.get_basename(), count, self.format_retries(fragment_retries))) - if count > fragment_retries: - if not skip_unavailable_fragments: - self.report_error('Giving up after %s fragment retries' % fragment_retries) - return -1 - - dest, _ = sanitize_open(tmpfilename, 'wb') - for frag_index, fragment in enumerate(info_dict['fragments']): - fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) - try: - src, _ = sanitize_open(fragment_filename, 'rb') - except IOError: - if skip_unavailable_fragments and frag_index > 1: - self.to_screen('[%s] Skipping fragment %d ...' % (self.get_basename(), frag_index)) - continue - self.report_error('Unable to open fragment %d' % frag_index) - return -1 - decrypt_info = fragment.get('decrypt_info') - if decrypt_info: - if decrypt_info['METHOD'] == 'AES-128': - iv = decrypt_info.get('IV') - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( - self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - encrypted_data = src.read() - decrypted_data = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(encrypted_data) - dest.write(decrypted_data) - else: - fragment_data = src.read() - dest.write(fragment_data) - else: - fragment_data = src.read() - dest.write(fragment_data) - src.close() - if not self.params.get('keep_fragments', False): - os.remove(encodeFilename(fragment_filename)) - dest.close() - os.remove(encodeFilename('%s.frag.urls' % tmpfilename)) - else: - p = subprocess.Popen( - cmd, stderr=subprocess.PIPE) - _, stderr = process_communicate_or_kill(p) + if 'fragments' not in info_dict: + p = Popen(cmd, stderr=subprocess.PIPE) + _, stderr = p.communicate_or_kill() if p.returncode != 0: self.to_stderr(stderr.decode('utf-8', 'replace')) - return p.returncode + return p.returncode - def _prepare_url(self, info_dict, url): - headers = info_dict.get('http_headers') - return sanitized_Request(url, None, headers) if headers else url + fragment_retries = 
self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + + count = 0 + while count <= fragment_retries: + p = Popen(cmd, stderr=subprocess.PIPE) + _, stderr = p.communicate_or_kill() + if p.returncode == 0: + break + # TODO: Decide whether to retry based on error code + # https://aria2.github.io/manual/en/html/aria2c.html#exit-status + self.to_stderr(stderr.decode('utf-8', 'replace')) + count += 1 + if count <= fragment_retries: + self.to_screen( + '[%s] Got error. Retrying fragments (attempt %d of %s)...' + % (self.get_basename(), count, self.format_retries(fragment_retries))) + if count > fragment_retries: + if not skip_unavailable_fragments: + self.report_error('Giving up after %s fragment retries' % fragment_retries) + return -1 + + decrypt_fragment = self.decrypter(info_dict) + dest, _ = self.sanitize_open(tmpfilename, 'wb') + for frag_index, fragment in enumerate(info_dict['fragments']): + fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) + try: + src, _ = self.sanitize_open(fragment_filename, 'rb') + except IOError as err: + if skip_unavailable_fragments and frag_index > 1: + self.report_skip_fragment(frag_index, err) + continue + self.report_error(f'Unable to open fragment {frag_index}; {err}') + return -1 + dest.write(decrypt_fragment(fragment, src.read())) + src.close() + if not self.params.get('keep_fragments', False): + os.remove(encodeFilename(fragment_filename)) + dest.close() + os.remove(encodeFilename('%s.frag.urls' % tmpfilename)) + return 0 class CurlFD(ExternalFD): @@ -223,8 +195,8 @@ def _call_downloader(self, tmpfilename, info_dict): self._debug_cmd(cmd) # curl writes the progress to stderr so don't capture it. 
- p = subprocess.Popen(cmd) - process_communicate_or_kill(p) + p = Popen(cmd) + p.communicate_or_kill() return p.returncode @@ -288,10 +260,12 @@ def _make_cmd(self, tmpfilename, info_dict): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--max-overall-download-limit', 'ratelimit') cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') + cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=') cmd += self._configuration_args() # aria2c strips out spaces from the beginning/end of filenames and paths. @@ -316,7 +290,7 @@ def _make_cmd(self, tmpfilename, info_dict): for frag_index, fragment in enumerate(info_dict['fragments']): fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index) url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename)) - stream, _ = sanitize_open(url_list_file, 'wb') + stream, _ = self.sanitize_open(url_list_file, 'wb') stream.write('\n'.join(url_list).encode('utf-8')) stream.close() cmd += ['-i', url_list_file] @@ -351,12 +325,16 @@ def available(cls, path=None): # Fixme: This may be wrong when --ffmpeg-location is used return FFmpegPostProcessor().available + @classmethod + def supports(cls, info_dict): + return all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')) + def on_process_started(self, proc, stdin): """ Override this in subclasses """ pass @classmethod - def can_merge_formats(cls, info_dict, params={}): + def can_merge_formats(cls, info_dict, params): return ( info_dict.get('requested_formats') and info_dict.get('protocol') @@ -465,8 +443,7 @@ def _call_downloader(self, tmpfilename, info_dict): if 
info_dict.get('requested_formats') or protocol == 'http_dash_segments': for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): stream_number = fmt.get('manifest_stream_number', 0) - a_or_v = 'a' if fmt.get('acodec') != 'none' else 'v' - args.extend(['-map', f'{i}:{a_or_v}:{stream_number}']) + args.extend(['-map', f'{i}:{stream_number}']) if self.params.get('test', False): args += ['-fs', compat_str(self._TEST_FILE_SIZE)] @@ -480,7 +457,7 @@ def _call_downloader(self, tmpfilename, info_dict): args += ['-f', 'mpegts'] else: args += ['-f', 'mp4'] - if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + if (ffpp.basename == 'ffmpeg' and ffpp._features.get('needs_adtstoasc')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): args += ['-bsf:a', 'aac_adtstoasc'] elif protocol == 'rtmp': args += ['-f', 'flv'] @@ -495,7 +472,7 @@ def _call_downloader(self, tmpfilename, info_dict): args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) - proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) + proc = Popen(args, stdin=subprocess.PIPE, env=env) if url in ('-', 'pipe:'): self.on_process_started(proc, proc.stdin) try: @@ -507,7 +484,7 @@ def _call_downloader(self, tmpfilename, info_dict): # streams). Note that Windows is not affected and produces playable # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). 
if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): - process_communicate_or_kill(proc, b'q') + proc.communicate_or_kill(b'q') else: proc.kill() proc.wait() @@ -522,7 +499,7 @@ class AVconvFD(FFmpegFD): _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() - if name.endswith('FD') and name != 'ExternalFD' + if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') ) diff --git a/yt_dlp/downloader/f4m.py b/yt_dlp/downloader/f4m.py index 9da2776d92..0008b7c286 100644 --- a/yt_dlp/downloader/f4m.py +++ b/yt_dlp/downloader/f4m.py @@ -366,7 +366,7 @@ def real_download(self, filename, info_dict): ctx = { 'filename': filename, 'total_frags': total_frags, - 'live': live, + 'live': bool(live), } self._prepare_frag_download(ctx) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index e3af140fde..d4f112b0f4 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -1,14 +1,10 @@ from __future__ import division, unicode_literals +import http.client +import json +import math import os import time -import json - -try: - from Crypto.Cipher import AES - can_decrypt_frag = True -except ImportError: - can_decrypt_frag = False try: import concurrent.futures @@ -18,7 +14,9 @@ from .common import FileDownloader from .http import HttpFD +from ..aes import aes_cbc_decrypt_bytes from ..compat import ( + compat_os_name, compat_urllib_error, compat_struct_pack, ) @@ -26,7 +24,6 @@ DownloadError, error_to_compat_str, encodeFilename, - sanitize_open, sanitized_Request, ) @@ -35,6 +32,10 @@ class HttpQuietDownloader(HttpFD): def to_screen(self, *args, **kargs): pass + def report_retry(self, err, count, retries): + super().to_screen( + f'[download] Got server HTTP error: {err}. 
Retrying (attempt {count} of {self.format_retries(retries)}) ...') + class FragmentFD(FileDownloader): """ @@ -48,6 +49,7 @@ class FragmentFD(FileDownloader): Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is finished + concurrent_fragment_downloads: The number of threads to use for native hls and dash downloads _no_ytdl_file: Don't use .ytdl file For each incomplete fragment download yt-dlp keeps on disk a special @@ -76,8 +78,9 @@ def report_retry_fragment(self, err, frag_index, count, retries): '\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...' % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) - def report_skip_fragment(self, frag_index): - self.to_screen('[download] Skipping fragment %d ...' % frag_index) + def report_skip_fragment(self, frag_index, err=None): + err = f' {err};' if err else '' + self.to_screen(f'[download]{err} Skipping fragment {frag_index:d} ...') def _prepare_url(self, info_dict, url): headers = info_dict.get('http_headers') @@ -88,11 +91,11 @@ def _prepare_and_start_frag_download(self, ctx, info_dict): self._start_frag_download(ctx, info_dict) def __do_ytdl_file(self, ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' and not self.params.get('_no_ytdl_file') + return ctx['live'] is not True and ctx['tmpfilename'] != '-' and not self.params.get('_no_ytdl_file') def _read_ytdl_file(self, ctx): assert 'ytdl_corrupt' not in ctx - stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') + stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'r') try: ytdl_data = json.loads(stream.read()) ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index'] @@ -104,7 +107,7 @@ def _read_ytdl_file(self, ctx): stream.close() def _write_ytdl_file(self, ctx): - frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') + frag_index_stream, _ = 
self.sanitize_open(self.ytdl_filename(ctx['filename']), 'w') try: downloader = { 'current_fragment': { @@ -125,6 +128,7 @@ def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_dat 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), 'request_data': request_data, + 'ctx_id': ctx.get('ctx_id'), } success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: @@ -135,7 +139,7 @@ def _download_fragment(self, ctx, frag_url, info_dict, headers=None, request_dat return True, self._read_fragment(ctx) def _read_fragment(self, ctx): - down, frag_sanitized = sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() down.close() @@ -169,7 +173,7 @@ def _prepare_frag_download(self, ctx): self.ydl, { 'continuedl': True, - 'quiet': True, + 'quiet': self.params.get('quiet'), 'noprogress': True, 'ratelimit': self.params.get('ratelimit'), 'retries': self.params.get('retries', 0), @@ -211,7 +215,7 @@ def _prepare_frag_download(self, ctx): self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 - dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) + dest_stream, tmpfilename = self.sanitize_open(tmpfilename, open_mode) ctx.update({ 'dl': dl, @@ -224,6 +228,7 @@ def _prepare_frag_download(self, ctx): def _start_frag_download(self, ctx, info_dict): resume_len = ctx['complete_frags_downloaded_bytes'] total_frags = ctx['total_frags'] + ctx_id = ctx.get('ctx_id') # This dict stores the download progress, it's updated by the progress # hook state = { @@ -238,6 +243,7 @@ def _start_frag_download(self, ctx, info_dict): start = time.time() ctx.update({ 'started': start, + 'fragment_started': start, # Amount of fragment's bytes downloaded by the time of the previous # frag progress hook invocation 'prev_frag_downloaded_bytes': 0, @@ -247,6 +253,12 @@ 
def frag_progress_hook(s): if s['status'] not in ('downloading', 'finished'): return + if ctx_id is not None and s.get('ctx_id') != ctx_id: + return + + state['max_progress'] = ctx.get('max_progress') + state['progress_idx'] = ctx.get('progress_idx') + time_now = time.time() state['elapsed'] = time_now - start frag_total_bytes = s.get('total_bytes') or 0 @@ -262,6 +274,9 @@ def frag_progress_hook(s): ctx['fragment_index'] = state['fragment_index'] state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_total_bytes) + ctx['fragment_started'] = time.time() ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] @@ -270,8 +285,8 @@ def frag_progress_hook(s): state['eta'] = self.calc_eta( start, time_now, estimated_size - resume_len, state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_downloaded_bytes) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) @@ -306,6 +321,9 @@ def _finish_frag_download(self, ctx, info_dict): 'filename': ctx['filename'], 'status': 'finished', 'elapsed': elapsed, + 'ctx_id': ctx.get('ctx_id'), + 'max_progress': ctx.get('max_progress'), + 'progress_idx': ctx.get('progress_idx'), }, info_dict) def _prepare_external_frag_download(self, ctx): @@ -329,15 +347,96 @@ def _prepare_external_frag_download(self, ctx): 'fragment_index': 0, }) - def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None): + def decrypter(self, info_dict): + _key_cache = {} + + def _get_key(url): + if url not in _key_cache: + _key_cache[url] = self.ydl.urlopen(self._prepare_url(info_dict, url)).read() + 
return _key_cache[url] + + def decrypt_fragment(fragment, frag_content): + decrypt_info = fragment.get('decrypt_info') + if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': + return frag_content + iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', fragment['media_sequence']) + decrypt_info['KEY'] = decrypt_info.get('KEY') or _get_key(info_dict.get('_decryption_key_url') or decrypt_info['URI']) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if self.params.get('test', False): + return frag_content + decrypted_data = aes_cbc_decrypt_bytes(frag_content, decrypt_info['KEY'], iv) + return decrypted_data[:-decrypted_data[-1]] + + return decrypt_fragment + + def download_and_append_fragments_multiple(self, *args, pack_func=None, finish_func=None): + ''' + @params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ... 
+ all args must be either tuple or list + ''' + interrupt_trigger = [True] + max_progress = len(args) + if max_progress == 1: + return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) + max_workers = self.params.get('concurrent_fragment_downloads', 1) + if max_progress > 1: + self._prepare_multiline_status(max_progress) + + def thread_func(idx, ctx, fragments, info_dict, tpe): + ctx['max_progress'] = max_progress + ctx['progress_idx'] = idx + return self.download_and_append_fragments( + ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, + tpe=tpe, interrupt_trigger=interrupt_trigger) + + class FTPE(concurrent.futures.ThreadPoolExecutor): + # has to stop this or it's going to wait on the worker thread itself + def __exit__(self, exc_type, exc_val, exc_tb): + pass + + spins = [] + if compat_os_name == 'nt': + self.report_warning('Ctrl+C does not work on Windows when used with parallel threads. ' + 'This is a known issue and patches are welcome') + for idx, (ctx, fragments, info_dict) in enumerate(args): + tpe = FTPE(math.ceil(max_workers / max_progress)) + job = tpe.submit(thread_func, idx, ctx, fragments, info_dict, tpe) + spins.append((tpe, job)) + + result = True + for tpe, job in spins: + try: + result = result and job.result() + except KeyboardInterrupt: + interrupt_trigger[0] = False + finally: + tpe.shutdown(wait=True) + if not interrupt_trigger[0]: + raise KeyboardInterrupt() + return result + + def download_and_append_fragments( + self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, + tpe=None, interrupt_trigger=None): + if not interrupt_trigger: + interrupt_trigger = (True, ) + fragment_retries = self.params.get('fragment_retries', 0) - is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True) + is_fatal = ( + ((lambda _: False) if info_dict.get('is_live') else (lambda idx: idx == 0)) + if 
self.params.get('skip_unavailable_fragments', True) else (lambda _: True)) + if not pack_func: pack_func = lambda frag_content, _: frag_content def download_fragment(fragment, ctx): frag_index = ctx['fragment_index'] = fragment['frag_index'] - headers = info_dict.get('http_headers', {}) + ctx['last_error'] = None + if not interrupt_trigger[0]: + return False, frag_index + headers = info_dict.get('http_headers', {}).copy() byte_range = fragment.get('byte_range') if byte_range: headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) @@ -351,12 +450,13 @@ def download_fragment(fragment, ctx): if not success: return False, frag_index break - except compat_urllib_error.HTTPError as err: + except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err: # Unavailable (possibly temporary) fragments may be served. # First we try to retry then either skip or abort. # See https://github.com/ytdl-org/youtube-dl/issues/10165, # https://github.com/ytdl-org/youtube-dl/issues/10448). count += 1 + ctx['last_error'] = err if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) except DownloadError: @@ -374,24 +474,10 @@ def download_fragment(fragment, ctx): return False, frag_index return frag_content, frag_index - def decrypt_fragment(fragment, frag_content): - decrypt_info = fragment.get('decrypt_info') - if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': - return frag_content - iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', fragment['media_sequence']) - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( - self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block - # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, - # not what it decrypts to. 
- if self.params.get('test', False): - return frag_content - return AES.new(decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) - def append_fragment(frag_content, frag_index, ctx): if not frag_content: if not is_fatal(frag_index - 1): - self.report_skip_fragment(frag_index) + self.report_skip_fragment(frag_index, 'fragment not found') return True else: ctx['dest_stream'].close() @@ -401,7 +487,10 @@ def append_fragment(frag_content, frag_index, ctx): self._append_fragment(ctx, pack_func(frag_content, frag_index)) return True - max_workers = self.params.get('concurrent_fragment_downloads', 1) + decrypt_fragment = self.decrypter(info_dict) + + max_workers = math.ceil( + self.params.get('concurrent_fragment_downloads', 1) / ctx.get('max_progress', 1)) if can_threaded_download and max_workers > 1: def _download_fragment(fragment): @@ -410,8 +499,10 @@ def _download_fragment(fragment): return fragment, frag_content, frag_index, ctx_copy.get('fragment_filename_sanitized') self.report_warning('The download speed shown is only of one thread. 
This is a known issue and patches are welcome') - with concurrent.futures.ThreadPoolExecutor(max_workers) as pool: + with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: for fragment, frag_content, frag_index, frag_filename in pool.map(_download_fragment, fragments): + if not interrupt_trigger[0]: + break ctx['fragment_filename_sanitized'] = frag_filename ctx['fragment_index'] = frag_index result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) @@ -419,6 +510,8 @@ def _download_fragment(fragment): return False else: for fragment in fragments: + if not interrupt_trigger[0]: + break frag_content, frag_index = download_fragment(fragment, ctx) result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) if not result: diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index 779658b70e..e932fd6aea 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -5,10 +5,11 @@ import binascii from ..downloader import get_suitable_downloader -from .fragment import FragmentFD, can_decrypt_frag +from .fragment import FragmentFD from .external import FFmpegFD from ..compat import ( + compat_pycrypto_AES, compat_urlparse, ) from ..utils import ( @@ -29,7 +30,7 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' @staticmethod - def can_download(manifest, info_dict, allow_unplayable_formats=False, with_crypto=can_decrypt_frag): + def can_download(manifest, info_dict, allow_unplayable_formats=False): UNSUPPORTED_FEATURES = [ # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] @@ -56,9 +57,6 @@ def can_download(manifest, info_dict, allow_unplayable_formats=False, with_crypt def check_results(): yield not info_dict.get('is_live') - is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest - yield with_crypto or not is_aes128_enc - yield not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest) for feature in UNSUPPORTED_FEATURES: yield not re.search(feature, manifest) 
return all(check_results()) @@ -71,16 +69,29 @@ def real_download(self, filename, info_dict): man_url = urlh.geturl() s = urlh.read().decode('utf-8', 'ignore') - if not self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')): - if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'): - self.report_error('pycryptodome not found. Please install') + can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None + if can_download and not compat_pycrypto_AES and '#EXT-X-KEY:METHOD=AES-128' in s: + if FFmpegFD.available(): + can_download, message = False, 'The stream has AES-128 encryption and pycryptodomex is not available' + else: + message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodomex are available; ' + 'Decryption will be performed natively, but will be extremely slow') + if not can_download: + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), s) + if has_drm and not self.params.get('allow_unplayable_formats'): + self.report_error( + 'This video is DRM protected; Try selecting another format with --format or ' + 'add --check-formats to automatically fallback to the next best format') return False - if self.can_download(s, info_dict, with_crypto=True): - self.report_warning('pycryptodome is needed to download this file natively') + message = message or 'Unsupported features have been detected' fd = FFmpegFD(self.ydl, self.params) - self.report_warning( - '%s detected unsupported features; extraction will be delegated to %s' % (self.FD_NAME, fd.get_basename())) + self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}') return fd.real_download(filename, info_dict) + elif message: + self.report_warning(message) is_webvtt = info_dict['ext'] == 'vtt' if is_webvtt: @@ -172,6 +183,7 @@ def is_ad_fragment_end(s): 'byte_range': 
byte_range, 'media_sequence': media_sequence, }) + media_sequence += 1 elif line.startswith('#EXT-X-MAP'): if format_index and discontinuity_count != format_index: @@ -196,6 +208,7 @@ def is_ad_fragment_end(s): 'byte_range': byte_range, 'media_sequence': media_sequence }) + media_sequence += 1 if map_info.get('BYTERANGE'): splitted_byte_range = map_info.get('BYTERANGE').split('@') @@ -235,20 +248,18 @@ def is_ad_fragment_end(s): elif line.startswith('#EXT-X-DISCONTINUITY'): discontinuity_count += 1 i += 1 - media_sequence += 1 # We only download the first fragment during the test if self.params.get('test', False): fragments = [fragments[0] if fragments else None] if real_downloader: - info_copy = info_dict.copy() - info_copy['fragments'] = fragments + info_dict['fragments'] = fragments fd = real_downloader(self.ydl, self.params) # TODO: Make progress updates work without hooking twice # for ph in self._progress_hooks: # fd.add_progress_hook(ph) - return fd.real_download(filename, info_copy) + return fd.real_download(filename, info_dict) if is_webvtt: def pack_fragment(frag_content, frag_index): diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 1edb0f91f6..34a1eb59b6 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -16,7 +16,6 @@ ContentTooShortError, encodeFilename, int_or_none, - sanitize_open, sanitized_Request, ThrottledDownload, write_xattr, @@ -48,8 +47,9 @@ class DownloadContext(dict): is_test = self.params.get('test', False) chunk_size = self._TEST_FILE_SIZE if is_test else ( - info_dict.get('downloader_options', {}).get('http_chunk_size') - or self.params.get('http_chunk_size') or 0) + self.params.get('http_chunk_size') + or info_dict.get('downloader_options', {}).get('http_chunk_size') + or 0) ctx.open_mode = 'wb' ctx.resume_len = 0 @@ -57,6 +57,7 @@ class DownloadContext(dict): ctx.block_size = self.params.get('buffersize', 1024) ctx.start_time = time.time() ctx.chunk_size = None + throttle_start = None if 
self.params.get('continuedl', True): # Establish possible resume length @@ -189,13 +190,16 @@ def establish_connection(): # Unexpected HTTP error raise raise RetryDownload(err) - except socket.error as err: - if err.errno != errno.ECONNRESET: - # Connection reset is no problem, just retry - raise + except socket.timeout as err: raise RetryDownload(err) + except socket.error as err: + if err.errno in (errno.ECONNRESET, errno.ETIMEDOUT): + # Connection reset is no problem, just retry + raise RetryDownload(err) + raise def download(): + nonlocal throttle_start data_len = ctx.data.info().get('Content-length', None) # Range HTTP header may be ignored/unsupported by a webserver @@ -224,7 +228,6 @@ def download(): # measure time over whole while-loop, so slow_down() and best_block_size() work together properly now = None # needed for slow_down() in the first loop run before = start # start measuring - throttle_start = None def retry(e): to_stdout = ctx.tmpfilename == '-' @@ -259,7 +262,7 @@ def retry(e): # Open destination file just in time if ctx.stream is None: try: - ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.stream, ctx.tmpfilename = self.sanitize_open( ctx.tmpfilename, ctx.open_mode) assert ctx.stream is not None ctx.filename = self.undo_temp_name(ctx.tmpfilename) @@ -310,6 +313,7 @@ def retry(e): 'eta': eta, 'speed': speed, 'elapsed': now - ctx.start_time, + 'ctx_id': info_dict.get('ctx_id'), }, info_dict) if data_len is not None and byte_counter == data_len: @@ -324,7 +328,7 @@ def retry(e): if ctx.stream is not None and ctx.tmpfilename != '-': ctx.stream.close() raise ThrottledDownload() - else: + elif speed: throttle_start = None if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: @@ -357,6 +361,7 @@ def retry(e): 'filename': ctx.filename, 'status': 'finished', 'elapsed': time.time() - ctx.start_time, + 'ctx_id': info_dict.get('ctx_id'), }, info_dict) return True @@ -369,6 +374,8 @@ def retry(e): count += 1 if 
count <= retries: self.report_retry(e.source_error, count, retries) + else: + self.to_screen(f'[download] Got server HTTP error: {e.source_error}') continue except NextFragment: continue diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py index b75db18a8a..1477f65a69 100644 --- a/yt_dlp/downloader/mhtml.py +++ b/yt_dlp/downloader/mhtml.py @@ -114,8 +114,8 @@ def real_download(self, filename, info_dict): fragment_base_url = info_dict.get('fragment_base_url') fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] - title = info_dict['title'] - origin = info_dict['webpage_url'] + title = info_dict.get('title', info_dict['format_id']) + origin = info_dict.get('webpage_url', info_dict['url']) ctx = { 'filename': filename, diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 256840d689..521dfece31 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -6,7 +6,7 @@ from .common import FileDownloader from ..downloader import get_suitable_downloader from ..extractor.niconico import NiconicoIE -from ..compat import compat_urllib_request +from ..utils import sanitized_Request class NiconicoDmcFD(FileDownloader): @@ -29,9 +29,11 @@ def real_download(self, filename, info_dict): heartbeat_data = heartbeat_info_dict['data'].encode() heartbeat_interval = heartbeat_info_dict.get('interval', 30) + request = sanitized_Request(heartbeat_url, heartbeat_data) + def heartbeat(): try: - compat_urllib_request.urlopen(url=heartbeat_url, data=heartbeat_data) + self.ydl.urlopen(request).read() except Exception: self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) diff --git a/yt_dlp/downloader/rtmp.py b/yt_dlp/downloader/rtmp.py index 6dca64725d..90f1acfd44 100644 --- a/yt_dlp/downloader/rtmp.py +++ b/yt_dlp/downloader/rtmp.py @@ -12,6 +12,7 @@ encodeFilename, encodeArgument, get_exe_version, + Popen, ) @@ -26,7 +27,7 @@ def run_rtmpdump(args): start = time.time() 
resume_percent = None resume_downloaded_data_len = None - proc = subprocess.Popen(args, stderr=subprocess.PIPE) + proc = Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True proc_stderr_closed = False try: diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py index 2dc6ff954c..ef4205edc7 100644 --- a/yt_dlp/downloader/youtube_live_chat.py +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -183,7 +183,7 @@ def download_and_parse_fragment(url, frag_index, request_data=None, headers=None request_data['currentPlayerState'] = {'playerOffsetMs': str(max(offset - 5000, 0))} if click_tracking_params: request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params} - headers = ie.generate_api_headers(ytcfg, visitor_data=visitor_data) + headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) headers.update({'content-type': 'application/json'}) fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n' success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py index 7d540540e2..b35484246a 100644 --- a/yt_dlp/extractor/__init__.py +++ b/yt_dlp/extractor/__init__.py @@ -1,14 +1,15 @@ -from __future__ import unicode_literals +import os from ..utils import load_plugins -try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - _PLUGIN_CLASSES = [] -except ImportError: - _LAZY_LOADER = False +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True + except ImportError: + pass if not _LAZY_LOADER: from .extractors import * @@ -19,8 +20,8 @@ ] _ALL_CLASSES.append(GenericIE) - _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) - _ALL_CLASSES = _PLUGIN_CLASSES + _ALL_CLASSES 
+_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) +_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES def gen_extractor_classes(): diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 3e202168ed..354453a274 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + dict_get, ExtractorError, js_to_json, int_or_none, @@ -233,8 +234,6 @@ def tokenize_url(url, token): }] is_live = video_params.get('livestream') == '1' - if is_live: - title = self._live_title(title) return { 'id': video_id, @@ -255,3 +254,66 @@ def tokenize_url(url, token): 'subtitles': subtitles, 'is_live': is_live, } + + +class ABCIViewShowSeriesIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview:showseries' + _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/]+)(?:/series/\d+)?$' + _GEO_COUNTRIES = ['AU'] + + _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': '124870-1', + 'title': 'Series 1', + 'description': 'md5:93119346c24a7c322d446d8eece430ff', + 'series': 'Upper Middle Bogan', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$' + }, + 'playlist_count': 8, + }, { + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': 'CO1108V001S00', + 'ext': 'mp4', + 'title': 'Series 1 Ep 1 I\'m A Swan', + 'description': 'md5:7b676758c1de11a30b79b4d301e8da93', + 'series': 'Upper Middle Bogan', + 'uploader_id': 'abc1', + 'upload_date': '20210630', + 'timestamp': 1625036400, + }, + 'params': { + 'noplaylist': True, + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + webpage_data = self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', + webpage, 'initial state') + video_data = self._parse_json( + 
unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id) + video_data = video_data['route']['pageData']['_embedded'] + + if self.get_param('noplaylist') and 'highlightVideo' in video_data: + self.to_screen('Downloading just the highlight video because of --no-playlist') + return self.url_result(video_data['highlightVideo']['shareUrl'], ie=ABCIViewIE.ie_key()) + + self.to_screen(f'Downloading playlist {show_id} - add --no-playlist to just download the highlight video') + series = video_data['selectedSeries'] + return { + '_type': 'playlist', + 'entries': [self.url_result(episode['shareUrl']) + for episode in series['_embedded']['videoEpisodes']], + 'id': series.get('id'), + 'title': dict_get(series, ('title', 'displaySubtitle')), + 'description': series.get('description'), + 'series': dict_get(series, ('showTitle', 'displayTitle')), + 'season': dict_get(series, ('title', 'displaySubtitle')), + 'thumbnail': series.get('thumbnail'), + } diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index a55ebbcbd6..5a1283baa5 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -15,6 +15,7 @@ compat_ord, ) from ..utils import ( + ass_subtitles_timecode, bytes_to_intlist, bytes_to_long, ExtractorError, @@ -68,10 +69,6 @@ class ADNIE(InfoExtractor): 'end': 4, } - @staticmethod - def _ass_subtitles_timecode(seconds): - return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) - def _get_subtitles(self, sub_url, video_id): if not sub_url: return None @@ -117,8 +114,8 @@ def _get_subtitles(self, sub_url, video_id): continue alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0) ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % ( - self._ass_subtitles_timecode(start), - self._ass_subtitles_timecode(end), + ass_subtitles_timecode(start), + ass_subtitles_timecode(end), '{\\a%d}' % alignment if alignment != 2 else '', 
text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}')) diff --git a/yt_dlp/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py index 728549eb92..e688dddcbb 100644 --- a/yt_dlp/extractor/adobeconnect.py +++ b/yt_dlp/extractor/adobeconnect.py @@ -31,7 +31,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'is_live': is_live, } diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index ffab332941..bebcafa6b7 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -37,6 +37,11 @@ 'username_field': 'email', 'password_field': 'loginpassword', }, + 'RCN': { + 'name': 'RCN', + 'username_field': 'username', + 'password_field': 'password', + }, 'Rogers': { 'name': 'Rogers', 'username_field': 'UserName', diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py index 12b8192060..3cfa1ff550 100644 --- a/yt_dlp/extractor/adobetv.py +++ b/yt_dlp/extractor/adobetv.py @@ -9,6 +9,7 @@ float_or_none, int_or_none, ISO639Utils, + join_nonempty, OnDemandPagedList, parse_duration, str_or_none, @@ -263,7 +264,7 @@ def _real_extract(self, url): continue formats.append({ 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), - 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'format_id': join_nonempty(source.get('format'), source.get('label')), 'height': int_or_none(source.get('height') or None), 'tbr': int_or_none(source.get('bitrate') or None), 'width': int_or_none(source.get('width') or None), diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 648f1122dc..063872b4f5 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -6,9 +6,11 @@ from .common import InfoExtractor from ..compat import compat_xpath from ..utils import ( + date_from_str, determine_ext, ExtractorError, int_or_none, + 
unified_strdate, url_or_none, urlencode_postdata, xpath_text, @@ -237,6 +239,7 @@ def _real_extract(self, url): r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) partial_view = False + adult_view = False for _ in range(2): query = { 'nTitleNo': video_id, @@ -245,6 +248,8 @@ def _real_extract(self, url): } if partial_view: query['partialView'] = 'SKIP_ADULT' + if adult_view: + query['adultView'] = 'ADULT_VIEW' video_xml = self._download_xml( 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', video_id, 'Downloading video info XML%s' @@ -264,6 +269,9 @@ def _real_extract(self, url): partial_view = True continue elif flag == 'ADULT': + if not adult_view: + adult_view = True + continue error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' else: error = flag @@ -309,8 +317,15 @@ def _real_extract(self, url): if not file_url: continue key = file_element.get('key', '') - upload_date = self._search_regex( - r'^(\d{8})_', key, 'upload date', default=None) + upload_date = unified_strdate(self._search_regex( + r'^(\d{8})_', key, 'upload date', default=None)) + if upload_date is not None: + # sometimes the upload date isn't included in the file name + # instead, another random ID is, which may parse as a valid + # date but be wildly out of a reasonable range + parsed_date = date_from_str(upload_date) + if parsed_date.year < 2000 or parsed_date.year >= 2100: + upload_date = None file_duration = int_or_none(file_element.get('duration')) format_id = key if key else '%s_%s' % (video_id, file_num) if determine_ext(file_url) == 'm3u8': diff --git a/yt_dlp/extractor/aljazeera.py b/yt_dlp/extractor/aljazeera.py index e829b45e47..7bcdb7afba 100644 --- a/yt_dlp/extractor/aljazeera.py +++ b/yt_dlp/extractor/aljazeera.py @@ -1,55 +1,86 @@ +# coding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor +from ..utils import ( + try_get, +) class 
AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana', 'info_dict': { - 'id': '3792260579001', + 'id': '6280641530001', 'ext': 'mp4', - 'title': 'The Slum - Episode 1: Deliverance', - 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader_id': '665003303001', - 'timestamp': 1411116829, - 'upload_date': '20140919', + 'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana', + 'timestamp': 1636219149, + 'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.', + 'upload_date': '20211106', + } + }, { + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu', + 'info_dict': { + 'id': '6280654936001', + 'ext': 'mp4', + 'title': 'Đoković ušao u finale Mastersa u Parizu', + 'timestamp': 1636221686, + 'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.', + 'upload_date': '20211106', }, - 'add_ie': ['BrightcoveNew'], - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', - 'only_matching': True, - }, { - 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', - 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 
'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)' def _real_extract(self, url): - post_type, name = self._match_valid_url(url).groups() + base, post_type, id = self._match_valid_url(url).groups() + wp = { + 'balkans.aljazeera.net': 'ajb', + 'chinese.aljazeera.net': 'chinese', + 'mubasher.aljazeera.net': 'ajm', + }.get(base) or 'aje' post_type = { 'features': 'post', 'program': 'episode', + 'programs': 'episode', 'videos': 'video', + 'news': 'news', }[post_type.split('/')[0]] video = self._download_json( - 'https://www.aljazeera.com/graphql', name, query={ + f'https://{base}/graphql', id, query={ + 'wp-site': wp, 'operationName': 'ArchipelagoSingleArticleQuery', 'variables': json.dumps({ - 'name': name, + 'name': id, 'postType': post_type, }), }, headers={ - 'wp-site': 'aje', - })['data']['article']['video'] - video_id = video['id'] - account_id = video.get('accountId') or '665003303001' - player_id = video.get('playerId') or 'BkeSH5BDb' - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), - 'BrightcoveNew', video_id) + 'wp-site': wp, + }) + video = try_get(video, lambda x: x['data']['article']['video']) or {} + video_id = video.get('id') + account = video.get('accountId') or '911432371001' + player_id = video.get('playerId') or 'csvTfAlKW' + embed = 'default' + + if video_id is None: + webpage = self._download_webpage(url, id) + + account, player_id, embed, video_id = self._search_regex(self.BRIGHTCOVE_URL_RE, webpage, 'video id', + group=(1, 2, 3, 4), default=(None, None, None, None)) + + if video_id is None: + return { + '_type': 'url_transparent', + 'url': url, + 'ie_key': 'Generic' + } + + return { + '_type': 'url_transparent', + 'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}', + 'ie_key': 
'BrightcoveNew' + } diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py new file mode 100644 index 0000000000..07b1b18611 --- /dev/null +++ b/yt_dlp/extractor/amazon.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none + + +class AmazonStoreIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)' + + _TESTS = [{ + 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', + 'info_dict': { + 'id': 'B098XNCHLD', + 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + }, + 'playlist_mincount': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'A1F83G8C2ARO7P', + 'ext': 'mp4', + 'title': 'mcdodo usb c cable 100W 5a', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + }, { + 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', + 'info_dict': { + 'id': 'B0863TXGM3', + 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://www.amazon.com/dp/B0845NXCXF/', + 'info_dict': { + 'id': 'B0845NXCXF', + 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + }, + 'playlist-mincount': 1, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + entries = [{ + 'id': video['marketPlaceID'], + 'url': video['url'], + 'title': video.get('title'), + 'thumbnail': video.get('thumbUrl') or video.get('thumb'), + 'duration': video.get('durationSeconds'), + 'height': int_or_none(video.get('videoHeight')), + 'width': int_or_none(video.get('videoWidth')), + } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) diff --git a/yt_dlp/extractor/animeondemand.py 
b/yt_dlp/extractor/animeondemand.py index 54e097d2f7..5694f72400 100644 --- a/yt_dlp/extractor/animeondemand.py +++ b/yt_dlp/extractor/animeondemand.py @@ -8,6 +8,7 @@ determine_ext, extract_attributes, ExtractorError, + join_nonempty, url_or_none, urlencode_postdata, urljoin, @@ -140,15 +141,8 @@ def extract_info(html, video_id, num=None): kind = self._search_regex( r'videomaterialurl/\d+/([^/]+)/', playlist_url, 'media kind', default=None) - format_id_list = [] - if lang: - format_id_list.append(lang) - if kind: - format_id_list.append(kind) - if not format_id_list and num is not None: - format_id_list.append(compat_str(num)) - format_id = '-'.join(format_id_list) - format_note = ', '.join(filter(None, (kind, lang_note))) + format_id = join_nonempty(lang, kind) if lang or kind else str(num) + format_note = join_nonempty(kind, lang_note, delim=', ') item_id_list = [] if format_id: item_id_list.append(format_id) @@ -195,12 +189,10 @@ def extract_info(html, video_id, num=None): if not file_: continue ext = determine_ext(file_) - format_id_list = [lang, kind] - if ext == 'm3u8': - format_id_list.append('hls') - elif source.get('type') == 'video/dash' or ext == 'mpd': - format_id_list.append('dash') - format_id = '-'.join(filter(None, format_id_list)) + format_id = join_nonempty( + lang, kind, + 'hls' if ext == 'm3u8' else None, + 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) if ext == 'm3u8': file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index d688e2c5bc..0d444fc33e 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -16,6 +16,7 @@ determine_ext, intlist_to_bytes, int_or_none, + join_nonempty, strip_jsonp, unescapeHTML, unsmuggle_url, @@ -303,13 +304,13 @@ def _get_anvato_videos(self, access_key, video_id): tbr = int_or_none(published_url.get('kbps')) a_format = { 'url': video_url, - 'format_id': ('-'.join(filter(None, ['http', 
published_url.get('cdn_name')]))).lower(), - 'tbr': tbr if tbr != 0 else None, + 'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(), + 'tbr': tbr or None, } if media_format == 'm3u8' and tbr is not None: a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index d90fcb13aa..467fe48752 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -3,33 +3,36 @@ import re import json - from .common import InfoExtractor -from .youtube import YoutubeIE +from .youtube import YoutubeIE, YoutubeBaseInfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_HTTPError ) from ..utils import ( + bug_reports_message, clean_html, - determine_ext, dict_get, extract_attributes, ExtractorError, + get_element_by_id, HEADRequest, int_or_none, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, + orderedSet, parse_duration, parse_qs, - RegexNotFoundError, str_to_int, str_or_none, + traverse_obj, try_get, unified_strdate, unified_timestamp, + urlhandle_detect_ext, + url_or_none ) @@ -262,12 +265,12 @@ class YoutubeWebArchiveIE(InfoExtractor): _VALID_URL = r"""(?x)^ (?:https?://)?web\.archive\.org/ (?:web/)? - (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional + (?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional (?:https?(?::|%3[Aa])//)? 
(?: - (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL - |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url + (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL + |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url ) (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$) """ @@ -278,141 +281,391 @@ class YoutubeWebArchiveIE(InfoExtractor): 'info_dict': { 'id': 'aYAGB11YrSs', 'ext': 'webm', - 'title': 'Team Fortress 2 - Sandviches!' + 'title': 'Team Fortress 2 - Sandviches!', + 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf', + 'upload_date': '20110926', + 'uploader': 'Zeurel', + 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', + 'duration': 32, + 'uploader_id': 'Zeurel', + 'uploader_url': 'http://www.youtube.com/user/Zeurel' } - }, - { + }, { # Internal link 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0', 'info_dict': { 'id': '97t7Xj_iBv0', 'ext': 'mp4', - 'title': 'How Flexible Machines Could Save The World' + 'title': 'Why Machines That Bend Are Better', + 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c', + 'upload_date': '20190312', + 'uploader': 'Veritasium', + 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', + 'duration': 771, + 'uploader_id': '1veritasium', + 'uploader_url': 'http://www.youtube.com/user/1veritasium' } - }, - { - # Video from 2012, webm format itag 45. + }, { + # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. + # Should use the date in the link. Title ends with '- Youtube'. 
Capture has description in eow-description 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en', 'info_dict': { 'id': 'AkhihxRKcrs', 'ext': 'webm', - 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)' + 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)', + 'upload_date': '20120712', + 'duration': 398, + 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', + 'uploader_id': 'machinima', + 'uploader_url': 'http://www.youtube.com/user/machinima' } - }, - { - # Old flash-only video. Webpage title starts with "YouTube - ". + }, { + # FLV video. Video file URL does not provide itag information 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw', 'info_dict': { 'id': 'jNQXAC9IVRw', - 'ext': 'unknown_video', - 'title': 'Me at the zoo' + 'ext': 'flv', + 'title': 'Me at the zoo', + 'upload_date': '20050423', + 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A', + 'duration': 19, + 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', + 'uploader_id': 'jawed', + 'uploader_url': 'http://www.youtube.com/user/jawed' } - }, - { - # Flash video with .flv extension (itag 34). Title has prefix "YouTube -" - # Title has some weird unicode characters too. + }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'info_dict': { 'id': 'lTx3G6h2xyA', 'ext': 'flv', - 'title': '‪Madeon - Pop Culture (live mashup)‬‏' + 'title': 'Madeon - Pop Culture (live mashup)', + 'upload_date': '20110711', + 'uploader': 'Madeon', + 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w', + 'duration': 204, + 'description': 'md5:f7535343b6eda34a314eff8b85444680', + 'uploader_id': 'itsmadeon', + 'uploader_url': 'http://www.youtube.com/user/itsmadeon' } - }, - { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js). + }, { + # First capture is of dead video, second is the oldest from CDX response. 
+ 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E', + 'info_dict': { + 'id': '1JYutPM8O6E', + 'ext': 'mp4', + 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', + 'upload_date': '20160218', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 1236, + 'description': 'md5:21032bae736421e89c2edf36d1936947', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + } + }, { + # First capture of dead video, capture date in link links to dead capture. + 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E', + 'info_dict': { + 'id': '6FPhZJGvf4E', + 'ext': 'mp4', + 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.', + 'upload_date': '20160219', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 798, + 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + }, + 'expected_warnings': [ + r'unable to download capture webpage \(it may not be archived\)' + ] + }, { # Very old YouTube page, has - YouTube in title. + 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg', + 'info_dict': { + 'id': '-06-KB9XTzg', + 'ext': 'flv', + 'title': 'New Coin Hack!! 100% Safe!!' 
+ } + }, { + 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8', + 'info_dict': { + 'id': 'dWW7qP423y8', + 'ext': 'mp4', + 'title': 'It\'s Bootleg AirPods Time.', + 'upload_date': '20211021', + 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'duration': 810, + 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'uploader': 'DankPods', + 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' + } + }, { + # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 + 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4', + 'info_dict': { + 'id': '6Dh-RL__uN4', + 'ext': 'mp4', + 'title': 'bitch lasagna', + 'upload_date': '20181005', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'duration': 135, + 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', + 'uploader': 'PewDiePie', + 'uploader_id': 'PewDiePie', + 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + } + }, { 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', - 'info_dict': { - 'id': 'kH-G_aIBlFw', - 'ext': 'mp4', - 'title': 'kH-G_aIBlFw' - }, - 'expected_warnings': [ - 'unable to extract title', - ] - }, - { - # First capture is a 302 redirect intermediary page. 
- 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M', - 'info_dict': { - 'id': '0altSZ96U4M', - 'ext': 'mp4', - 'title': '0altSZ96U4M' - }, - 'expected_warnings': [ - 'unable to extract title', - ] - }, - { + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M', + 'only_matching': True + }, { # Video not archived, only capture is unavailable video page 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10', - 'only_matching': True, - }, - { # Encoded url + 'only_matching': True + }, { # Encoded url 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den', - 'only_matching': True, - }, - { + 'only_matching': True + }, { 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den', - 'only_matching': True, + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&search=soccer', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg', + 'only_matching': True } ] + _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE + _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + + _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers + _YT_ALL_THUMB_SERVERS = orderedSet( + _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or 
""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]]) + + _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/' + _OLDEST_CAPTURE_DATE = 20050214000000 + _NEWEST_CAPTURE_DATE = 20500101000000 + + def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'): + # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md + query = { + 'url': url, + 'output': 'json', + 'fl': 'original,mimetype,length,timestamp', + 'limit': 500, + 'filter': ['statuscode:200'] + (filters or []), + 'collapse': collapse or [], + **(query or {}) + } + res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query) + if isinstance(res, list) and len(res) >= 2: + # format response to make it easier to use + return list(dict(zip(res[0], v)) for v in res[1:]) + elif not isinstance(res, list) or len(res) != 0: + self.report_warning('Error while parsing CDX API response' + bug_reports_message()) + + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) + + def _extract_webpage_title(self, webpage): + page_title = self._html_search_regex( + r'<title>([^<]*)', webpage, 'title', default='') + # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. 
+ return self._html_search_regex( + r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', + page_title, 'title', default='') + + def _extract_metadata(self, video_id, webpage): + + search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + + initial_data_video = traverse_obj( + initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), + expected_type=dict, get_all=False, default={}) + + video_details = traverse_obj( + player_response, 'videoDetails', expected_type=dict, get_all=False, default={}) + + microformats = traverse_obj( + player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={}) + + video_title = ( + video_details.get('title') + or YoutubeBaseInfoExtractor._get_text(microformats, 'title') + or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or self._extract_webpage_title(webpage) + or search_meta(['og:title', 'twitter:title', 'title'])) + + channel_id = str_or_none( + video_details.get('channelId') + or microformats.get('externalChannelId') + or search_meta('channelId') + or self._search_regex( + r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', # @b45a9e6 + webpage, 'channel id', default=None, group='id')) + channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + + duration = int_or_none( + video_details.get('lengthSeconds') + or microformats.get('lengthSeconds') + or parse_duration(search_meta('duration'))) + description = ( + video_details.get('shortDescription') + or YoutubeBaseInfoExtractor._get_text(microformats, 'description') + or 
clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 + or search_meta(['description', 'og:description', 'twitter:description'])) + + uploader = video_details.get('author') + + # Uploader ID and URL + uploader_mobj = re.search( + r'', # @fd05024 + webpage) + if uploader_mobj is not None: + uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') + else: + # @a6211d2 + uploader_url = url_or_none(microformats.get('ownerProfileUrl')) + uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) + + upload_date = unified_strdate( + dict_get(microformats, ('uploadDate', 'publishDate')) + or search_meta(['uploadDate', 'datePublished']) + or self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + webpage, 'upload date', default=None)) + + return { + 'title': video_title, + 'description': description, + 'upload_date': upload_date, + 'uploader': uploader, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'duration': duration, + 'uploader_url': uploader_url, + 'uploader_id': uploader_id, + } + + def _extract_thumbnails(self, video_id): + try_all = 'thumbnails' in self._configuration_arg('check_all') + thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format( + webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server) + for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))] + + thumbnails = [] + for url in thumbnail_base_urls: + response = self._call_cdx_api( + video_id, url, filters=['mimetype:image/(?:webp|jpeg)'], + collapse=['urlkey'], query={'matchType': 'prefix'}) + if not response: + continue + thumbnails.extend( + { + 'url': (self._WAYBACK_BASE_URL % 
(int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'), + 'filesize': int_or_none(thumbnail_dict.get('length')), + 'preference': int_or_none(thumbnail_dict.get('length')) + } for thumbnail_dict in response) + if not try_all: + break + + self._remove_duplicate_formats(thumbnails) + return thumbnails + + def _get_capture_dates(self, video_id, url_date): + capture_dates = [] + # Note: CDX API will not find watch pages with extra params in the url. + response = self._call_cdx_api( + video_id, f'https://www.youtube.com/watch?v={video_id}', + filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or [] + all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + + # Prefer the new polymer UI captures as we support extracting more metadata from them + # WBM captures seem to all switch to this layout ~July 2020 + modern_captures = list(filter(lambda x: x >= 20200701000000, all_captures)) + if modern_captures: + capture_dates.append(modern_captures[0]) + capture_dates.append(url_date) + if all_captures: + capture_dates.append(all_captures[0]) + + if 'captures' in self._configuration_arg('check_all'): + capture_dates.extend(modern_captures + all_captures) + + # Fallbacks if any of the above fail + capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]) + return orderedSet(capture_dates) def _real_extract(self, url): - video_id = self._match_id(url) - title = video_id # if we are not able get a title - def _extract_title(webpage): - page_title = self._html_search_regex( - r'([^<]*)', webpage, 'title', fatal=False) or '' - # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix. 
- try: - page_title = self._html_search_regex( - r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', - page_title, 'title', default='') - except RegexNotFoundError: - page_title = None + url_date, video_id = self._match_valid_url(url).groups() - if not page_title: - self.report_warning('unable to extract title', video_id=video_id) - return - return page_title - - # If the video is no longer available, the oldest capture may be one before it was removed. - # Setting the capture date in url to early date seems to redirect to earliest capture. - webpage = self._download_webpage( - 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id, - video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).') - if webpage: - title = _extract_title(webpage) or title - - # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655 - internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id + urlh = None try: - video_file_webpage = self._request_webpage( - HEADRequest(internal_fake_url), video_id, - note='Fetching video file url', expected_status=True) + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - raise ExtractorError( - 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' 
% e.cause.code, + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org', expected=True) - raise - video_file_url = compat_urllib_parse_unquote(video_file_webpage.url) - video_file_url_qs = parse_qs(video_file_url) + else: + raise - # Attempt to recover any ext & format info from playback url - format = {'url': video_file_url} - itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) - if itag and itag in YoutubeIE._formats: # Naughty access but it works - format.update(YoutubeIE._formats[itag]) - format.update({'format_id': itag}) - else: - mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) - ext = mimetype2ext(mime) or determine_ext(video_file_url) - format.update({'ext': ext}) - return { - 'id': video_id, - 'title': title, - 'formats': [format], - 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) - } + capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) + self.write_debug('Captures to try: ' + ', '.join(str(i) for i in capture_dates if i is not None)) + info = {'id': video_id} + for capture in capture_dates: + if not capture: + continue + webpage = self._download_webpage( + (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id), + video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)', + note='Downloading capture webpage') + current_info = self._extract_metadata(video_id, webpage or '') + # Try avoid getting deleted video metadata + if current_info.get('title'): + info = merge_dicts(info, current_info) + if 'captures' not in self._configuration_arg('check_all'): + break + + info['thumbnails'] = self._extract_thumbnails(video_id) + + if urlh: + url = compat_urllib_parse_unquote(urlh.url) + video_file_url_qs = parse_qs(url) + # Attempt to recover any ext & format info from playback url & response headers + format = {'url': url, 'filesize': 
int_or_none(urlh.headers.get('x-archive-orig-content-length'))} + itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) + if itag and itag in YoutubeIE._formats: + format.update(YoutubeIE._formats[itag]) + format.update({'format_id': itag}) + else: + mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) + ext = (mimetype2ext(mime) + or urlhandle_detect_ext(urlh) + or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type'))) + format.update({'ext': ext}) + info['formats'] = [format] + if not info.get('duration'): + info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + + if not info.get('title'): + info['title'] = video_id + return info diff --git a/yt_dlp/extractor/arcpublishing.py b/yt_dlp/extractor/arcpublishing.py index 5a9b8181a5..1943fd5f83 100644 --- a/yt_dlp/extractor/arcpublishing.py +++ b/yt_dlp/extractor/arcpublishing.py @@ -158,7 +158,7 @@ def _real_extract(self, url): return { 'id': uuid, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), 'description': try_get(video, lambda x: x['subheadlines']['basic']), 'formats': formats, diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 048d30f27d..1aff0361c1 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -280,7 +280,7 @@ def _real_extract(self, url): info.update({ 'id': video_id, - 'title': self._live_title(title) if info.get('is_live') else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, }) @@ -388,7 +388,13 @@ def _real_extract(self, url): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?Pplayer|live|video|sendung|sammlung)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:(?P[^/]+)/)? + (?:player|live|video|(?Psendung|sammlung))/ + (?:(?P[^?#]+)/)? 
+ (?P(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)''' + _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -403,6 +409,18 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20200805', 'ext': 'mp4', }, + 'skip': 'Error', + }, { + 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'info_dict': { + 'id': '10049223', + 'ext': 'mp4', + 'title': 'tagesschau, 20:00 Uhr', + 'timestamp': 1636398000, + 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', + 'upload_date': '20211108', + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -426,6 +444,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): @@ -525,20 +549,12 @@ def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode): return self.playlist_result(entries, playlist_title=display_id) def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - display_id 
= mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id, display_id, playlist_type, client = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client') + display_id, client = display_id or video_id, client or 'ard' - if mobj.group('mode') in ('sendung', 'sammlung'): - # this is a playlist-URL - return self._ARD_extract_playlist( - url, video_id, display_id, - mobj.group('client'), - mobj.group('mode')) + if playlist_type: + return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', @@ -574,7 +590,7 @@ def _real_extract(self, url): } } } -}''' % (mobj.group('client'), video_id), +}''' % (client, video_id), }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index ed245b75fd..296b169d2a 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -174,7 +174,7 @@ def _real_extract(self, url): return { 'id': player_info.get('VID') or video_id, 'title': title, - 'description': player_info.get('VDE'), + 'description': player_info.get('VDE') or player_info.get('V7T'), 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), 'formats': formats, diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 8143eb4d75..6d843966aa 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'This video is only available for registered users' }, { diff --git a/yt_dlp/extractor/atvat.py b/yt_dlp/extractor/atvat.py index 95e572d70c..7c30cfcbb9 100644 --- 
a/yt_dlp/extractor/atvat.py +++ b/yt_dlp/extractor/atvat.py @@ -1,75 +1,106 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime + from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - unescapeHTML, + float_or_none, + jwt_encode_hs256, + try_get, ) class ATVAtIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P[dv]\d+)' + _VALID_URL = r'https?://(?:www\.)?atv\.at/tv/(?:[^/]+/){2,3}(?P.*)' + _TESTS = [{ - 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/', - 'md5': 'c3b6b975fb3150fc628572939df205f2', + 'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/bauer-sucht-frau/bauer-sucht-frau-staffel-18-folge-3-die-hofwochen', + 'md5': '3c3b4aaca9f63e32b35e04a9c2515903', 'info_dict': { - 'id': '1698447', + 'id': 'v-ce9cgn1e70n5-1', 'ext': 'mp4', - 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1', + 'title': 'Bauer sucht Frau - Staffel 18 Folge 3 - Die Hofwochen', } }, { - 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/', + 'url': 'https://www.atv.at/tv/bauer-sucht-frau/staffel-18/episode-01/bauer-sucht-frau-staffel-18-vorstellungsfolge-1', 'only_matching': True, }] + # extracted from bootstrap.js function (search for e.encryption_key and use your browser's debugger) + _ACCESS_ID = 'x_atv' + _ENCRYPTION_KEY = 'Hohnaekeishoogh2omaeghooquooshia' + + def _extract_video_info(self, url, content, video): + clip_id = content.get('splitId', content['id']) + formats = [] + clip_urls = video['urls'] + for protocol, variant in clip_urls.items(): + source_url = try_get(variant, lambda x: x['clear']['url']) + if not source_url: + continue + if protocol == 'dash': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id=protocol, fatal=False)) + elif protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id=protocol, fatal=False)) + else: + formats.append({ + 'url': source_url, + 'format_id': protocol, + }) + 
self._sort_formats(formats) + + return { + 'id': clip_id, + 'title': content.get('title'), + 'duration': float_or_none(content.get('duration')), + 'series': content.get('tvShowTitle'), + 'formats': formats, + } + def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(unescapeHTML(self._search_regex( - [r'flashPlayerOptions\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P[^"]+)"'], - webpage, 'player data', group='json')), - display_id)['config']['initial_video'] + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._parse_json( + self._search_regex(r'', webpage, 'json_data'), + video_id=video_id) - video_id = video_data['id'] - video_title = video_data['title'] + video_title = json_data['views']['default']['page']['title'] + contentResource = json_data['views']['default']['page']['contentResource'] + content_id = contentResource[0]['id'] + content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']} + for id, content in enumerate(contentResource)] - parts = [] - for part in video_data.get('parts', []): - part_id = part['id'] - part_title = part['title'] - - formats = [] - for source in part.get('sources', []): - source_url = source.get('src') - if not source_url: - continue - ext = determine_ext(source_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, part_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': source.get('delivery'), - 'url': source_url, - }) - self._sort_formats(formats) - - parts.append({ - 'id': part_id, - 'title': part_title, - 'thumbnail': part.get('preview_image_url'), - 'duration': int_or_none(part.get('duration')), - 'is_live': part.get('is_livestream'), - 'formats': formats, + time_of_request = datetime.datetime.now() + not_before = 
time_of_request - datetime.timedelta(minutes=5) + expire = time_of_request + datetime.timedelta(minutes=5) + payload = { + 'content_ids': { + content_id: content_ids, + }, + 'secure_delivery': True, + 'iat': int(time_of_request.timestamp()), + 'nbf': int(not_before.timestamp()), + 'exp': int(expire.timestamp()), + } + jwt_token = jwt_encode_hs256(payload, self._ENCRYPTION_KEY, headers={'kid': self._ACCESS_ID}) + videos = self._download_json( + 'https://vas-v4.p7s1video.net/4.0/getsources', + content_id, 'Downloading videos JSON', query={ + 'token': jwt_token.decode('utf-8') }) + video_id, videos_data = list(videos['data'].items())[0] + entries = [ + self._extract_video_info(url, contentResource[video['id']], video) + for video in videos_data] + return { '_type': 'multi_video', 'id': video_id, 'title': video_title, - 'entries': parts, + 'entries': entries, } diff --git a/yt_dlp/extractor/audiomack.py b/yt_dlp/extractor/audiomack.py index cc7771354c..31fb859ae3 100644 --- a/yt_dlp/extractor/audiomack.py +++ b/yt_dlp/extractor/audiomack.py @@ -14,7 +14,7 @@ class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -39,15 +39,16 @@ class AudiomackIE(InfoExtractor): 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', 'uploader': 'ILOVEMAKONNEN', 'upload_date': '20160414', - } + }, + 'skip': 'Song has been removed from the site', }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. 
Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +74,13 @@ def _real_extract(self, url): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +96,27 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )', + 'id': '837576', + 'ext': 'mp3', + 'uploader': 'Lil Herb a.k.a. G Herbo', + } + }, { + 'info_dict': { + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], - 'params': { - 'playliststart': 9, - 'playlistend': 9, - } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. 
Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +138,7 @@ def _real_extract(self, url): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/yt_dlp/extractor/awaan.py b/yt_dlp/extractor/awaan.py index 22cc10d988..b5d1b57af2 100644 --- a/yt_dlp/extractor/awaan.py +++ b/yt_dlp/extractor/awaan.py @@ -41,7 +41,7 @@ def _parse_video_data(self, video_data, video_id, is_live): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, 'duration': int_or_none(video_data.get('duration')), diff --git a/yt_dlp/extractor/bandaichannel.py b/yt_dlp/extractor/bandaichannel.py index d672859132..f1bcdef7a3 100644 --- a/yt_dlp/extractor/bandaichannel.py +++ b/yt_dlp/extractor/bandaichannel.py @@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE): 'duration': 1387.733, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }] diff --git a/yt_dlp/extractor/bannedvideo.py b/yt_dlp/extractor/bannedvideo.py index 8f8f5ef5f2..3db1151f6d 100644 --- a/yt_dlp/extractor/bannedvideo.py +++ b/yt_dlp/extractor/bannedvideo.py @@ -97,21 +97,16 @@ def _call_api(self, 
video_id, id, operation, note): 'query': self._GRAPHQL_QUERIES[operation] }).encode('utf8')).get('data') - def _extract_comments(self, video_id, comments, comment_data): + def _get_comments(self, video_id, comments, comment_data): + yield from comments for comment in comment_data.copy(): comment_id = comment.get('_id') if comment.get('replyCount') > 0: reply_json = self._call_api( video_id, comment_id, 'GetCommentReplies', f'Downloading replies for comment {comment_id}') - comments.extend( - self._parse_comment(reply, comment_id) - for reply in reply_json.get('getCommentReplies')) - - return { - 'comments': comments, - 'comment_count': len(comments), - } + for reply in reply_json.get('getCommentReplies'): + yield self._parse_comment(reply, comment_id) @staticmethod def _parse_comment(comment_data, parent): @@ -159,7 +154,5 @@ def _real_extract(self, url): 'tags': [tag.get('name') for tag in video_info.get('tags')], 'availability': self._availability(is_unlisted=video_info.get('unlisted')), 'comments': comments, - '__post_extractor': ( - (lambda: self._extract_comments(video_id, comments, video_json.get('getVideoComments'))) - if self.get_param('getcomments') else None) + '__post_extractor': self.extract_comments(video_id, comments, video_json.get('getVideoComments')) } diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index de497ab1d3..85ab478a65 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -451,9 +451,10 @@ def _download_playlist(self, playlist_id): playlist = self._download_json( 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, playlist_id, 'Downloading playlist JSON') + formats = [] + subtitles = {} - version = playlist.get('defaultAvailableVersion') - if version: + for version in playlist.get('allAvailableVersions', []): smp_config = version['smpConfig'] title = smp_config['title'] description = smp_config['summary'] @@ -463,8 +464,17 @@ def _download_playlist(self, playlist_id): continue programme_id = 
item.get('vpid') duration = int_or_none(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles + version_formats, version_subtitles = self._download_media_selector(programme_id) + types = version['types'] + for f in version_formats: + f['format_note'] = ', '.join(types) + if any('AudioDescribed' in x for x in types): + f['language_preference'] = -10 + formats += version_formats + for tag, subformats in (version_subtitles or {}).items(): + subtitles.setdefault(tag, []).extend(subformats) + + return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): raise @@ -588,8 +598,8 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' _MEDIA_SETS = [ - 'mobile-tablet-main', 'pc', + 'mobile-tablet-main', ] _TESTS = [{ diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 8aab6a01b4..2cb01ff83b 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1,16 +1,13 @@ # coding: utf-8 -from __future__ import unicode_literals import hashlib import itertools -import json import functools import re import math from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( - compat_str, compat_parse_qs, compat_urlparse, compat_urllib_parse_urlparse @@ -20,13 +17,17 @@ int_or_none, float_or_none, parse_iso8601, + traverse_obj, try_get, + parse_count, smuggle_url, + srt_subtitles_timecode, str_or_none, strip_jsonp, unified_timestamp, unsmuggle_url, urlencode_postdata, + url_or_none, OnDemandPagedList ) @@ -99,7 +100,7 @@ class BiliBiliIE(InfoExtractor): 'upload_date': '20170301', }, 'params': { - 'skip_download': True, # Test metadata only + 'skip_download': True, }, }, { 'info_dict': { @@ -113,7 +114,7 @@ class BiliBiliIE(InfoExtractor): 'upload_date': 
'20170301', }, 'params': { - 'skip_download': True, # Test metadata only + 'skip_download': True, }, }] }, { @@ -167,7 +168,7 @@ def _real_extract(self, url): if 'anime/' not in url: cid = self._search_regex( - r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + compat_str(page_id), webpage, 'cid', + r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid', default=None ) or self._search_regex( r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', @@ -257,7 +258,7 @@ def _real_extract(self, url): # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video part_title = try_get( self._download_json( - "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id, + f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', video_id, note='Extracting videos in anthology'), lambda x: x['data'][int(page_id) - 1]['part']) title = part_title or title @@ -271,7 +272,7 @@ def _real_extract(self, url): # TODO 'view_count' requires deobfuscating Javascript info = { - 'id': compat_str(video_id) if page_id is None else '%s_p%s' % (video_id, page_id), + 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id), 'cid': cid, 'title': title, 'description': description, @@ -293,29 +294,25 @@ def _real_extract(self, url): info['uploader'] = self._html_search_meta( 'author', webpage, 'uploader', default=None) - raw_danmaku = self._get_raw_danmaku(video_id, cid) - - raw_tags = self._get_tags(video_id) - tags = list(map(lambda x: x['tag_name'], raw_tags)) - top_level_info = { - 'raw_danmaku': raw_danmaku, - 'tags': tags, - 'raw_tags': raw_tags, + 'tags': traverse_obj(self._download_json( + f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}', + video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), } - if self.get_param('getcomments', False): - def get_comments(): - comments = self._get_all_comment_pages(video_id) - return { - 'comments': comments, - 
'comment_count': len(comments) - } - top_level_info['__post_extractor'] = get_comments + entries[0]['subtitles'] = { + 'danmaku': [{ + 'ext': 'xml', + 'url': f'https://comment.bilibili.com/{cid}.xml', + }] + } - ''' + r''' # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 # See https://github.com/animelover1984/youtube-dl + + raw_danmaku = self._download_webpage( + f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments') danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576) entries[0]['subtitles'] = { 'danmaku': [{ @@ -325,40 +322,39 @@ def get_comments(): } ''' + top_level_info['__post_extractor'] = self.extract_comments(video_id) + for entry in entries: entry.update(info) if len(entries) == 1: entries[0].update(top_level_info) return entries[0] - else: - for idx, entry in enumerate(entries): - entry['id'] = '%s_part%d' % (video_id, (idx + 1)) - global_info = { - '_type': 'multi_video', - 'id': compat_str(video_id), - 'bv_id': bv_id, - 'title': title, - 'description': description, - 'entries': entries, - } + for idx, entry in enumerate(entries): + entry['id'] = '%s_part%d' % (video_id, (idx + 1)) - global_info.update(info) - global_info.update(top_level_info) - - return global_info + return { + '_type': 'multi_video', + 'id': str(video_id), + 'bv_id': bv_id, + 'title': title, + 'description': description, + 'entries': entries, + **info, **top_level_info + } def _extract_anthology_entries(self, bv_id, video_id, webpage): title = self._html_search_regex( (r']+\btitle=(["\'])(?P(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + r'(?s)<h1[^>]*>(?P<title>.+?)</h1>', + r'<title>(?P<title>.+?)'), webpage, 'title', group='title') json_data = self._download_json( - "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" % bv_id, + f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', video_id, note='Extracting 
videos in anthology') - if len(json_data['data']) > 1: + if json_data['data']: return self.playlist_from_matches( json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(), getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])) @@ -373,65 +369,33 @@ def _get_video_id_set(self, id, is_bv): if response['code'] == -400: raise ExtractorError('Video ID does not exist', expected=True, video_id=id) elif response['code'] != 0: - raise ExtractorError('Unknown error occurred during API check (code %s)' % response['code'], expected=True, video_id=id) - return (response['data']['aid'], response['data']['bvid']) + raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})', + expected=True, video_id=id) + return response['data']['aid'], response['data']['bvid'] - # recursive solution to getting every page of comments for the video - # we can stop when we reach a page without any comments - def _get_all_comment_pages(self, video_id, commentPageNumber=0): - comment_url = "https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=%s&type=1&oid=%s&sort=2&_=1567227301685" % (commentPageNumber, video_id) - json_str = self._download_webpage( - comment_url, video_id, - note='Extracting comments from page %s' % (commentPageNumber)) - replies = json.loads(json_str)['data']['replies'] - if replies is None: - return [] - return self._get_all_children(replies) + self._get_all_comment_pages(video_id, commentPageNumber + 1) + def _get_comments(self, video_id, commentPageNumber=0): + for idx in itertools.count(1): + replies = traverse_obj( + self._download_json( + f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', + video_id, note=f'Extracting comments from page {idx}', fatal=False), + ('data', 'replies')) + if not replies: + return + for children in map(self._get_all_children, replies): + yield from children - # extracts all comments in the tree - def _get_all_children(self, replies): 
- if replies is None: - return [] - - ret = [] - for reply in replies: - author = reply['member']['uname'] - author_id = reply['member']['mid'] - id = reply['rpid'] - text = reply['content']['message'] - timestamp = reply['ctime'] - parent = reply['parent'] if reply['parent'] != 0 else 'root' - - comment = { - "author": author, - "author_id": author_id, - "id": id, - "text": text, - "timestamp": timestamp, - "parent": parent, - } - ret.append(comment) - - # from the JSON, the comment structure seems arbitrarily deep, but I could be wrong. - # Regardless, this should work. - ret += self._get_all_children(reply['replies']) - - return ret - - def _get_raw_danmaku(self, video_id, cid): - # This will be useful if I decide to scrape all pages instead of doing them individually - # cid_url = "https://www.bilibili.com/widget/getPageList?aid=%s" % (video_id) - # cid_str = self._download_webpage(cid_url, video_id, note=False) - # cid = json.loads(cid_str)[0]['cid'] - - danmaku_url = "https://comment.bilibili.com/%s.xml" % (cid) - danmaku = self._download_webpage(danmaku_url, video_id, note='Downloading danmaku comments') - return danmaku - - def _get_tags(self, video_id): - tags_url = "https://api.bilibili.com/x/tag/archive/tags?aid=%s" % (video_id) - tags_json = self._download_json(tags_url, video_id, note='Downloading tags') - return tags_json['data'] + def _get_all_children(self, reply): + yield { + 'author': traverse_obj(reply, ('member', 'uname')), + 'author_id': traverse_obj(reply, ('member', 'mid')), + 'id': reply.get('rpid'), + 'text': traverse_obj(reply, ('content', 'message')), + 'timestamp': reply.get('ctime'), + 'parent': reply.get('parent') or 'root', + } + for children in map(self._get_all_children, reply.get('replies') or []): + yield from children class BiliBiliBangumiIE(InfoExtractor): @@ -514,11 +478,8 @@ def _entries(self, list_id): count, max_count = 0, None for page_num in itertools.count(1): - data = self._parse_json( - self._download_webpage( - 
self._API_URL % (list_id, page_num), list_id, - note='Downloading page %d' % page_num), - list_id)['data'] + data = self._download_json( + self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data'] max_count = max_count or try_get(data, lambda x: x['page']['count']) @@ -581,11 +542,11 @@ def _entries(self, category, subcategory, query): } if category not in rid_map: - raise ExtractorError('The supplied category, %s, is not supported. List of supported categories: %s' % (category, list(rid_map.keys()))) - + raise ExtractorError( + f'The category {category} isn\'t supported. Supported categories: {list(rid_map.keys())}') if subcategory not in rid_map[category]: - raise ExtractorError('The subcategory, %s, isn\'t supported for this category. Supported subcategories: %s' % (subcategory, list(rid_map[category].keys()))) - + raise ExtractorError( + f'The subcategory {subcategory} isn\'t supported for this category. Supported subcategories: {list(rid_map[category].keys())}') rid_value = rid_map[category][subcategory] api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value @@ -609,44 +570,29 @@ def _real_extract(self, url): class BiliBiliSearchIE(SearchInfoExtractor): - IE_DESC = 'Bilibili video search, "bilisearch" keyword' + IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' - MAX_NUMBER_OF_RESULTS = 1000 - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - entries = [] - pageNumber = 0 - while True: - pageNumber += 1 - # FIXME - api_url = "https://api.bilibili.com/x/web-interface/search/type?context=&page=%s&order=pubdate&keyword=%s&duration=0&tids_2=&__refresh__=true&search_type=video&tids=0&highlight=1" % (pageNumber, query) - json_str = self._download_webpage( - api_url, "None", query={"Search_key": query}, - note='Extracting results from page %s' % pageNumber) - data = json.loads(json_str)['data'] - - # FIXME: 
this is hideous - if "result" not in data: - return { - '_type': 'playlist', - 'id': query, - 'entries': entries[:n] - } - - videos = data['result'] + def _search_results(self, query): + for page_num in itertools.count(1): + videos = self._download_json( + 'https://api.bilibili.com/x/web-interface/search/type', query, + note=f'Extracting results from page {page_num}', query={ + 'Search_key': query, + 'keyword': query, + 'page': page_num, + 'context': '', + 'order': 'pubdate', + 'duration': 0, + 'tids_2': '', + '__refresh__': 'true', + 'search_type': 'video', + 'tids': 0, + 'highlight': 1, + })['data'].get('result') or [] for video in videos: - e = self.url_result(video['arcurl'], 'BiliBili', compat_str(video['aid'])) - entries.append(e) - - if(len(entries) >= n or len(videos) >= BiliBiliSearchIE.MAX_NUMBER_OF_RESULTS): - return { - '_type': 'playlist', - 'id': query, - 'entries': entries[:n] - } + yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid'])) class BilibiliAudioBaseIE(InfoExtractor): @@ -774,3 +720,171 @@ def _real_extract(self, url): return self.url_result( 'http://www.bilibili.tv/video/av%s/' % video_id, ie=BiliBiliIE.ie_key(), video_id=video_id) + + +class BiliIntlBaseIE(InfoExtractor): + _API_URL = 'https://api.bilibili.tv/intl/gateway' + + def _call_api(self, endpoint, *args, **kwargs): + return self._download_json(self._API_URL + endpoint, *args, **kwargs)['data'] + + def json2srt(self, json): + data = '\n\n'.join( + f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' + for i, line in enumerate(json['body'])) + return data + + def _get_subtitles(self, ep_id): + sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id) + subtitles = {} + for sub in sub_json.get('subtitles') or []: + sub_url = sub.get('url') + if not sub_url: + continue + sub_data = self._download_json( + sub_url, ep_id, errnote='Unable to download subtitles', fatal=False, + 
note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') + if not sub_data: + continue + subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ + 'ext': 'srt', + 'data': self.json2srt(sub_data) + }) + return subtitles + + def _get_formats(self, ep_id): + video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id, + note='Downloading video formats', errnote='Unable to download video formats') + if video_json.get('code'): + if video_json['code'] in (10004004, 10004005, 10023006): + self.raise_login_required(method='cookies') + elif video_json['code'] == 10004001: + self.raise_geo_restricted() + elif video_json.get('message') and str(video_json['code']) != video_json['message']: + raise ExtractorError( + f'Unable to download video formats: {self.IE_NAME} said: {video_json["message"]}', expected=True) + else: + raise ExtractorError('Unable to download video formats') + video_json = video_json['playurl'] + formats = [] + for vid in video_json.get('video') or []: + video_res = vid.get('video_resource') or {} + video_info = vid.get('stream_info') or {} + if not video_res.get('url'): + continue + formats.append({ + 'url': video_res['url'], + 'ext': 'mp4', + 'format_note': video_info.get('desc_words'), + 'width': video_res.get('width'), + 'height': video_res.get('height'), + 'vbr': video_res.get('bandwidth'), + 'acodec': 'none', + 'vcodec': video_res.get('codecs'), + 'filesize': video_res.get('size'), + }) + for aud in video_json.get('audio_resource') or []: + if not aud.get('url'): + continue + formats.append({ + 'url': aud['url'], + 'ext': 'mp4', + 'abr': aud.get('bandwidth'), + 'acodec': aud.get('codecs'), + 'vcodec': 'none', + 'filesize': aud.get('size'), + }) + + self._sort_formats(formats) + return formats + + def _extract_ep_info(self, episode_data, ep_id): + return { + 'id': ep_id, + 'title': episode_data.get('title_display') or episode_data['title'], + 'thumbnail': episode_data.get('cover'), + 'episode_number': 
int_or_none(self._search_regex( + r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)), + 'formats': self._get_formats(ep_id), + 'subtitles': self._get_subtitles(ep_id), + 'extractor_key': BiliIntlIE.ie_key(), + } + + +class BiliIntlIE(BiliIntlBaseIE): + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P\d+)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.tv/en/play/34613/341736', + 'info_dict': { + 'id': '341736', + 'ext': 'mp4', + 'title': 'E2 - The First Night', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 2, + } + }, { + 'url': 'https://www.bilibili.tv/en/play/1033760/11005006', + 'info_dict': { + 'id': '11005006', + 'ext': 'mp4', + 'title': 'E3 - Who?', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 3, + } + }, { + 'url': 'https://www.biliintl.com/en/play/34613/341736', + 'only_matching': True, + }] + + def _real_extract(self, url): + season_id, video_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, video_id) + # Bstation layout + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), video_id, fatal=False) or {} + episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) + + if not episode_data: + # Non-Bstation layout, read through episode list + season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) + episode_data = next( + episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict) + if str(episode.get('episode_id')) == video_id) + return self._extract_ep_info(episode_data, video_id) + + +class BiliIntlSeriesIE(BiliIntlBaseIE): + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P\d+)$' + _TESTS = [{ + 'url': 
'https://www.bilibili.tv/en/play/34613', + 'playlist_mincount': 15, + 'info_dict': { + 'id': '34613', + 'title': 'Fly Me to the Moon', + 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627', + 'categories': ['Romance', 'Comedy', 'Slice of life'], + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.biliintl.com/en/play/34613', + 'only_matching': True, + }] + + def _entries(self, series_id): + series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): + episode_id = str(episode.get('episode_id')) + yield self._extract_ep_info(episode, episode_id) + + def _real_extract(self, url): + series_id = self._match_id(url) + series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {} + return self.playlist_result( + self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'), + categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none), + thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view'))) diff --git a/yt_dlp/extractor/bitwave.py b/yt_dlp/extractor/bitwave.py index eb16c469df..e6e093f597 100644 --- a/yt_dlp/extractor/bitwave.py +++ b/yt_dlp/extractor/bitwave.py @@ -51,7 +51,7 @@ def _real_extract(self, url): return { 'id': username, - 'title': self._live_title(channel['data']['title']), + 'title': channel['data']['title'], 'uploader': username, 'uploader_id': username, 'formats': formats, diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py new file mode 100644 index 0000000000..dba131cb05 --- /dev/null +++ b/yt_dlp/extractor/blogger.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import 
unicode_literals + +import re + +from ..utils import ( + mimetype2ext, + parse_duration, + parse_qs, + str_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BloggerIE(InfoExtractor): + IE_NAME = 'blogger.com' + _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P.+)' + _VALID_EMBED = r''']+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _TESTS = [{ + 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'title': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*', + 'duration': 76.068, + } + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall(BloggerIE._VALID_EMBED, webpage) + + def _real_extract(self, url): + token_id = self._match_id(url) + webpage = self._download_webpage(url, token_id) + data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data') + data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id) + streams = data['streams'] + formats = [{ + 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))), + 'url': stream['play_url'], + 'format_id': str_or_none(stream.get('format_id')), + } for stream in streams] + + return { + 'id': data.get('iframe_id', token_id), + 'title': data.get('iframe_id', token_id), + 'formats': formats, + 'thumbnail': data.get('thumbnail'), + 'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))), + } diff --git a/yt_dlp/extractor/bongacams.py b/yt_dlp/extractor/bongacams.py index 9e7551136e..4e346e7b6e 100644 --- a/yt_dlp/extractor/bongacams.py +++ b/yt_dlp/extractor/bongacams.py @@ -49,7 +49,7 @@ def _real_extract(self, url): return { 'id': channel_id, - 'title': 
self._live_title(uploader or uploader_id), + 'title': uploader or uploader_id, 'uploader': uploader, 'uploader_id': uploader_id, 'like_count': like_count, diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py new file mode 100644 index 0000000000..f50f719dc2 --- /dev/null +++ b/yt_dlp/extractor/breitbart.py @@ -0,0 +1,39 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BreitBartIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', + 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', + 'info_dict': { + 'id': '5cOz1yup', + 'ext': 'mp4', + 'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery', + 'description': 'md5:bac35eb0256d1cb17f517f54c79404d5', + 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg', + 'age_limit': 0, + } + }, { + 'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'(?s)(.*?)', webpage, 'video title'), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index bb68dc481f..dcd332b435 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -16,6 +16,7 @@ ) from ..utils import ( clean_html, + dict_get, extract_attributes, ExtractorError, find_xpath_attr, @@ -471,29 +472,23 @@ def 
_extract_urls(ie, webpage): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() - num_drm_sources = 0 - formats = [] + formats, subtitles = [], {} sources = json_data.get('sources') or [] for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - skip_unplayable = not self.get_param('allow_unplayable_formats') - # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if skip_unplayable and (container == 'WVM' or source.get('key_systems')): - num_drm_sources += 1 - continue - elif ext == 'ism' and skip_unplayable: - continue - elif ext == 'm3u8' or container == 'M2TS': + if ext == 'm3u8' or container == 'M2TS': if not src: continue - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) elif ext == 'mpd': if not src: continue - formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) else: streaming_src = source.get('streaming_src') stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -539,7 +534,13 @@ def build_format_id(kind): 'play_path': stream_name, 'format_id': build_format_id('rtmp'), }) - formats.append(f) + fmts = [f] + + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if container == 'WVM' or source.get('key_systems') or ext == 'ism': + for f in fmts: + f['has_drm'] = True + formats.extend(fmts) if not formats: errors = json_data.get('errors') @@ -547,16 +548,12 @@ def build_format_id(kind): error = errors[0] self.raise_no_formats( 
error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) - elif (not self.get_param('allow_unplayable_formats') - and sources and num_drm_sources == len(sources)): - self.report_drm(video_id) self._sort_formats(formats) for f in formats: f.setdefault('http_headers', {}).update(headers) - subtitles = {} for text_track in json_data.get('text_tracks', []): if text_track.get('kind') != 'captions': continue @@ -574,11 +571,19 @@ def build_format_id(kind): if duration is not None and duration <= 0: is_live = True + common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)] + thumb_base_url = dict_get(json_data, ('poster', 'thumbnail')) + thumbnails = [{ + 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url), + 'width': w, + 'height': h, + } for w, h in common_res] if thumb_base_url else None + return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': clean_html(json_data.get('description')), - 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'thumbnails': thumbnails, 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': json_data.get('account_id'), diff --git a/yt_dlp/extractor/cableav.py b/yt_dlp/extractor/cableav.py new file mode 100644 index 0000000000..77efdf45af --- /dev/null +++ b/yt_dlp/extractor/cableav.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from .common import InfoExtractor + + +class CableAVIE(InfoExtractor): + _VALID_URL = r'https://cableav\.tv/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://cableav.tv/lS4iR9lWjN8/', + 'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18', + 'info_dict': { + 'id': 'lS4iR9lWjN8', + 'ext': 'mp4', + 'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV', + 'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + 
webpage = self._download_webpage(url, video_id) + + video_url = self._og_search_video_url(webpage, secure=False) + + formats = self._extract_m3u8_formats(video_url, video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + } diff --git a/yt_dlp/extractor/cam4.py b/yt_dlp/extractor/cam4.py new file mode 100644 index 0000000000..f47de9176d --- /dev/null +++ b/yt_dlp/extractor/cam4.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CAM4IE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?cam4\.com/(?P[a-z0-9_]+)' + _TEST = { + 'url': 'https://www.cam4.com/foxynesss', + 'info_dict': { + 'id': 'foxynesss', + 'ext': 'mp4', + 'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + m3u8_playlist = self._download_json('https://www.cam4.com/rest/v1.0/profile/{}/streamInfo'.format(channel_id), channel_id).get('cdnURL') + + formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': channel_id, + 'is_live': True, + 'age_limit': 18, + 'formats': formats, + } diff --git a/yt_dlp/extractor/cammodels.py b/yt_dlp/extractor/cammodels.py index eb2a8b4c6a..3dc19377b3 100644 --- a/yt_dlp/extractor/cammodels.py +++ b/yt_dlp/extractor/cammodels.py @@ -91,7 +91,7 @@ def _real_extract(self, url): return { 'id': user_id, - 'title': self._live_title(user_id), + 'title': user_id, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/yt_dlp/extractor/camtube.py b/yt_dlp/extractor/camtube.py deleted file mode 100644 index b3be3bdcf7..0000000000 --- a/yt_dlp/extractor/camtube.py +++ /dev/null @@ 
-1,71 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class CamTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', - 'info_dict': { - 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', - 'display_id': 'minafay-030618-1136-chaturbate-female', - 'ext': 'mp4', - 'title': 'minafay-030618-1136-chaturbate-female', - 'duration': 1274, - 'timestamp': 1528018608, - 'upload_date': '20180603', - 'age_limit': 18 - }, - 'params': { - 'skip_download': True, - }, - }] - - _API_BASE = 'https://api.camtube.co' - - def _real_extract(self, url): - display_id = self._match_id(url) - - token = self._download_json( - '%s/rpc/session/new' % self._API_BASE, display_id, - 'Downloading session token')['token'] - - self._set_cookie('api.camtube.co', 'session', token) - - video = self._download_json( - '%s/recordings/%s' % (self._API_BASE, display_id), display_id, - headers={'Referer': url}) - - video_id = video['uuid'] - timestamp = unified_timestamp(video.get('createdAt')) - duration = int_or_none(video.get('duration')) - view_count = int_or_none(video.get('viewCount')) - like_count = int_or_none(video.get('likeCount')) - creator = video.get('stageName') - - formats = [{ - 'url': '%s/recordings/%s/manifest.m3u8' - % (self._API_BASE, video_id), - 'format_id': 'hls', - 'ext': 'mp4', - 'protocol': 'm3u8_native', - }] - - return { - 'id': video_id, - 'display_id': display_id, - 'title': display_id, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'creator': creator, - 'formats': formats, - 'age_limit': 18 - } diff --git a/yt_dlp/extractor/canalalpha.py b/yt_dlp/extractor/canalalpha.py new file mode 100644 index 0000000000..51d30a3213 --- /dev/null +++ 
b/yt_dlp/extractor/canalalpha.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + try_get, + unified_strdate, +) + + +class CanalAlphaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P\d+)/?.*' + + _TESTS = [{ + 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021', + 'info_dict': { + 'id': '24520', + 'ext': 'mp4', + 'title': 'Jeudi 28 octobre 2021', + 'description': 'md5:d30c6c3e53f8ad40d405379601973b30', + 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg', + 'upload_date': '20211028', + 'duration': 1125, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', + 'info_dict': { + 'id': '24512', + 'ext': 'mp4', + 'title': 'La Poste fait de Neuchâtel un pôle cryptographique', + 'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg', + 'upload_date': '20211028', + 'duration': 138, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', + 'info_dict': { + 'id': '24484', + 'ext': 'mp4', + 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', + 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', + 'upload_date': '20211026', + 'duration': 360, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', + 'info_dict': { + 'id': '23516', + 'ext': 'mp4', + 'title': 'Redonner de l\'éclat grâce au polissage', + 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1', + 
'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png', + 'upload_date': '20210726', + 'duration': 360, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex( + r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', + webpage, 'data_json'), id)['1']['data']['data'] + manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} + subtitles = {} + formats = [{ + 'url': video['$url'], + 'ext': 'mp4', + 'width': try_get(video, lambda x: x['res']['width'], expected_type=int), + 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), + } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] + if manifests.get('hls'): + m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + if manifests.get('dash'): + dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'], id) + formats.extend(dash_frmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title').strip(), + 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), + 'thumbnail': data_json.get('poster'), + 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))), + 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index b417f8577a..e97c919299 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import json from .common 
import InfoExtractor @@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor): _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', + 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -59,16 +60,21 @@ def _real_extract(self, url): # New API endpoint if not data: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json'}) - token = self._download_json( + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'null', }, expected_status=400) if not data.get('title'): code = data.get('code') @@ -264,7 +270,7 @@ class VrtNUIE(GigyaBaseIE): 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' def _real_initialize(self): @@ -275,16 +281,16 @@ def _login(self): if username is None: return - auth_info = 
self._download_json( - 'https://accounts.vrt.be/accounts.login', None, - note='Login data', errnote='Could not get Login data', - headers={}, data=urlencode_postdata({ - 'loginID': username, - 'password': password, - 'sessionExpiration': '-2', - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - })) + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) + + if auth_info.get('errorDetails'): + raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) # Sometimes authentication fails for no good reason, retry login_attempt = 1 @@ -298,14 +304,15 @@ def _login(self): 'UID': auth_info['UID'], 'UIDSignature': auth_info['UIDSignature'], 'signatureTimestamp': auth_info['signatureTimestamp'], - 'client_id': 'vrtnu-site', '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, } self._request_webpage( 'https://login.vrt.be/perform_login', - None, note='Requesting a token', errnote='Could not get a token', - headers={}, data=urlencode_postdata(post_data)) + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index fd5ec6033b..ac1272f7b5 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -1,30 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib -import json import re -from xml.sax.saxutils import escape +import json +import base64 +import time from .common import InfoExtractor from ..compat import ( compat_str, - compat_HTTPError, ) from ..utils import ( - js_to_json, - smuggle_url, - try_get, - xpath_text, - xpath_element, - xpath_with_ns, - find_xpath_attr, - orderedSet, - parse_duration, - parse_iso8601, - 
parse_age_limit, - strip_or_none, int_or_none, + join_nonempty, + js_to_json, + orderedSet, + smuggle_url, + strip_or_none, + try_get, ExtractorError, ) @@ -59,6 +52,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', 'timestamp': 1382717907, }, + 'skip': 'No longer available', }, { # with clipId, feed only available via tpfeed.cbc.ca 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', @@ -209,289 +203,321 @@ def _real_extract(self, url): } -class CBCWatchBaseIE(InfoExtractor): - _device_id = None - _device_token = None - _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' - _NS_MAP = { - 'media': 'http://search.yahoo.com/mrss/', - 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', - } - _GEO_COUNTRIES = ['CA'] - _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login' - _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token' - _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' - _NETRC_MACHINE = 'cbcwatch' +class CBCGemIE(InfoExtractor): + IE_NAME = 'gem.cbc.ca' + _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' + _TESTS = [{ + # This is a normal, public, TV show video + 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', + 'md5': '93dbb31c74a8e45b378cf13bd3f6f11e', + 'info_dict': { + 'id': 'schitts-creek/s06e01', + 'ext': 'mp4', + 'title': 'Smoke Signals', + 'description': 'md5:929868d20021c924020641769eb3e7f1', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_06e01_thumbnail_v01.jpg?im=Resize=(Size)', + 'duration': 1314, + 'categories': ['comedy'], + 'series': 'Schitt\'s Creek', + 'season': 'Season 6', + 'season_number': 6, + 'episode': 'Smoke Signals', + 'episode_number': 1, + 'episode_id': 'schitts-creek/s06e01', + }, + 'params': {'format': 'bv'}, + 'skip': 'Geo-restricted to Canada', + }, { + # This video requires an account in the browser, but works fine in yt-dlp + 'url': 
'https://gem.cbc.ca/media/schitts-creek/s01e01', + 'md5': '297a9600f554f2258aed01514226a697', + 'info_dict': { + 'id': 'schitts-creek/s01e01', + 'ext': 'mp4', + 'title': 'The Cup Runneth Over', + 'description': 'md5:9bca14ea49ab808097530eb05a29e797', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/episode/perso/cbc_schitts_creek_season_01e01_thumbnail_v01.jpg?im=Resize=(Size)', + 'series': 'Schitt\'s Creek', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'The Cup Runneth Over', + 'episode_id': 'schitts-creek/s01e01', + 'duration': 1309, + 'categories': ['comedy'], + }, + 'params': {'format': 'bv'}, + 'skip': 'Geo-restricted to Canada', + }] - def _signature(self, email, password): + _GEO_COUNTRIES = ['CA'] + _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcgem' + _claims_token = None + + def _new_claims_token(self, email, password): data = json.dumps({ 'email': email, 'password': password, }).encode() headers = {'content-type': 'application/json'} - query = {'apikey': self._API_KEY} - resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query) + query = {'apikey': self._TOKEN_API_KEY} + resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', + None, data=data, headers=headers, query=query) access_token = resp['access_token'] - # token query = { 'access_token': access_token, - 'apikey': self._API_KEY, + 'apikey': self._TOKEN_API_KEY, 'jwtapp': 'jwt', } - resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query) - return resp['signature'] + resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', + None, headers=headers, query=query) + sig = resp['signature'] - def _call_api(self, path, video_id): - url = path if path.startswith('http') else self._API_BASE_URL + path - for _ in range(2): - try: - result = self._download_xml(url, video_id, headers={ - 'X-Clearleap-DeviceId': self._device_id, 
- 'X-Clearleap-DeviceToken': self._device_token, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - # Device token has expired, re-acquiring device token - self._register_device() - continue - raise - error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') - if error_message: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) - return result + data = json.dumps({'jwt': sig}).encode() + headers = {'content-type': 'application/json', 'ott-device-type': 'web'} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', + None, data=data, headers=headers) + cbc_access_token = resp['accessToken'] + + headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', + None, headers=headers) + return resp['claimsToken'] + + def _get_claims_token_expiry(self): + # Token is a JWT + # JWT is decoded here and 'exp' field is extracted + # It is a Unix timestamp for when the token expires + b64_data = self._claims_token.split('.')[1] + data = base64.urlsafe_b64decode(b64_data + "==") + return json.loads(data)['exp'] + + def claims_token_expired(self): + exp = self._get_claims_token_expiry() + if exp - time.time() < 10: + # It will expire in less than 10 seconds, or has already expired + return True + return False + + def claims_token_valid(self): + return self._claims_token is not None and not self.claims_token_expired() + + def _get_claims_token(self, email, password): + if not self.claims_token_valid(): + self._claims_token = self._new_claims_token(email, password) + self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + return self._claims_token def _real_initialize(self): - if self._valid_device_token(): + if self.claims_token_valid(): return - device = self._downloader.cache.load( 
- 'cbcwatch', self._cache_device_key()) or {} - self._device_id, self._device_token = device.get('id'), device.get('token') - if self._valid_device_token(): + self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + + def _find_secret_formats(self, formats, video_id): + """ Find a valid video url and convert it to the secret variant """ + base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) + if not base_format: return - self._register_device() - def _valid_device_token(self): - return self._device_id and self._device_token + base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) + url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) - def _cache_device_key(self): - email, _ = self._get_login_info() - return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device' + secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) + if not secret_xml: + return + + for child in secret_xml: + if child.attrib.get('Type') != 'video': + continue + for video_quality in child: + bitrate = int_or_none(video_quality.attrib.get('Bitrate')) + if not bitrate or 'Index' not in video_quality.attrib: + continue + height = int_or_none(video_quality.attrib.get('MaxHeight')) + + yield { + **base_format, + 'format_id': join_nonempty('sec', height), + # Note: \g<1> is necessary instead of \1 since bitrate is a number + 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), + 'width': int_or_none(video_quality.attrib.get('MaxWidth')), + 'tbr': bitrate / 1000.0, + 'height': height, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) - def _register_device(self): - result = self._download_xml( - self._API_BASE_URL + 'device/register', - None, 'Acquiring device token', - data=b'web') 
- self._device_id = xpath_text(result, 'deviceId', fatal=True) email, password = self._get_login_info() if email and password: - signature = self._signature(email, password) - data = '{0}{1}web'.format( - escape(signature), escape(self._device_id)).encode() - url = self._API_BASE_URL + 'device/login' - result = self._download_xml( - url, None, data=data, - headers={'content-type': 'application/xml'}) - self._device_token = xpath_text(result, 'token', fatal=True) + claims_token = self._get_claims_token(email, password) + headers = {'x-claims-token': claims_token} else: - self._device_token = xpath_text(result, 'deviceToken', fatal=True) - self._downloader.cache.store( - 'cbcwatch', self._cache_device_key(), { - 'id': self._device_id, - 'token': self._device_token, - }) + headers = {} + m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + m3u8_url = m3u8_info.get('url') - def _parse_rss_feed(self, rss): - channel = xpath_element(rss, 'channel', fatal=True) + if m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + elif m3u8_info.get('errorCode') == 35: + self.raise_login_required(method='password') + elif m3u8_info.get('errorCode') != 0: + raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') - def _add_ns(path): - return xpath_with_ns(path, self._NS_MAP) + formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + self._remove_duplicate_formats(formats) + formats.extend(self._find_secret_formats(formats, video_id)) - entries = [] - for item in channel.findall('item'): - guid = xpath_text(item, 'guid', fatal=True) - title = xpath_text(item, 'title', fatal=True) + for format in formats: + if format.get('vcodec') == 'none': + if format.get('ext') is None: + format['ext'] = 'm4a' + if format.get('acodec') is None: + format['acodec'] = 'mp4a.40.2' - media_group = xpath_element(item, _add_ns('media:group'), fatal=True) - content = 
xpath_element(media_group, _add_ns('media:content'), fatal=True) - content_url = content.attrib['url'] + # Put described audio at the beginning of the list, so that it + # isn't chosen by default, as most people won't want it. + if 'descriptive' in format['format_id'].lower(): + format['preference'] = -2 - thumbnails = [] - for thumbnail in media_group.findall(_add_ns('media:thumbnail')): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail.get('profile'), - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - timestamp = None - release_date = find_xpath_attr( - item, _add_ns('media:credit'), 'role', 'releaseDate') - if release_date is not None: - timestamp = parse_iso8601(release_date.text) - - entries.append({ - '_type': 'url_transparent', - 'url': content_url, - 'id': guid, - 'title': title, - 'description': xpath_text(item, 'description'), - 'timestamp': timestamp, - 'duration': int_or_none(content.get('duration')), - 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), - 'episode': xpath_text(item, _add_ns('clearleap:episode')), - 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), - 'series': xpath_text(item, _add_ns('clearleap:series')), - 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), - 'thumbnails': thumbnails, - 'ie_key': 'CBCWatchVideo', - }) - - return self.playlist_result( - entries, xpath_text(channel, 'guid'), - xpath_text(channel, 'title'), - xpath_text(channel, 'description')) - - -class CBCWatchVideoIE(CBCWatchBaseIE): - IE_NAME = 'cbc.ca:watch:video' - _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TEST = { - # geo-restricted to Canada, bypassable - 'url': 
'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', - 'only_matching': True, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - result = self._call_api(url, video_id) - - m3u8_url = xpath_text(result, 'url', fatal=True) - formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) - if len(formats) < 2: - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - for f in formats: - format_id = f.get('format_id') - if format_id.startswith('AAC'): - f['acodec'] = 'aac' - elif format_id.startswith('AC3'): - f['acodec'] = 'ac-3' - self._sort_formats(formats) - - info = { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - rss = xpath_element(result, 'rss') - if rss: - info.update(self._parse_rss_feed(rss)['entries'][0]) - del info['url'] - del info['_type'] - del info['ie_key'] - return info - - -class CBCWatchIE(CBCWatchBaseIE): - IE_NAME = 'cbc.ca:watch' - _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P[0-9a-f-]+)' - _TESTS = [{ - # geo-restricted to Canada, bypassable - 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', - 'info_dict': { - 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', - 'ext': 'mp4', - 'title': 'Customer (Dis)Service', - 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', - 'upload_date': '20160219', - 'timestamp': 1455840000, - }, - 'params': { - # m3u8 download - 'skip_download': True, - 'format': 'bestvideo', - }, - }, { - # geo-restricted to Canada, bypassable - 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', - 'info_dict': { - 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', - 'title': 'Arthur', - 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', - }, - 
'playlist_mincount': 30, - }, { - 'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - rss = self._call_api('web/browse/' + video_id, video_id) - return self._parse_rss_feed(rss) - - -class CBCOlympicsIE(InfoExtractor): - IE_NAME = 'cbc.ca:olympics' - _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P[^/?#]+)' - _TESTS = [{ - 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._hidden_inputs(webpage)['videoId'] - video_doc = self._download_xml( - 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) - title = xpath_text(video_doc, 'title', fatal=True) - is_live = xpath_text(video_doc, 'kind') == 'Live' - if is_live: - title = self._live_title(title) - - formats = [] - for video_source in video_doc.findall('videoSources/videoSource'): - uri = xpath_text(video_source, 'uri') - if not uri: - continue - tokenize = self._download_json( - 'https://olympics.cbc.ca/api/api-akamai/tokenize', - video_id, data=json.dumps({ - 'VideoSource': uri, - }).encode(), headers={ - 'Content-Type': 'application/json', - 'Referer': url, - # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js - 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie - }, fatal=False) - if not tokenize: - continue - content_url = tokenize['ContentUrl'] - video_source_format = video_source.get('format') - if video_source_format == 'IIS': - formats.extend(self._extract_ism_formats( - content_url, video_id, ism_id=video_source_format, fatal=False)) - else: - formats.extend(self._extract_m3u8_formats( - content_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id=video_source_format, fatal=False)) 
self._sort_formats(formats) return { 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': xpath_text(video_doc, 'description'), - 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), - 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'title': video_info['title'], + 'description': video_info.get('description'), + 'thumbnail': video_info.get('image'), + 'series': video_info.get('series'), + 'season_number': video_info.get('season'), + 'season': f'Season {video_info.get("season")}', + 'episode_number': video_info.get('episode'), + 'episode': video_info.get('title'), + 'episode_id': video_id, + 'duration': video_info.get('duration'), + 'categories': [video_info.get('category')], 'formats': formats, - 'is_live': is_live, + 'release_timestamp': video_info.get('airDate'), + 'timestamp': video_info.get('availableDate'), + } + + +class CBCGemPlaylistIE(InfoExtractor): + IE_NAME = 'gem.cbc.ca:playlist' + _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P(?P[0-9a-z-]+)/s(?P[0-9]+))/?(?:[?#]|$)' + _TESTS = [{ + # TV show playlist, all public videos + 'url': 'https://gem.cbc.ca/media/schitts-creek/s06', + 'playlist_count': 16, + 'info_dict': { + 'id': 'schitts-creek/s06', + 'title': 'Season 6', + 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', + }, + }] + _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' + + def _real_extract(self, url): + match = self._match_valid_url(url) + season_id = match.group('id') + show = match.group('show') + show_info = self._download_json(self._API_BASE + show, season_id) + season = int(match.group('season')) + + season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) + + if season_info is None: + raise ExtractorError(f'Couldn\'t find season {season} of {show}') + + episodes = [] + for episode in season_info['assets']: + episodes.append({ + '_type': 'url_transparent', + 'ie_key': 'CBCGem', + 'url': 'https://gem.cbc.ca/media/' + episode['id'], + 'id': 
episode['id'], + 'title': episode.get('title'), + 'description': episode.get('description'), + 'thumbnail': episode.get('image'), + 'series': episode.get('series'), + 'season_number': episode.get('season'), + 'season': season_info['title'], + 'season_id': season_info.get('id'), + 'episode_number': episode.get('episode'), + 'episode': episode.get('title'), + 'episode_id': episode['id'], + 'duration': episode.get('duration'), + 'categories': [episode.get('category')], + }) + + thumbnail = None + tn_uri = season_info.get('image') + # the-national was observed to use a "data:image/png;base64" + # URI for their 'image' value. The image was 1x1, and is + # probably just a placeholder, so it is ignored. + if tn_uri is not None and not tn_uri.startswith('data:'): + thumbnail = tn_uri + + return { + '_type': 'playlist', + 'entries': episodes, + 'id': season_id, + 'title': season_info['title'], + 'description': season_info.get('description'), + 'thumbnail': thumbnail, + 'series': show_info.get('title'), + 'season_number': season_info.get('season'), + 'season': season_info['title'], + } + + +class CBCGemLiveIE(InfoExtractor): + IE_NAME = 'gem.cbc.ca:live' + _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P\d+)' + _TEST = { + 'url': 'https://gem.cbc.ca/live/920604739687', + 'info_dict': { + 'title': 'Ottawa', + 'description': 'The live TV channel and local programming from Ottawa', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', + 'is_live': True, + 'id': 'AyqZwxRqh8EH', + 'ext': 'mp4', + 'timestamp': 1492106160, + 'upload_date': '20170413', + 'uploader': 'CBCC-NEW', + }, + 'skip': 'Live might have ended', + } + + # It's unclear where the chars at the end come from, but they appear to be + # constant. Might need updating in the future. + # There are two URLs, some livestreams are in one, and some + # in the other. The JSON schema is the same for both. 
+ _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + for api_url in self._API_URLS: + video_info = next(( + stream for stream in self._download_json(api_url, video_id)['entries'] + if stream.get('guid') == video_id), None) + if video_info: + break + else: + raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': video_info['content'][0]['url'], + 'id': video_id, + 'title': video_info.get('title'), + 'description': video_info.get('description'), + 'tags': try_get(video_info, lambda x: x['keywords'].split(', ')), + 'thumbnail': video_info.get('cbc$staticImage'), + 'is_live': True, } diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index ed5dc84a76..ae9ce58628 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -130,6 +130,7 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): title = xpath_text(video_data, 'videoTitle', 'title') or xpath_text(video_data, 'videotitle', 'title') asset_types = {} + has_drm = False for item in items_data.findall('.//item'): asset_type = xpath_text(item, 'assetType') query = { @@ -144,6 +145,8 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): if asset_type in asset_types: continue elif any(excluded in asset_type for excluded in ('HLS_FPS', 'DASH_CENC', 'OnceURL')): + if 'DASH_CENC' in asset_type: + has_drm = True continue if asset_type.startswith('HLS') or 'StreamPack' in asset_type: query['formats'] = 'MPEG4,M3U' @@ -151,6 +154,9 @@ def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): query['formats'] = 'MPEG4,FLV' asset_types[asset_type] = query + if not asset_types and has_drm: + self.report_drm(content_id) + return self._extract_common_video_info(content_id, asset_types, 
mpx_acc, extra_info={ 'title': title, 'series': xpath_text(video_data, 'seriesTitle'), diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index e1b3919371..72c47050ff 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -3,6 +3,7 @@ import codecs import re +import json from .common import InfoExtractor from ..compat import ( @@ -19,6 +20,7 @@ parse_duration, random_birthday, urljoin, + try_get, ) @@ -38,6 +40,8 @@ class CDAIE(InfoExtractor): 'average_rating': float, 'duration': 39, 'age_limit': 0, + 'upload_date': '20160221', + 'timestamp': 1456078244, } }, { 'url': 'http://www.cda.pl/video/57413289', @@ -143,7 +147,7 @@ def decrypt_file(a): b = [] for c in a: f = compat_ord(c) - b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) + b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f <= 126 else compat_chr(f)) a = ''.join(b) a = a.replace('.cda.mp4', '') for p in ('.2cda.pl', '.3cda.pl'): @@ -173,18 +177,34 @@ def extract_format(page, version): video['file'] = video['file'].replace('adc.mp4', '.mp4') elif not video['file'].startswith('http'): video['file'] = decrypt_file(video['file']) - f = { + video_quality = video.get('quality') + qualities = video.get('qualities', {}) + video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality) + info_dict['formats'].append({ 'url': video['file'], - } - m = re.search( - r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', - page) - if m: - f.update({ - 'format_id': m.group('format_id'), - 'height': int(m.group('height')), - }) - info_dict['formats'].append(f) + 'format_id': video_quality, + 'height': int_or_none(video_quality[:-1]), + }) + for quality, cda_quality in qualities.items(): + if quality == video_quality: + continue + data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2, + 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]} + data = 
json.dumps(data).encode('utf-8') + video_url = self._download_json( + f'https://www.cda.pl/video/{video_id}', video_id, headers={ + 'Content-Type': 'application/json', + 'X-Requested-With': 'XMLHttpRequest' + }, data=data, note=f'Fetching {quality} url', + errnote=f'Failed to fetch {quality} url', fatal=False) + if try_get(video_url, lambda x: x['result']['status']) == 'ok': + video_url = try_get(video_url, lambda x: x['result']['resp']) + info_dict['formats'].append({ + 'url': video_url, + 'format_id': quality, + 'height': int_or_none(quality[:-1]) + }) + if not info_dict['duration']: info_dict['duration'] = parse_duration(video.get('duration')) diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 5e04d38a25..6c90b247ee 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -12,30 +12,15 @@ ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', @@ -66,12 +51,60 @@ class CeskaTelevizeIE(InfoExtractor): }, { 'url': 
'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) - + parsed_url = compat_urllib_parse_urlparse(url) webpage = self._download_webpage(url, playlist_id) + site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + if parsed_url.path.startswith('/porady/'): + next_data = self._search_nextjs_data(webpage, playlist_id) + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = 
self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id) + webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, + query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec}) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s

' % NOT_AVAILABLE_STRING in webpage: @@ -100,7 +133,7 @@ def _real_extract(self, url): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +141,7 @@ def _real_extract(self, url): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +163,6 @@ def _real_extract(self, url): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -182,8 +212,6 @@ def _real_extract(self, url): if playlist_len == 1: final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) @@ -237,54 +265,3 @@ def _fix_subtitle(subtitle): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 
'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', - r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/yt_dlp/extractor/cgtn.py b/yt_dlp/extractor/cgtn.py new file mode 100644 index 0000000000..89f173887e --- /dev/null +++ b/yt_dlp/extractor/cgtn.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + try_get, + unified_timestamp, +) + + +class CGTNIE(InfoExtractor): + _VALID_URL = r'https?://news\.cgtn\.com/news/[0-9]{4}-[0-9]{2}-[0-9]{2}/[a-zA-Z0-9-]+-(?P[a-zA-Z0-9-]+)/index\.html' + _TESTS = [ + { + 'url': 'https://news.cgtn.com/news/2021-03-09/Up-and-Out-of-Poverty-Ep-1-A-solemn-promise-YuOUaOzGQU/index.html', + 'info_dict': { + 'id': 'YuOUaOzGQU', + 'ext': 'mp4', + 'title': 'Up and Out of Poverty Ep. 
1: A solemn promise', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1615295940, + 'upload_date': '20210309', + }, + 'params': { + 'skip_download': True + } + }, { + 'url': 'https://news.cgtn.com/news/2021-06-06/China-Indonesia-vow-to-further-deepen-maritime-cooperation-10REvJCewCY/index.html', + 'info_dict': { + 'id': '10REvJCewCY', + 'ext': 'mp4', + 'title': 'China, Indonesia vow to further deepen maritime cooperation', + 'thumbnail': r're:^https?://.*\.png$', + 'description': 'China and Indonesia vowed to upgrade their cooperation into the maritime sector and also for political security, economy, and cultural and people-to-people exchanges.', + 'author': 'CGTN', + 'category': 'China', + 'timestamp': 1622950200, + 'upload_date': '20210606', + }, + 'params': { + 'skip_download': False + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + download_url = self._html_search_regex(r'data-video ="(?P.+m3u8)"', webpage, 'download_url') + datetime_str = self._html_search_regex(r'\s*(.+?)\s*', webpage, 'datetime_str', fatal=False) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': self._extract_m3u8_formats(download_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'), + 'category': self._html_search_regex(r'\s*(.+?)\s*', + webpage, 'category', fatal=False), + 'author': self._html_search_regex(r'
\s*(.+?)\s*
', + webpage, 'author', default=None, fatal=False), + 'timestamp': try_get(unified_timestamp(datetime_str), lambda x: x - 8 * 3600), + } diff --git a/yt_dlp/extractor/chaturbate.py b/yt_dlp/extractor/chaturbate.py index a459dcb8d5..8da51f9196 100644 --- a/yt_dlp/extractor/chaturbate.py +++ b/yt_dlp/extractor/chaturbate.py @@ -101,7 +101,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(video_id), + 'title': video_id, 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, diff --git a/yt_dlp/extractor/chingari.py b/yt_dlp/extractor/chingari.py new file mode 100644 index 0000000000..e6841fb8b2 --- /dev/null +++ b/yt_dlp/extractor/chingari.py @@ -0,0 +1,209 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + str_to_int, + url_or_none, +) + + +class ChingariBaseIE(InfoExtractor): + def _get_post(self, id, post_data): + media_data = post_data['mediaLocation'] + base_url = media_data['base'] + author_data = post_data.get('authorData', {}) + song_data = post_data.get('song', {}) # revisit this in future for differentiating b/w 'art' and 'author' + + formats = [{ + 'format_id': frmt, + 'width': str_to_int(frmt[1:]), + 'url': base_url + frmt_path, + } for frmt, frmt_path in media_data.get('transcoded', {}).items()] + + if media_data.get('path'): + formats.append({ + 'format_id': 'original', + 'format_note': 'Direct video.', + 'url': base_url + '/apipublic' + media_data['path'], + 'quality': 10, + }) + self._sort_formats(formats) + timestamp = str_to_int(post_data.get('created_at')) + if timestamp: + timestamp = int_or_none(timestamp, 1000) + + thumbnail, uploader_url = None, None + if media_data.get('thumbnail'): + thumbnail = base_url + 
media_data.get('thumbnail') + if author_data.get('username'): + uploader_url = 'https://chingari.io/' + author_data.get('username') + + return { + 'id': id, + 'title': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))), + 'description': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))), + 'duration': media_data.get('duration'), + 'thumbnail': url_or_none(thumbnail), + 'like_count': post_data.get('likeCount'), + 'view_count': post_data.get('viewsCount'), + 'comment_count': post_data.get('commentCount'), + 'repost_count': post_data.get('shareCount'), + 'timestamp': timestamp, + 'uploader_id': post_data.get('userId') or author_data.get('_id'), + 'uploader': author_data.get('name'), + 'uploader_url': url_or_none(uploader_url), + 'track': song_data.get('title'), + 'artist': song_data.get('author'), + 'formats': formats, + } + + +class ChingariIE(ChingariBaseIE): + _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P[^&/#?]+)' + _TESTS = [{ + 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb', + 'info_dict': { + 'id': '612f8f4ce1dc57090e8a7beb', + 'ext': 'mp4', + 'title': 'Happy birthday Srila Prabhupada', + 'description': 'md5:c7080ebfdfeb06016e638c286d6bc3fa', + 'duration': 0, + 'thumbnail': 'https://media.chingari.io/uploads/c41d30e2-06b6-4e3b-9b4b-edbb929cec06-1630506826911/thumbnail/198f993f-ce87-4623-82c6-cd071bd6d4f4-1630506828016.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1630506828, + 'upload_date': '20210901', + 'uploader_id': '5f0403982c8bd344f4813f8c', + 'uploader': 'ISKCON,Inc.', + 'uploader_url': 'https://chingari.io/iskcon,inc', + 'track': None, + 'artist': None, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + post_json = self._download_json(f'https://api.chingari.io/post/post_details/{id}', id) + if post_json['code'] != 200: + raise 
ExtractorError(post_json['message'], expected=True) + post_data = post_json['data'] + return self._get_post(id, post_data) + + +class ChingariUserIE(ChingariBaseIE): + _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P[^/?]+)' + _TESTS = [{ + 'url': 'https://chingari.io/dada1023', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'dada1023', + }, + 'entries': [{ + 'url': 'https://chingari.io/share/post?id=614781f3ade60b3a0bfff42a', + 'info_dict': { + 'id': '614781f3ade60b3a0bfff42a', + 'ext': 'mp4', + 'title': '#chingaribappa ', + 'description': 'md5:d1df21d84088770468fa63afe3b17857', + 'duration': 7, + 'thumbnail': 'https://media.chingari.io/uploads/346d86d4-abb2-474e-a164-ffccf2bbcb72-1632076273717/thumbnail/b0b3aac2-2b86-4dd1-909d-9ed6e57cf77c-1632076275552.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1632076275, + 'upload_date': '20210919', + 'uploader_id': '5efc4b12cca35c3d1794c2d3', + 'uploader': 'dada (girish) dhawale', + 'uploader_url': 'https://chingari.io/dada1023', + 'track': None, + 'artist': None + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://chingari.io/share/post?id=6146b132bcbf860959e12cba', + 'info_dict': { + 'id': '6146b132bcbf860959e12cba', + 'ext': 'mp4', + 'title': 'Tactor harvesting', + 'description': 'md5:8403f12dce68828b77ecee7eb7e887b7', + 'duration': 59.3, + 'thumbnail': 'https://media.chingari.io/uploads/b353ca70-7a87-400d-93a6-fa561afaec86-1632022814584/thumbnail/c09302e3-2043-41b1-a2fe-77d97e5bd676-1632022834260.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1632022834, + 'upload_date': '20210919', + 'uploader_id': '5efc4b12cca35c3d1794c2d3', + 'uploader': 'dada (girish) dhawale', + 'uploader_url': 'https://chingari.io/dada1023', + 'track': None, + 'artist': None + }, + 'params': {'skip_download': True} + }, { + 'url': 
'https://chingari.io/share/post?id=6145651b74cb030a64c40b82', + 'info_dict': { + 'id': '6145651b74cb030a64c40b82', + 'ext': 'mp4', + 'title': '#odiabhajan ', + 'description': 'md5:687ea36835b9276cf2af90f25e7654cb', + 'duration': 56.67, + 'thumbnail': 'https://media.chingari.io/uploads/6cbf216b-babc-4cce-87fe-ceaac8d706ac-1631937782708/thumbnail/8855754f-6669-48ce-b269-8cc0699ed6da-1631937819522.jpg', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + 'timestamp': 1631937819, + 'upload_date': '20210918', + 'uploader_id': '5efc4b12cca35c3d1794c2d3', + 'uploader': 'dada (girish) dhawale', + 'uploader_url': 'https://chingari.io/dada1023', + 'track': None, + 'artist': None + }, + 'params': {'skip_download': True} + }], + }, { + 'url': 'https://chingari.io/iskcon%2Cinc', + 'playlist_mincount': 1025, + 'info_dict': { + 'id': 'iskcon%2Cinc', + }, + }] + + def _entries(self, id): + skip = 0 + has_more = True + for page in itertools.count(): + posts = self._download_json('https://api.chingari.io/users/getPosts', id, + data=json.dumps({'userId': id, 'ownerId': id, 'skip': skip, 'limit': 20}).encode(), + headers={'content-type': 'application/json;charset=UTF-8'}, + note='Downloading page %s' % page) + for post in posts.get('data', []): + post_data = post['post'] + yield self._get_post(post_data['_id'], post_data) + skip += 20 + has_more = posts['hasMoreData'] + if not has_more: + break + + def _real_extract(self, url): + alt_id = self._match_id(url) + post_json = self._download_json(f'https://api.chingari.io/user/{alt_id}', alt_id) + if post_json['code'] != 200: + raise ExtractorError(post_json['message'], expected=True) + id = post_json['data']['_id'] + return self.playlist_result(self._entries(id), playlist_id=alt_id) diff --git a/yt_dlp/extractor/ciscowebex.py b/yt_dlp/extractor/ciscowebex.py new file mode 100644 index 0000000000..882dae91b5 --- /dev/null +++ b/yt_dlp/extractor/ciscowebex.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from 
__future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class CiscoWebexIE(InfoExtractor): + IE_NAME = 'ciscowebex' + IE_DESC = 'Cisco Webex' + _VALID_URL = r'''(?x) + (?Phttps?://(?P[^/#?]*)\.webex\.com/(?: + (?P[^/#?]*)/(?:ldr|lsr).php\?(?:[^#]*&)*RCID=(?P[0-9a-f]{32})| + (?:recordingservice|webappng)/sites/(?P[^/#?]*)/recording/(?:playback/|play/)?(?P[0-9a-f]{32}) + ))''' + + _TESTS = [{ + 'url': 'https://demosubdomain.webex.com/demositeurl/ldr.php?RCID=e58e803bc0f766bb5f6376d2e86adb5b', + 'only_matching': True, + }, { + 'url': 'http://demosubdomain.webex.com/demositeurl/lsr.php?RCID=bc04b4a7b5ea2cc3a493d5ae6aaff5d7', + 'only_matching': True, + }, { + 'url': 'https://demosubdomain.webex.com/recordingservice/sites/demositeurl/recording/88e7a42f7b19f5b423c54754aecc2ce9/playback', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + rcid = mobj.group('rcid') + if rcid: + webpage = self._download_webpage(url, None, note='Getting video ID') + url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url') + url = self._request_webpage(url, None, note='Resolving final URL').geturl() + mobj = self._match_valid_url(url) + subdomain = mobj.group('subdomain') + siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') + video_id = mobj.group('id') + + stream = self._download_json( + 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), + video_id, fatal=False, query={'siteurl': siteurl}) + if not stream: + self.raise_login_required(method='cookies') + + video_id = stream.get('recordUUID') or video_id + + formats = [{ + 'format_id': 'video', + 'url': stream['fallbackPlaySrc'], + 'ext': 'mp4', + 'vcodec': 'avc1.640028', + 'acodec': 'mp4a.40.2', + }] + if stream.get('preventDownload') is False: + mp4url = try_get(stream, lambda x: 
x['downloadRecordingInfo']['downloadInfo']['mp4URL']) + if mp4url: + formats.append({ + 'format_id': 'video', + 'url': mp4url, + 'ext': 'mp4', + 'vcodec': 'avc1.640028', + 'acodec': 'mp4a.40.2', + }) + audiourl = try_get(stream, lambda x: x['downloadRecordingInfo']['downloadInfo']['audioURL']) + if audiourl: + formats.append({ + 'format_id': 'audio', + 'url': audiourl, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': stream['recordName'], + 'description': stream.get('description'), + 'uploader': stream.get('ownerDisplayName'), + 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), # mail or id + 'timestamp': unified_timestamp(stream.get('createTime')), + 'duration': int_or_none(stream.get('duration'), 1000), + 'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id), + 'formats': formats, + } diff --git a/yt_dlp/extractor/comedycentral.py b/yt_dlp/extractor/comedycentral.py index 1bfa912be4..5a12ab5e69 100644 --- a/yt_dlp/extractor/comedycentral.py +++ b/yt_dlp/extractor/comedycentral.py @@ -4,7 +4,7 @@ class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P[0-9a-z]{6})' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ @@ -24,6 +24,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, { 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, + }, { + 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', + 'only_matching': True, }] diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b7a55177f9..3260399cb8 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2,8 +2,9 @@ 
from __future__ import unicode_literals import base64 -import datetime +import collections import hashlib +import itertools import json import netrc import os @@ -18,6 +19,7 @@ compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, + compat_expanduser, compat_getpass, compat_http_client, compat_os_name, @@ -52,6 +54,7 @@ GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -72,6 +75,7 @@ strip_or_none, traverse_obj, unescapeHTML, + UnsupportedError, unified_strdate, unified_timestamp, update_Request, @@ -145,6 +149,8 @@ class InfoExtractor(object): * width Width of the video, if known * height Height of the video, if known * resolution Textual description of width and height + * dynamic_range The dynamic range of the video. One of: + "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV" * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use @@ -157,9 +163,8 @@ class InfoExtractor(object): * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". + download, lower-case. One of "http", "https" or + one of the protocols defined in downloader.PROTOCOL_MAP * fragment_base_url Base URL for fragments. Each fragment's path value (if present) will be relative to @@ -175,6 +180,8 @@ class InfoExtractor(object): fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) + * is_from_start Is a live format that can be downloaded + from the start. Boolean * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. 
@@ -231,7 +238,6 @@ class InfoExtractor(object): * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) - * "_test_url" (optional, bool) - If true, test the URL thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -337,6 +343,7 @@ class InfoExtractor(object): series, programme or podcast: series: Title of the series or programme the video episode belongs to. + series_id: Id of the series or programme the video episode belongs to, as a unicode string. season: Title of the season the video episode belongs to. season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. @@ -405,6 +412,10 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + Subclasses may also override suitable() if necessary, but ensure the function + signature is preserved and that this function imports everything it needs + (except other extractors), so that lazy_extractors works correctly + _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on @@ -420,7 +431,7 @@ class InfoExtractor(object): will be used by geo restriction bypass mechanism similarly to _GEO_COUNTRIES. - Finally, the _WORKING attribute should be set to False for broken IEs + The _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. 
""" @@ -433,15 +444,17 @@ class InfoExtractor(object): _WORKING = True _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials', 'cookies': ( - 'Use --cookies for the authentication. ' - 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'), - 'password': 'Use --username and --password or --netrc to provide account credentials', + 'Use --cookies-from-browser or --cookies for the authentication. ' + 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), + 'password': 'Use --username and --password, or --netrc to provide account credentials', } def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" + """Constructor. Receives an optional downloader (a YoutubeDL instance). + If a downloader is not passed during initialization, + it must be set using "set_downloader()" before "extract()" is called""" self._ready = False self._x_forwarded_for_ip = None self._printed_messages = set() @@ -453,6 +466,8 @@ def _match_valid_url(cls, url): # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: + if '_VALID_URL' not in cls.__dict__: + cls._VALID_URL = cls._make_valid_url() cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) @@ -595,10 +610,19 @@ def extract(self, url): if self.__maybe_fake_ip_and_retry(e.countries): continue raise + except UnsupportedError: + raise except ExtractorError as e: - video_id = e.video_id or self.get_temp_id(url) - raise ExtractorError( - e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause) + kwargs = { + 'video_id': e.video_id or self.get_temp_id(url), + 'ie': self.IE_NAME, + 'tb': e.traceback or 
sys.exc_info()[2], + 'expected': e.expected, + 'cause': e.cause + } + if hasattr(e, 'countries'): + kwargs['countries'] = e.countries + raise type(e)(e.msg, **kwargs) except compat_http_client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: @@ -657,7 +681,7 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa See _download_webpage docstring for arguments specification. """ if not self._downloader._first_webpage_request: - sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0 + sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: self.to_screen('Sleeping %s seconds ...' % sleep_interval) time.sleep(sleep_interval) @@ -788,9 +812,10 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno self._downloader.to_screen(dump) if self.get_param('write_pages', False): basen = '%s_%s' % (video_id, urlh.geturl()) - if len(basen) > 240: + trim_length = self.get_param('trim_file_name') or 240 + if len(basen) > trim_length: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:240 - len(h)] + h + basen = basen[:trim_length - len(h)] + h raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) self.to_screen('Saving request to ' + filename) @@ -1056,7 +1081,8 @@ def report_login(self): def raise_login_required( self, msg='This video is only available for registered users', metadata_available=False, method='any'): - if metadata_available and self.get_param('ignore_no_formats_error'): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) if method is not None: msg = '%s. 
%s' % (msg, self._LOGIN_HINTS[method]) @@ -1065,13 +1091,15 @@ def raise_login_required( def raise_geo_restricted( self, msg='This video is not available from your location due to geo restriction', countries=None, metadata_available=False): - if metadata_available and self.get_param('ignore_no_formats_error'): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) else: raise GeoRestrictedError(msg, countries=countries) def raise_no_formats(self, msg, expected=False, video_id=None): - if expected and self.get_param('ignore_no_formats_error'): + if expected and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg, video_id) elif isinstance(msg, ExtractorError): raise msg @@ -1080,12 +1108,13 @@ def raise_no_formats(self, msg, expected=False, video_id=None): # Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None): + def url_result(url, ie=None, video_id=None, video_title=None, **kwargs): """Returns a URL that points to a page that should be processed""" # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} + video_info.update(kwargs) if video_id is not None: video_info['id'] = video_id if video_title is not None: @@ -1128,10 +1157,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f if mobj: break - if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): - _name = '\033[0;34m%s\033[0m' % name - else: - _name = name + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) if mobj: if group is None: @@ -1166,7 +1192,10 @@ def _get_netrc_login_info(self, netrc_machine=None): if self.get_param('usenetrc', False): try: - info = netrc.netrc().authenticators(netrc_machine) + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + 
if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) if info is not None: username = info[0] password = info[2] @@ -1423,11 +1452,19 @@ def extract_video_object(e): }) extract_interaction_statistic(e) - for e in json_ld: - if '@context' in e: + def traverse_json_ld(json_ld, at_top_level=True): + for e in json_ld: + if at_top_level and '@context' not in e: + continue + if at_top_level and set(e.keys()) == {'@context', '@graph'}: + traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + break item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue + rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + if rating is not None: + info['average_rating'] = rating if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1457,7 +1494,7 @@ def extract_video_object(e): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), + 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) elif item_type == 'VideoObject': extract_video_object(e) @@ -1472,8 +1509,35 @@ def extract_video_object(e): continue else: break + traverse_json_ld(json_ld) + return dict((k, v) for k, v in info.items() if v is not None) + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', **kw), + video_id, **kw) + + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): + ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. 
''' + # not all websites do this, but it can be changed + # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + rectx = re.escape(context_name) + js, arg_keys, arg_vals = self._search_regex( + (r'' % rectx, + r'%s\(.*?\(function\((?P.*?)\)\{return\s(?P\{.*?\})\}\((?P.*?)\)' % rectx), + webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + + args = dict(zip(arg_keys.split(','), arg_vals.split(','))) + + for key, val in args.items(): + if val in ('undefined', 'void 0'): + args[key] = 'null' + + return self._parse_json(js_to_json(js, args), video_id)['data'][0] + @staticmethod def _hidden_inputs(html): html = re.sub(r'', '', html) @@ -1500,19 +1564,21 @@ class FormatSort: regex = r' *((?P\+)?(?P[a-zA-Z0-9_]+)((?P[~:])(?P.*?))?)? *$' default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases - ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr', + 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', + 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases + ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'format_id') + 'fps', 'fs_approx', 'source', 'id') settings = { 'vcodec': {'type': 'ordered', 'regex': True, 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, + 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', + 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 
'sdr', None]}, 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']}, + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, 'vext': {'type': 'ordered', 'field': 'video_ext', 'order': ('mp4', 'webm', 'flv', '', 'none'), 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, @@ -1526,8 +1592,8 @@ class FormatSort: 'ie_pref': {'priority': True, 'type': 'extractor'}, 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'}, - 'quality': {'convert': 'float_none', 'default': -1}, + 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, 'filesize': {'convert': 'bytes'}, 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, 'id': {'convert': 'string', 'field': 'format_id'}, @@ -1538,7 +1604,7 @@ class FormatSort: 'vbr': {'convert': 'float_none'}, 'abr': {'convert': 'float_none'}, 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'ignore', 'field': 'source_preference'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, @@ -1547,7 +1613,12 @@ class FormatSort: 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - # Most of these exist only for compatibility reasons + # For compatibility with youtube-dl + 'format_id': {'type': 'alias', 'field': 'id'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'language_preference': {'type': 'alias', 'field': 
'lang'}, + + # Deprecated 'dimension': {'type': 'alias', 'field': 'res'}, 'resolution': {'type': 'alias', 'field': 'res'}, 'extension': {'type': 'alias', 'field': 'ext'}, @@ -1556,7 +1627,6 @@ class FormatSort: 'video_bitrate': {'type': 'alias', 'field': 'vbr'}, 'audio_bitrate': {'type': 'alias', 'field': 'abr'}, 'framerate': {'type': 'alias', 'field': 'fps'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists 'protocol': {'type': 'alias', 'field': 'proto'}, 'source_preference': {'type': 'alias', 'field': 'source'}, 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, @@ -1571,15 +1641,23 @@ class FormatSort: 'audio': {'type': 'alias', 'field': 'hasaud'}, 'has_audio': {'type': 'alias', 'field': 'hasaud'}, 'extractor': {'type': 'alias', 'field': 'ie_pref'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'}, - 'format_id': {'type': 'alias', 'field': 'id'}, } - _order = [] + def __init__(self, ie, field_preference): + self._order = [] + self.ydl = ie._downloader + self.evaluate_params(self.ydl.params, field_preference) + if ie.get_param('verbose'): + self.print_verbose_info(self.ydl.write_debug) def _get_field_setting(self, field, key): if field not in self.settings: + if key in ('forced', 'priority'): + return False + self.ydl.deprecation_warning( + f'Using arbitrary fields ({field}) for format sorting is deprecated ' + 'and may be removed in a future version') self.settings[field] = {} propObj = self.settings[field] if key not in propObj: @@ -1662,7 +1740,11 @@ def add_item(field, reverse, closest, limit_text): if field is None: continue if self._get_field_setting(field, 'type') == 'alias': - field = self._get_field_setting(field, 'field') + alias, field = field, self._get_field_setting(field, 'field') + if alias not in ('format_id', 'preference', 'language_preference'): + self.ydl.deprecation_warning( + f'Format sorting alias 
{alias} is deprecated ' + f'and may be removed in a future version. Please use {field} instead') reverse = match.group('reverse') is not None closest = match.group('separator') == '~' limit_text = match.group('limit') @@ -1672,7 +1754,7 @@ def add_item(field, reverse, closest, limit_text): has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) - limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple() + limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() limit_count = len(limits) for (i, f) in enumerate(fields): add_item(f, reverse, closest, @@ -1756,9 +1838,9 @@ def calculate_preference(self, format): if format.get('vbr') is not None and format.get('abr') is not None: format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) else: - if format.get('vcodec') != "none" and format.get('vbr') is None: + if format.get('vcodec') != 'none' and format.get('vbr') is None: format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != "none" and format.get('abr') is None: + if format.get('acodec') != 'none' and format.get('abr') is None: format['abr'] = format.get('tbr') - format.get('vbr', 0) return tuple(self._calculate_field_preference(format, field) for field in self._order) @@ -1766,10 +1848,7 @@ def calculate_preference(self, format): def _sort_formats(self, formats, field_preference=[]): if not formats: return - format_sort = self.FormatSort() # params and to_screen are taken from the downloader - format_sort.evaluate_params(self._downloader.params, field_preference) - if self.get_param('verbose', False): - format_sort.print_verbose_info(self._downloader.write_debug) + format_sort = self.FormatSort(self, field_preference) formats.sort(key=lambda f: format_sort.calculate_preference(f)) def _check_formats(self, formats, 
video_id): @@ -1888,7 +1967,7 @@ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + format_id = join_nonempty(f4m_id, tbr or i) # If is present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources. See section 11.4 and section 4 of F4M spec @@ -1950,7 +2029,7 @@ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -1960,13 +2039,16 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m 'format_note': 'Quality selection URL', } + def _report_ignoring_subs(self, name): + self.report_warning(bug_reports_message( + f'Ignoring subtitle tracks found in the {name} manifest; ' + 'if any subtitle tracks are missing,' + ), only_once=True) + def _extract_m3u8_formats(self, *args, **kwargs): fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the HLS manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('HLS') return fmts def _extract_m3u8_formats_and_subtitles( @@ -2000,10 +2082,10 @@ def _parse_m3u8_formats_and_subtitles( video_id=None): formats, subtitles = [], {} - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return formats, subtitles - - has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash 
Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2042,7 +2124,7 @@ def _extract_m3u8_playlist_indices(*args, **kwargs): if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is formats = [{ - 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, 'url': m3u8_url, 'ext': ext, @@ -2091,7 +2173,7 @@ def extract_media(x_media_line): if media_url: manifest_url = format_url(media_url) formats.extend({ - 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, 'format_index': idx, 'url': manifest_url, @@ -2148,9 +2230,9 @@ def build_stream_name(): # format_id intact. if not live: stream_name = build_stream_name() - format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + format_id[1] = stream_name or '%d' % (tbr or len(formats)) f = { - 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_id': join_nonempty(*format_id), 'format_index': idx, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2214,6 +2296,25 @@ def build_stream_name(): last_stream_inf = {} return formats, subtitles + def _extract_m3u8_vod_duration( + self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}): + + m3u8_vod = self._download_webpage( + m3u8_vod_url, video_id, + note='Downloading m3u8 VOD manifest' if note is None else note, + errnote='Failed to download VOD manifest' if errnote is None else errnote, + fatal=False, data=data, headers=headers, query=query) + + return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id) + + def _parse_m3u8_vod_duration(self, m3u8_vod, video_id): + if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod: + return None + + return int(sum( + 
float(line[len('#EXTINF:'):].split(',')[0]) + for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None + @staticmethod def _xpath_ns(path, namespace=None): if not namespace: @@ -2231,7 +2332,7 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4 if smil is False: assert not fatal - return [] + return [], {} namespace = self._parse_smil_namespace(smil) @@ -2245,10 +2346,7 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4 def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the SMIL manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('SMIL') return fmts def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): @@ -2318,14 +2416,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para rtmp_count = 0 http_count = 0 m3u8_count = 0 + imgs_count = 0 - srcs = [] + srcs = set() media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) for medium in media: src = medium.get('src') if not src or src in srcs: continue - srcs.append(src) + srcs.add(src) bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) filesize = int_or_none(medium.get('size') or medium.get('fileSize')) @@ -2399,6 +2498,24 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para 'height': height, }) + for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)): + src = medium.get('src') + if not src or src in srcs: + continue + srcs.add(src) + + imgs_count += 1 + formats.append({ + 'format_id': 'imagestream-%d' % (imgs_count), + 'url': src, + 'ext': mimetype2ext(medium.get('type')), + 'acodec': 'none', + 'vcodec': 'none', + 'width': 
int_or_none(medium.get('width')), + 'height': int_or_none(medium.get('height')), + 'format_note': 'SMIL storyboards', + }) + return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -2471,10 +2588,7 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): def _extract_mpd_formats(self, *args, **kwargs): fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the DASH manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('DASH') return fmts def _extract_mpd_formats_and_subtitles( @@ -2498,10 +2612,7 @@ def _extract_mpd_formats_and_subtitles( def _parse_mpd_formats(self, *args, **kwargs): fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the DASH manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('DASH') return fmts def _parse_mpd_formats_and_subtitles( @@ -2585,7 +2696,7 @@ def extract_Initialization(source): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats, subtitles = [], {} - stream_numbers = {'audio': 0, 'video': 0} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2607,6 +2718,8 @@ def extract_Initialization(source): content_type = mime_type elif codecs.split('.')[0] == 'stpp': content_type = 'text' + elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): + content_type = 'text' else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) continue @@ -2618,8 +2731,10 @@ def extract_Initialization(source): base_url = base_url_e.text + base_url if 
re.match(r'^https?://', base_url): break - if mpd_base_url and not re.match(r'^https?://', base_url): - if not mpd_base_url.endswith('/') and not base_url.startswith('/'): + if mpd_base_url and base_url.startswith('/'): + base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + elif mpd_base_url and not re.match(r'^https?://', base_url): + if not mpd_base_url.endswith('/'): mpd_base_url += '/' base_url = mpd_base_url + base_url representation_id = representation_attrib.get('id') @@ -2647,10 +2762,8 @@ def extract_Initialization(source): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - 'manifest_stream_number': stream_numbers[content_type] } f.update(parse_codecs(codecs)) - stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2817,7 +2930,9 @@ def add_segment_url(): else: # Assuming direct URL to unfragmented media. f['url'] = base_url - if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) @@ -2827,10 +2942,7 @@ def add_segment_url(): def _extract_ism_formats(self, *args, **kwargs): fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the ISM manifest; " - "if any subtitle tracks are missing," - )) + self._report_ignoring_subs('ISM') return fmts def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): @@ -2909,13 +3021,6 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): }) fragment_ctx['time'] += fragment_ctx['duration'] - format_id = [] - if ism_id: - format_id.append(ism_id) 
- if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - if stream_type == 'text': subtitles.setdefault(stream_language, []).append({ 'ext': 'ismt', @@ -2934,7 +3039,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): }) elif stream_type in ('video', 'audio'): formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(ism_id, stream_name, tbr), 'url': ism_url, 'manifest_url': ism_url, 'ext': 'ismv' if stream_type == 'video' else 'isma', @@ -3090,10 +3195,7 @@ def _media_formats(src, cur_media_type, type_info={}): def _extract_akamai_formats(self, *args, **kwargs): fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the manifests; " - "if any subtitle tracks are missing," - )) + self._report_ignoring_subs('akamai') return fmts def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}): @@ -3361,10 +3463,8 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, return formats def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str + self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected') + return name def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) @@ -3467,6 +3567,36 @@ def extract_subtitles(self, *args, **kwargs): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def extract_comments(self, *args, **kwargs): + if not self.get_param('getcomments'): + return None + generator = self._get_comments(*args, **kwargs) + + def extractor(): + comments = [] + interrupted = True + try: + while True: + comments.append(next(generator)) + except StopIteration: + 
interrupted = False + except KeyboardInterrupt: + self.to_screen('Interrupted by user') + except Exception as e: + if self.get_param('ignoreerrors') is not True: + raise + self._downloader.report_error(e) + comment_count = len(comments) + self.to_screen(f'Extracted {comment_count} comments') + return { + 'comments': comments, + 'comment_count': None if interrupted else comment_count + } + return extractor + + def _get_comments(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + @staticmethod def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. Items with duplicated URLs @@ -3496,9 +3626,11 @@ def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') def mark_watched(self, *args, **kwargs): - if (self.get_param('mark_watched', False) - and (self._get_login_info()[0] is not None - or self.get_param('cookiefile') is not None)): + if not self.get_param('mark_watched', False): + return + if (self._get_login_info()[0] is not None + or self.get_param('cookiefile') + or self.get_param('cookiesfrombrowser')): self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3531,7 +3663,7 @@ def _availability(is_private=None, needs_premium=None, needs_subscription=None, else 'public' if all_known else None) - def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False): + def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False): ''' @returns A list of values for the extractor argument given by "key" or "default" if no such key is present @@ -3539,7 +3671,7 @@ def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False): @param casesense When false, the values are converted to lower case ''' val = traverse_obj( - self._downloader.params, ('extractor_args', self.ie_key().lower(), key)) + self._downloader.params, ('extractor_args', (ie_key or 
self.ie_key()).lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] @@ -3549,24 +3681,17 @@ class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. + Instances should define _SEARCH_KEY and optionally _MAX_RESULTS """ + _MAX_RESULTS = float('inf') + @classmethod def _make_valid_url(cls): return r'%s(?P|[1-9][0-9]*|all):(?P[\s\S]+)' % cls._SEARCH_KEY - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') + prefix, query = self._match_valid_url(query).group('prefix', 'query') if prefix == '': return self._get_n_results(query, 1) elif prefix == 'all': @@ -3581,7 +3706,14 @@ def _real_extract(self, query): return self._get_n_results(query, n) def _get_n_results(self, query, n): - """Get a specified number of results for a query""" + """Get a specified number of results for a query. 
+ Either this function or _search_results must be overridden by subclasses """ + return self.playlist_result( + itertools.islice(self._search_results(query), 0, None if n == float('inf') else n), + query, query) + + def _search_results(self, query): + """Returns an iterator of search results""" raise NotImplementedError('This method must be implemented by subclasses') @property diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py index 352951e201..119461375e 100644 --- a/yt_dlp/extractor/corus.py +++ b/yt_dlp/extractor/corus.py @@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE): 'timestamp': 1486392197, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], diff --git a/yt_dlp/extractor/coub.py b/yt_dlp/extractor/coub.py index eba6b73baa..e90aa1954f 100644 --- a/yt_dlp/extractor/coub.py +++ b/yt_dlp/extractor/coub.py @@ -57,7 +57,7 @@ def _real_extract(self, url): file_versions = coub['file_versions'] - QUALITIES = ('low', 'med', 'high') + QUALITIES = ('low', 'med', 'high', 'higher') MOBILE = 'mobile' IPHONE = 'iphone' @@ -86,6 +86,7 @@ def _real_extract(self, url): 'format_id': '%s-%s-%s' % (HTML5, kind, quality), 'filesize': int_or_none(item.get('size')), 'vcodec': 'none' if kind == 'audio' else None, + 'acodec': 'none' if kind == 'video' else None, 'quality': quality_key(quality), 'source_preference': preference_key(HTML5), }) diff --git a/yt_dlp/extractor/cozytv.py b/yt_dlp/extractor/cozytv.py new file mode 100644 index 0000000000..d49f1ca744 --- /dev/null +++ b/yt_dlp/extractor/cozytv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class CozyTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cozy\.tv/(?P[^/]+)/replays/(?P[^/$#&?]+)' + + _TESTS = [{ + 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1', + 'info_dict': { + 'id': 'beardson-2021-11-19_1', + 'ext': 'mp4', + 
'title': 'pokemon pt2', + 'uploader': 'beardson', + 'upload_date': '20211119', + 'was_live': True, + 'duration': 7981, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + uploader, date = self._match_valid_url(url).groups() + id = f'{uploader}-{date}' + data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4') + return { + 'id': id, + 'title': data_json.get('title'), + 'uploader': data_json.get('user') or uploader, + 'upload_date': unified_strdate(data_json.get('date')), + 'was_live': True, + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/crackle.py b/yt_dlp/extractor/crackle.py index 2c9d28d2e2..db4962c422 100644 --- a/yt_dlp/extractor/crackle.py +++ b/yt_dlp/extractor/crackle.py @@ -23,32 +23,35 @@ class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' _TESTS = [{ - # geo restricted to CA - 'url': 'https://www.crackle.com/andromeda/2502343', + # Crackle is available in the United States and territories + 'url': 'https://www.crackle.com/thanksgiving/2510064', 'info_dict': { - 'id': '2502343', + 'id': '2510064', 'ext': 'mp4', - 'title': 'Under The Night', - 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', - 'duration': 2583, + 'title': 'Touch Football', + 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df', + 'duration': 1398, 'view_count': int, 'average_rating': 0, - 'age_limit': 14, - 'genre': 'Action, Sci-Fi', - 'creator': 'Allan Kroeker', - 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', - 'release_year': 2000, - 'series': 'Andromeda', - 'episode': 'Under The Night', + 'age_limit': 17, + 'genre': 'Comedy', + 'creator': 'Daniel Powell', 
+ 'artist': 'Chris Elliott, Amy Sedaris', + 'release_year': 2016, + 'series': 'Thanksgiving', + 'episode': 'Touch Football', 'season_number': 1, 'episode_number': 1, }, 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': [ + 'Trying with a list of known countries' + ], }, { - 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'url': 'https://www.sonycrackle.com/thanksgiving/2510064', 'only_matching': True, }] @@ -129,7 +132,6 @@ def _real_extract(self, url): break ignore_no_formats = self.get_param('ignore_no_formats_error') - allow_unplayable_formats = self.get_param('allow_unplayable_formats') if not media or (not media.get('MediaURLs') and not ignore_no_formats): raise ExtractorError( @@ -143,9 +145,9 @@ def _real_extract(self, url): for e in media.get('MediaURLs') or []: if e.get('UseDRM'): has_drm = True - if not allow_unplayable_formats: - continue - format_url = url_or_none(e.get('Path')) + format_url = url_or_none(e.get('DRMPath')) + else: + format_url = url_or_none(e.get('Path')) if not format_url: continue ext = determine_ext(format_url) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 256c6943f2..cd35728e58 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -27,6 +27,7 @@ int_or_none, lowercase_escape, merge_dicts, + qualities, remove_end, sanitized_Request, try_get, @@ -478,19 +479,24 @@ def _real_extract(self, url): [r']+href="/publisher/[^"]+"[^>]*>([^<]+)', r'
\s*Publisher:\s*\s*(.+?)\s*\s*
'], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] @@ -650,7 +656,7 @@ def _real_extract(self, url): class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P[\w\-]+))/?(?:\?|$)' + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', @@ -672,6 +678,9 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # geo-restricted (US), 18+ maturity wall, non-premium will be 
available since 2015.11.14 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', 'only_matching': True, + }, { + 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers', + 'only_matching': True, }] def _real_extract(self, url): @@ -683,18 +692,72 @@ def _real_extract(self, url): headers=self.geo_verification_headers()) title = self._html_search_meta('name', webpage, default=None) - episode_paths = re.findall( - r'(?s)
  • ]+>.*?]+>.*?]+season-dropdown[^>]+>([^<]+)' + paths = re.findall(f'(?s){episode_re}|{season_re}', webpage) + + entries, current_season = [], None + for ep_id, ep, season in paths: + if season: + current_season = season + continue + entries.append(self.url_result( + f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season)) return { '_type': 'playlist', 'id': show_id, 'title': title, - 'entries': entries, + 'entries': reversed(entries), } + + +class CrunchyrollBetaIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:beta' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P(?:\w{1,2}/)?)watch/(?P\w+)/(?P[\w\-]+)/?(?:\?|$)' + _TESTS = [{ + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'info_dict': { + 'id': '696363', + 'ext': 'mp4', + 'timestamp': 1459610100, + 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', + 'uploader': 'Toei Animation', + 'title': 'World Trigger Episode 73 – To the Future', + 'upload_date': '20160402', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }] + + def _real_extract(self, url): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') + webpage = self._download_webpage(url, display_id) + episode_data = self._parse_json( + self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), + display_id)['content']['byId'][internal_id] + video_id = episode_data['external_id'].split('.')[1] + series_id = episode_data['episode_metadata']['series_slug_title'] + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', + CrunchyrollIE.ie_key(), video_id) + + +class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist:beta' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P(?:\w{1,2}/)?)series/\w+/(?P[\w\-]+)/?(?:\?|$)' + _TESTS = [{ + 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', + 'info_dict': 
{ + 'id': 'girl-friend-beta', + 'title': 'Girl Friend BETA', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'only_matching': True, + }] + + def _real_extract(self, url): + lang, series_id = self._match_valid_url(url).group('lang', 'id') + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', + CrunchyrollShowPlaylistIE.ie_key(), series_id) diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index 2e01aff488..c717aec3ac 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -18,7 +18,7 @@ str_to_int, unescapeHTML, ) -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .ustream import UstreamIE diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 034a5c92ad..485b6031fc 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -15,7 +15,6 @@ class CuriosityStreamBaseIE(InfoExtractor): _NETRC_MACHINE = 'curiositystream' _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' def _handle_errors(self, result): error = result.get('error', {}).get('message') @@ -39,38 +38,44 @@ def _real_initialize(self): if email is None: return result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'https://api.curiositystream.com/v1/login', None, + note='Logging in', data=urlencode_postdata({ 'email': email, 'password': password, })) self._handle_errors(result) - self._auth_token = result['message']['auth_token'] + CuriosityStreamBaseIE._auth_token = result['message']['auth_token'] class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 
'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'channel': 'Curiosity Stream', + 'categories': ['Technology', 'Interview'], + 'average_rating': 96.79, + 'series_id': '2', }, 'params': { - 'format': 'bestvideo', # m3u8 download 'skip_download': True, }, - } + }] + + _API_BASE_URL = 'https://api.curiositystream.com/v1/media/' def _real_extract(self, url): video_id = self._match_id(url) formats = [] for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ + media = self._call_api(video_id, video_id, query={ 'encodingsNew': 'true', 'encodingsFormat': encoding_format, }) @@ -140,12 +145,33 @@ def _real_extract(self, url): 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, + 'channel': media.get('producer'), + 'categories': [media.get('primary_category'), media.get('type')], + 'average_rating': media.get('rating_percentage'), + 'series_id': str(media.get('collection_id') or '') or None, } -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P\d+)' +class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE): + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) + + +class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE): + 
IE_NAME = 'curiositystream:collections' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P\d+)' _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' _TESTS = [{ 'url': 'https://curiositystream.com/collections/86', @@ -156,7 +182,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 7, }, { - 'url': 'https://app.curiositystream.com/collection/2', + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + +class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:series' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/series/' + _TESTS = [{ + 'url': 'https://curiositystream.com/series/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', @@ -164,23 +200,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 16, }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', + 'url': 'https://curiositystream.com/collection/2', 'only_matching': True, }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api(collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) - entries.append(self.url_result( - 'https://curiositystream.com/%s/%s' % (media_type, media_id), - ie=ie.ie_key(), video_id=media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index e04e10b865..b4211e1e44 100644 --- a/yt_dlp/extractor/dailymotion.py +++ 
b/yt_dlp/extractor/dailymotion.py @@ -305,7 +305,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': clean_html(media.get('description')), 'thumbnails': thumbnails, 'duration': int_or_none(metadata.get('duration')) or None, diff --git a/yt_dlp/extractor/damtomo.py b/yt_dlp/extractor/damtomo.py new file mode 100644 index 0000000000..456cd35a44 --- /dev/null +++ b/yt_dlp/extractor/damtomo.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError, clean_html, int_or_none, try_get, unified_strdate +from ..compat import compat_str + + +class DamtomoBaseIE(InfoExtractor): + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, handle = self._download_webpage_handle(self._WEBPAGE_URL_TMPL % video_id, video_id, encoding='sjis') + + if handle.url == 'https://www.clubdam.com/sorry/': + raise ExtractorError('You are rate-limited. Try again later.', expected=True) + if '

    予期せぬエラーが発生しました。

    ' in webpage: + raise ExtractorError('There is an error on server-side. Try again later.', expected=True) + + description = self._search_regex(r'(?m)
    \s*

    \s*([^<]*?)\s*

    ', webpage, 'description', default=None) + uploader_id = self._search_regex(r'
    (?P.+?)', webpage)} + + # since videos do not have title, give the name of song instead + data_dict['user_name'] = re.sub(r'\s*さん\s*$', '', data_dict['user_name']) + title = data_dict.get('song_title') + + stream_tree = self._download_xml( + self._DKML_XML_URL % video_id, video_id, note='Requesting stream information', encoding='sjis', + # doing this has no problem since there is no character outside ASCII, + # and never likely to happen in the future + transform_source=lambda x: re.sub(r'\s*encoding="[^"]+?"', '', x)) + m3u8_url = try_get(stream_tree, lambda x: x.find( + './/d:streamingUrl', {'d': self._DKML_XML_NS}).text.strip(), compat_str) + if not m3u8_url: + raise ExtractorError('Failed to obtain m3u8 URL') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'uploader_id': uploader_id, + 'description': description, + 'uploader': data_dict.get('user_name'), + 'upload_date': unified_strdate(self._search_regex(r'(\d{4}/\d{2}/\d{2})', data_dict.get('date'), 'upload_date', default=None)), + 'view_count': int_or_none(self._search_regex(r'(\d+)', data_dict['audience'], 'view_count', default=None)), + 'like_count': int_or_none(self._search_regex(r'(\d+)', data_dict['nice'], 'like_count', default=None)), + 'track': title, + 'artist': data_dict.get('song_artist'), + 'formats': formats, + } + + +class DamtomoVideoIE(DamtomoBaseIE): + IE_NAME = 'damtomo:video' + _VALID_URL = r'https?://(?:www\.)?clubdam\.com/app/damtomo/(?:SP/)?karaokeMovie/StreamingDkm\.do\?karaokeMovieId=(?P\d+)' + _WEBPAGE_URL_TMPL = 'https://www.clubdam.com/app/damtomo/karaokeMovie/StreamingDkm.do?karaokeMovieId=%s' + _DKML_XML_URL = 'https://www.clubdam.com/app/damtomo/karaokeMovie/GetStreamingDkmUrlXML.do?movieSelectFlg=2&karaokeMovieId=%s' + _DKML_XML_NS = 'https://www.clubdam.com/app/damtomo/karaokeMovie/GetStreamingDkmUrlXML' + _TESTS = [{ + 'url': 
'https://www.clubdam.com/app/damtomo/karaokeMovie/StreamingDkm.do?karaokeMovieId=2414316', + 'info_dict': { + 'id': '2414316', + 'title': 'Get Wild', + 'uploader': 'Kドロン', + 'uploader_id': 'ODk5NTQwMzQ', + 'track': 'Get Wild', + 'artist': 'TM NETWORK(TMN)', + 'upload_date': '20201226', + } + }] + + +class DamtomoRecordIE(DamtomoBaseIE): + IE_NAME = 'damtomo:record' + _VALID_URL = r'https?://(?:www\.)?clubdam\.com/app/damtomo/(?:SP/)?karaokePost/StreamingKrk\.do\?karaokeContributeId=(?P\d+)' + _WEBPAGE_URL_TMPL = 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=%s' + _DKML_XML_URL = 'https://www.clubdam.com/app/damtomo/karaokePost/GetStreamingKrkUrlXML.do?karaokeContributeId=%s' + _DKML_XML_NS = 'https://www.clubdam.com/app/damtomo/karaokePost/GetStreamingKrkUrlXML' + _TESTS = [{ + 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27376862', + 'info_dict': { + 'id': '27376862', + 'title': 'イカSUMMER [良音]', + 'description': None, + 'uploader': 'NANA', + 'uploader_id': 'MzAyMDExNTY', + 'upload_date': '20210721', + 'view_count': 4, + 'like_count': 1, + 'track': 'イカSUMMER [良音]', + 'artist': 'ORANGE RANGE', + } + }, { + 'url': 'https://www.clubdam.com/app/damtomo/karaokePost/StreamingKrk.do?karaokeContributeId=27489418', + 'info_dict': { + 'id': '27489418', + 'title': '心みだれて〜say it with flowers〜(生音)', + 'uploader_id': 'NjI1MjI2MjU', + 'description': 'やっぱりキーを下げて正解だった感じ。リベンジ成功ということで。', + 'uploader': '箱の「中の人」', + 'upload_date': '20210815', + 'view_count': 5, + 'like_count': 3, + 'track': '心みだれて〜say it with flowers〜(生音)', + 'artist': '小林明子', + } + }] diff --git a/yt_dlp/extractor/discoverynetworks.py b/yt_dlp/extractor/discoverynetworks.py deleted file mode 100644 index f43c871602..0000000000 --- a/yt_dlp/extractor/discoverynetworks.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -from .dplay import DPlayIE - - -class DiscoveryNetworksDeIE(DPlayIE): - 
_VALID_URL = r'https?://(?:www\.)?(?P(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P[^/]+)/(?:video/)?(?P[^/]+)' - - _TESTS = [{ - 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', - 'info_dict': { - 'id': '78867', - 'ext': 'mp4', - 'title': 'Die Welt da draußen', - 'description': 'md5:61033c12b73286e409d99a41742ef608', - 'timestamp': 1554069600, - 'upload_date': '20190331', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', - 'only_matching': True, - }, { - 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', - 'only_matching': True, - }, { - 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, programme, alternate_id = self._match_valid_url(url).groups() - country = 'GB' if domain == 'dplay.co.uk' else 'DE' - realm = 'questuk' if country == 'GB' else domain.replace('.', '') - return self._get_disco_api_info( - url, '%s/%s' % (programme, alternate_id), - 'sonic-eu1-prod.disco-api.com', realm, country) diff --git a/yt_dlp/extractor/discoveryplusindia.py b/yt_dlp/extractor/discoveryplusindia.py deleted file mode 100644 index 51801402c3..0000000000 --- a/yt_dlp/extractor/discoveryplusindia.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from ..compat import compat_str -from ..utils import try_get -from .common import InfoExtractor -from .dplay import DPlayIE - - -class DiscoveryPlusIndiaIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' 
+ DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', - 'info_dict': { - 'id': '27104', - 'ext': 'mp4', - 'display_id': 'how-do-they-do-it/fugu-and-more', - 'title': 'Fugu and More', - 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', - 'duration': 1319, - 'timestamp': 1582309800, - 'upload_date': '20200221', - 'series': 'How Do They Do It?', - 'season_number': 8, - 'episode_number': 2, - 'creator': 'Discovery Channel', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'skip': 'Cookies (not necessarily logged in) are needed' - }] - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['x-disco-params'] = 'realm=%s' % realm - headers['x-disco-client'] = 'WEB:UNKNOWN:dplus-india:17.0.0' - - def _download_video_playback_info(self, disco_base, video_id, headers): - return self._download_json( - disco_base + 'playback/v3/videoPlaybackInfo', - video_id, headers=headers, data=json.dumps({ - 'deviceInfo': { - 'adBlocker': False, - }, - 'videoId': video_id, - }).encode('utf-8'))['data']['attributes']['streaming'] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in') - - -class DiscoveryPlusIndiaShowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P[^/]+)/?(?:[?#]|$)' - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', - 'playlist_mincount': 140, - 'info_dict': { - 'id': 'how-do-they-do-it', - }, - }] - - def _entries(self, show_name): - headers = { - 'x-disco-client': 'WEB:UNKNOWN:dplus-india:prod', - 'x-disco-params': 'realm=dplusindia', - 'referer': 'https://www.discoveryplus.in/', - } - show_url = 'https://ap2-prod-direct.discoveryplus.in/cms/routes/show/{}?include=default'.format(show_name) - show_json = 
self._download_json(show_url, - video_id=show_name, - headers=headers)['included'][4]['attributes']['component'] - show_id = show_json['mandatoryParams'].split('=')[-1] - season_url = 'https://ap2-prod-direct.discoveryplus.in/content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' - for season in show_json['filters'][0]['options']: - season_id = season['id'] - total_pages, page_num = 1, 0 - while page_num < total_pages: - season_json = self._download_json(season_url.format(season_id, show_id, compat_str(page_num + 1)), - video_id=show_id, headers=headers, - note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) - if page_num == 0: - total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 - episodes_json = season_json['data'] - for episode in episodes_json: - video_id = episode['attributes']['path'] - yield self.url_result( - 'https://discoveryplus.in/videos/%s' % video_id, - ie=DiscoveryPlusIndiaIE.ie_key(), video_id=video_id) - page_num += 1 - - def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - return self.playlist_result(self._entries(show_name), playlist_id=show_name) diff --git a/yt_dlp/extractor/disney.py b/yt_dlp/extractor/disney.py index f018cbe9dd..0ad7b1f462 100644 --- a/yt_dlp/extractor/disney.py +++ b/yt_dlp/extractor/disney.py @@ -7,8 +7,8 @@ from ..utils import ( int_or_none, unified_strdate, - compat_str, determine_ext, + join_nonempty, update_url_query, ) @@ -119,18 +119,13 @@ def _real_extract(self, url): continue formats.append(f) continue - format_id = [] - if flavor_format: - format_id.append(flavor_format) - if tbr: - format_id.append(compat_str(tbr)) ext = determine_ext(flavor_url) if flavor_format == 'applehttp' or ext == 'm3u8': ext = 'mp4' width = int_or_none(flavor.get('width')) height = int_or_none(flavor.get('height')) formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': 
join_nonempty(flavor_format, tbr), 'url': flavor_url, 'width': width, 'height': height, diff --git a/yt_dlp/extractor/dlive.py b/yt_dlp/extractor/dlive.py index 90462c0abb..7410eb6c87 100644 --- a/yt_dlp/extractor/dlive.py +++ b/yt_dlp/extractor/dlive.py @@ -84,7 +84,7 @@ def _real_extract(self, url): self._sort_formats(formats) return { 'id': display_name, - 'title': self._live_title(title), + 'title': title, 'uploader': display_name, 'uploader_id': username, 'formats': formats, diff --git a/yt_dlp/extractor/douyin.py b/yt_dlp/extractor/douyin.py deleted file mode 100644 index 7f3176be7a..0000000000 --- a/yt_dlp/extractor/douyin.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding: utf-8 - -from ..utils import ( - int_or_none, - traverse_obj, - url_or_none, -) -from .common import ( - InfoExtractor, - compat_urllib_parse_unquote, -) - - -class DouyinIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P[0-9]+)' - _TESTS = [{ - 'url': 'https://www.douyin.com/video/6961737553342991651', - 'md5': '10523312c8b8100f353620ac9dc8f067', - 'info_dict': { - 'id': '6961737553342991651', - 'ext': 'mp4', - 'title': '#杨超越 小小水手带你去远航❤️', - 'uploader': '杨超越', - 'upload_date': '20210513', - 'timestamp': 1620905839, - 'uploader_id': '110403406559', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - } - }, { - 'url': 'https://www.douyin.com/video/6982497745948921092', - 'md5': 'd78408c984b9b5102904cf6b6bc2d712', - 'info_dict': { - 'id': '6982497745948921092', - 'ext': 'mp4', - 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想', - 'uploader': '杨超越工作室', - 'upload_date': '20210708', - 'timestamp': 1625739481, - 'uploader_id': '408654318141572', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - } - }, { - 'url': 'https://www.douyin.com/video/6953975910773099811', - 'md5': '72e882e24f75064c218b76c8b713c185', - 'info_dict': { - 'id': '6953975910773099811', - 'ext': 'mp4', - 'title': '#一起看海 出现在你的夏日里', - 'uploader': 
'杨超越', - 'upload_date': '20210422', - 'timestamp': 1619098692, - 'uploader_id': '110403406559', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - } - }, { - 'url': 'https://www.douyin.com/video/6950251282489675042', - 'md5': 'b4db86aec367ef810ddd38b1737d2fed', - 'info_dict': { - 'id': '6950251282489675042', - 'ext': 'mp4', - 'title': '哈哈哈,成功了哈哈哈哈哈哈', - 'uploader': '杨超越', - 'upload_date': '20210412', - 'timestamp': 1618231483, - 'uploader_id': '110403406559', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - } - }, { - 'url': 'https://www.douyin.com/video/6963263655114722595', - 'md5': '1abe1c477d05ee62efb40bf2329957cf', - 'info_dict': { - 'id': '6963263655114722595', - 'ext': 'mp4', - 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈', - 'uploader': '杨超越', - 'upload_date': '20210517', - 'timestamp': 1621261163, - 'uploader_id': '110403406559', - 'view_count': int, - 'like_count': int, - 'repost_count': int, - 'comment_count': int, - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - render_data = self._parse_json( - self._search_regex( - r'', - webpage, 'render data'), - video_id, transform_source=compat_urllib_parse_unquote) - details = traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False) - - thumbnails = [{'url': self._proto_relative_url(url)} for url in traverse_obj( - details, ('video', ('cover', 'dynamicCover', 'originCover')), expected_type=url_or_none, default=[])] - - common = { - 'width': traverse_obj(details, ('video', 'width'), expected_type=int), - 'height': traverse_obj(details, ('video', 'height'), expected_type=int), - 'ext': 'mp4', - } - formats = [{**common, 'url': self._proto_relative_url(url)} for url in traverse_obj( - details, ('video', 'playAddr', ..., 'src'), expected_type=url_or_none, default=[]) if url] - self._remove_duplicate_formats(formats) - - download_url = traverse_obj(details, 
('download', 'url'), expected_type=url_or_none) - if download_url: - formats.append({ - **common, - 'format_id': 'download', - 'url': self._proto_relative_url(download_url), - 'quality': 1, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': details.get('desc') or self._html_search_meta('title', webpage), - 'formats': formats, - 'thumbnails': thumbnails, - 'uploader': traverse_obj(details, ('authorInfo', 'nickname'), expected_type=str), - 'uploader_id': traverse_obj(details, ('authorInfo', 'uid'), expected_type=str), - 'uploader_url': 'https://www.douyin.com/user/%s' % traverse_obj( - details, ('authorInfo', 'secUid'), expected_type=str), - 'timestamp': int_or_none(details.get('createTime')), - 'duration': traverse_obj(details, ('video', 'duration'), expected_type=int), - 'view_count': traverse_obj(details, ('stats', 'playCount'), expected_type=int), - 'like_count': traverse_obj(details, ('stats', 'diggCount'), expected_type=int), - 'repost_count': traverse_obj(details, ('stats', 'shareCount'), expected_type=int), - 'comment_count': traverse_obj(details, ('stats', 'commentCount'), expected_type=int), - } diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index 9757f44225..26a8d645cd 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -105,7 +105,7 @@ def _real_extract(self, url): 'aid': 'pcclient' })['data']['live_url'] - title = self._live_title(unescapeHTML(room['room_name'])) + title = unescapeHTML(room['room_name']) description = room.get('show_details') thumbnail = room.get('room_src') uploader = room.get('nickname') diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index fcc4ce4dcc..e1f5e9dc86 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import uuid from .common import InfoExtractor from ..compat import compat_HTTPError @@ -11,12 +12,172 @@ float_or_none, int_or_none, 
strip_or_none, + try_get, unified_timestamp, ) -class DPlayIE(InfoExtractor): +class DPlayBaseIE(InfoExtractor): _PATH_REGEX = r'/(?P[^/]+/[^/?#]+)' + _auth_token_cache = {} + + def _get_auth(self, disco_base, display_id, realm, needs_device_id=True): + key = (disco_base, realm) + st = self._get_cookies(disco_base).get('st') + token = (st and st.value) or self._auth_token_cache.get(key) + + if not token: + query = {'realm': realm} + if needs_device_id: + query['deviceId'] = uuid.uuid4().hex + token = self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query=query)['data']['attributes']['token'] + + # Save cache only if cookies are not being set + if not self._get_cookies(disco_base).get('st'): + self._auth_token_cache[key] = token + + return f'Bearer {token}' + + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. 
You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = self._get_auth(disco_base, display_id, realm, False) + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + + def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''): + geo_countries = [country.upper()] + self._initialize_geo_bypass({ + 'countries': geo_countries, + }) + disco_base = 'https://%s/' % disco_host + headers = { + 'Referer': url, + } + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise + video_id = video['data']['id'] + info = video['data']['attributes'] + title = info['name'].strip() + formats = [] + subtitles = {} + try: + streaming = self._download_video_playback_info( + disco_base, video_id, headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._process_errors(e, geo_countries) + raise + for format_dict in streaming: + if not 
isinstance(format_dict, dict): + continue + format_url = format_dict.get('url') + if not format_url: + continue + format_id = format_dict.get('type') + ext = determine_ext(format_url) + if format_id == 'dash' or ext == 'mpd': + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles( + format_url, display_id, mpd_id='dash', fatal=False) + formats.extend(dash_fmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + elif format_id == 'hls' or ext == 'm3u8': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + format_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + }) + self._sort_formats(formats) + + creator = series = None + tags = [] + thumbnails = [] + included = video.get('included') or [] + if isinstance(included, list): + for e in included: + attributes = e.get('attributes') + if not attributes: + continue + e_type = e.get('type') + if e_type == 'channel': + creator = attributes.get('name') + elif e_type == 'image': + src = attributes.get('src') + if src: + thumbnails.append({ + 'url': src, + 'width': int_or_none(attributes.get('width')), + 'height': int_or_none(attributes.get('height')), + }) + if e_type == 'show': + series = attributes.get('name') + elif e_type == 'tag': + name = attributes.get('name') + if name: + tags.append(name) + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': strip_or_none(info.get('description')), + 'duration': float_or_none(info.get('videoDuration'), 1000), + 'timestamp': unified_timestamp(info.get('publishStart')), + 'series': series, + 'season_number': int_or_none(info.get('seasonNumber')), + 'episode_number': int_or_none(info.get('episodeNumber')), + 'creator': creator, + 'tags': tags, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + 
'http_headers': { + 'referer': domain, + }, + } + + +class DPlayIE(DPlayBaseIE): _VALID_URL = r'''(?x)https?:// (?P (?:www\.)?(?Pd @@ -26,7 +187,7 @@ class DPlayIE(InfoExtractor): ) )| (?Pes|it)\.dplay\.com - )/[^/]+''' + _PATH_REGEX + )/[^/]+''' + DPlayBaseIE._PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -46,7 +207,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -67,7 +227,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -87,7 +246,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 7, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'skip': 'Available for Premium users', @@ -153,138 +311,6 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] - def _process_errors(self, e, geo_countries): - info = self._parse_json(e.cause.read().decode('utf-8'), None) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code in ('access.denied.missingpackage', 'invalid.token'): - raise ExtractorError( - 'This video is only available for registered users. 
You may want to use --cookies.', expected=True) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['Authorization'] = 'Bearer ' + self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] - - def _download_video_playback_info(self, disco_base, video_id, headers): - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - video_id, headers=headers)['data']['attributes']['streaming'] - streaming_list = [] - for format_id, format_dict in streaming.items(): - streaming_list.append({ - 'type': format_id, - 'url': format_dict.get('url'), - }) - return streaming_list - - def _get_disco_api_info(self, url, display_id, disco_host, realm, country): - geo_countries = [country.upper()] - self._initialize_geo_bypass({ - 'countries': geo_countries, - }) - disco_base = 'https://%s/' % disco_host - headers = { - 'Referer': url, - } - self._update_disco_api_headers(headers, disco_base, display_id, realm) - try: - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - self._process_errors(e, geo_countries) - raise - video_id = video['data']['id'] - info = video['data']['attributes'] - title = info['name'].strip() - formats = [] - try: - streaming = self._download_video_playback_info( - disco_base, video_id, headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._process_errors(e, 
geo_countries) - raise - for format_dict in streaming: - if not isinstance(format_dict, dict): - continue - format_url = format_dict.get('url') - if not format_url: - continue - format_id = format_dict.get('type') - ext = determine_ext(format_url) - if format_id == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id='dash', fatal=False)) - elif format_id == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) - - creator = series = None - tags = [] - thumbnails = [] - included = video.get('included') or [] - if isinstance(included, list): - for e in included: - attributes = e.get('attributes') - if not attributes: - continue - e_type = e.get('type') - if e_type == 'channel': - creator = attributes.get('name') - elif e_type == 'image': - src = attributes.get('src') - if src: - thumbnails.append({ - 'url': src, - 'width': int_or_none(attributes.get('width')), - 'height': int_or_none(attributes.get('height')), - }) - if e_type == 'show': - series = attributes.get('name') - elif e_type == 'tag': - name = attributes.get('name') - if name: - tags.append(name) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': strip_or_none(info.get('description')), - 'duration': float_or_none(info.get('videoDuration'), 1000), - 'timestamp': unified_timestamp(info.get('publishStart')), - 'series': series, - 'season_number': int_or_none(info.get('seasonNumber')), - 'episode_number': int_or_none(info.get('episodeNumber')), - 'creator': creator, - 'tags': tags, - 'thumbnails': thumbnails, - 'formats': formats, - } - def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') @@ -292,11 +318,11 @@ def _real_extract(self, url): country = 
mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( - url, display_id, host, 'dplay' + country, country) + url, display_id, host, 'dplay' + country, country, domain) -class HGTVDeIE(DPlayIE): - _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX +class HGTVDeIE(DPlayBaseIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', 'info_dict': { @@ -313,9 +339,6 @@ class HGTVDeIE(DPlayIE): 'season_number': 3, 'episode_number': 3, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): @@ -324,8 +347,8 @@ def _real_extract(self, url): url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') -class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX +class DiscoveryPlusIE(DPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', 'info_dict': { @@ -343,13 +366,16 @@ class DiscoveryPlusIE(DPlayIE): 'episode_number': 1, }, 'skip': 'Available for Premium users', + }, { + 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers', + 'only_matching': True, }] _PRODUCT = 'dplus_us' _API_URL = 'us1-prod-direct.discoveryplus.com' def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:15.0.0' + headers['x-disco-client'] = f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6' def _download_video_playback_info(self, disco_base, video_id, headers): return self._download_json( @@ -372,7 +398,7 @@ def _real_extract(self, url): class 
ScienceChannelIE(DiscoveryPlusIE): - _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?sciencechannel\.com/video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.sciencechannel.com/video/strangest-things-science-atve-us/nazi-mystery-machine', 'info_dict': { @@ -389,3 +415,211 @@ class ScienceChannelIE(DiscoveryPlusIE): _PRODUCT = 'sci' _API_URL = 'us1-prod-direct.sciencechannel.com' + + +class DIYNetworkIE(DiscoveryPlusIE): + _VALID_URL = r'https?://(?:watch\.)?diynetwork\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.diynetwork.com/video/pool-kings-diy-network/bringing-beach-life-to-texas', + 'info_dict': { + 'id': '2309730', + 'display_id': 'pool-kings-diy-network/bringing-beach-life-to-texas', + 'ext': 'mp4', + 'title': 'Bringing Beach Life to Texas', + 'description': 'The Pool Kings give a family a day at the beach in their own backyard.', + 'season_number': 10, + 'episode_number': 2, + }, + 'skip': 'Available for Premium users', + }] + + _PRODUCT = 'diy' + _API_URL = 'us1-prod-direct.watch.diynetwork.com' + + +class AnimalPlanetIE(DiscoveryPlusIE): + _VALID_URL = r'https?://(?:www\.)?animalplanet\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.animalplanet.com/video/north-woods-law-animal-planet/squirrel-showdown', + 'info_dict': { + 'id': '3338923', + 'display_id': 'north-woods-law-animal-planet/squirrel-showdown', + 'ext': 'mp4', + 'title': 'Squirrel Showdown', + 'description': 'A woman is suspected of being in possession of flying squirrel kits.', + 'season_number': 16, + 'episode_number': 11, + }, + 'skip': 'Available for Premium users', + }] + + _PRODUCT = 'apl' + _API_URL = 'us1-prod-direct.animalplanet.com' + + +class DiscoveryPlusIndiaIE(DPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' 
+ DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', + 'info_dict': { + 'id': '27104', + 'ext': 'mp4', + 'display_id': 'how-do-they-do-it/fugu-and-more', + 'title': 'Fugu and More', + 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', + 'duration': 1319, + 'timestamp': 1582309800, + 'upload_date': '20200221', + 'series': 'How Do They Do It?', + 'season_number': 8, + 'episode_number': 2, + 'creator': 'Discovery Channel', + }, + 'params': { + 'skip_download': True, + } + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': 'realm=%s' % realm, + 'x-disco-client': 'WEB:UNKNOWN:dplus-india:17.0.0', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in', 'https://www.discoveryplus.in/') + + +class DiscoveryNetworksDeIE(DPlayBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P[^/]+)/(?:video/)?(?P[^/]+)' + + _TESTS = [{ + 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', + 'info_dict': { + 'id': '78867', + 'ext': 'mp4', + 'title': 'Die Welt da draußen', + 'description': 'md5:61033c12b73286e409d99a41742ef608', + 'timestamp': 1554069600, + 'upload_date': '20190331', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 
'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', + 'only_matching': True, + }, { + 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', + 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, programme, alternate_id = self._match_valid_url(url).groups() + country = 'GB' if domain == 'dplay.co.uk' else 'DE' + realm = 'questuk' if country == 'GB' else domain.replace('.', '') + return self._get_disco_api_info( + url, '%s/%s' % (programme, alternate_id), + 'sonic-eu1-prod.disco-api.com', realm, country) + + +class DiscoveryPlusShowBaseIE(DPlayBaseIE): + + def _entries(self, show_name): + headers = { + 'x-disco-client': self._X_CLIENT, + 'x-disco-params': f'realm={self._REALM}', + 'referer': self._DOMAIN, + 'Authentication': self._get_auth(self._BASE_API, None, self._REALM), + } + show_json = self._download_json( + f'{self._BASE_API}cms/routes/{self._SHOW_STR}/{show_name}?include=default', + video_id=show_name, headers=headers)['included'][self._INDEX]['attributes']['component'] + show_id = show_json['mandatoryParams'].split('=')[-1] + season_url = self._BASE_API + 'content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' + for season in show_json['filters'][0]['options']: + season_id = season['id'] + total_pages, page_num = 1, 0 + while page_num < total_pages: + season_json = self._download_json( + season_url.format(season_id, show_id, str(page_num + 1)), show_name, headers=headers, + note='Downloading season %s JSON metadata%s' % (season_id, ' page %d' % page_num if page_num else '')) + if page_num == 0: + total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 + episodes_json = season_json['data'] + for episode in episodes_json: + video_path = 
episode['attributes']['path'] + yield self.url_result( + '%svideos/%s' % (self._DOMAIN, video_path), + ie=self._VIDEO_IE.ie_key(), video_id=episode.get('id') or video_path) + page_num += 1 + + def _real_extract(self, url): + show_name = self._match_valid_url(url).group('show_name') + return self.playlist_result(self._entries(show_name), playlist_id=show_name) + + +class DiscoveryPlusItalyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'https://discoveryplus.it/video/{video_id}', DPlayIE.ie_key(), video_id) + + +class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P[^/]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.it/programmi/deal-with-it-stai-al-gioco', + 'playlist_mincount': 168, + 'info_dict': { + 'id': 'deal-with-it-stai-al-gioco', + }, + }] + + _BASE_API = 'https://disco-api.discoveryplus.it/' + _DOMAIN = 'https://www.discoveryplus.it/' + _X_CLIENT = 'WEB:UNKNOWN:dplay-client:2.6.0' + _REALM = 'dplayit' + _SHOW_STR = 'programmi' + _INDEX = 1 + _VIDEO_IE = DPlayIE + + +class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P[^/]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', + 'playlist_mincount': 140, + 'info_dict': { + 'id': 'how-do-they-do-it', + }, + }] + + _BASE_API = 'https://ap2-prod-direct.discoveryplus.in/' + _DOMAIN = 'https://www.discoveryplus.in/' + _X_CLIENT = 'WEB:UNKNOWN:dplus-india:prod' + _REALM = 'dplusindia' + _SHOW_STR = 'show' + _INDEX = 4 + _VIDEO_IE = DiscoveryPlusIndiaIE diff --git a/yt_dlp/extractor/drooble.py b/yt_dlp/extractor/drooble.py 
new file mode 100644 index 0000000000..058425095f --- /dev/null +++ b/yt_dlp/extractor/drooble.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + + +class DroobleIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://drooble\.com/(?: + (?:(?P[^/]+)/)?(?Psong|videos|music/albums)/(?P\d+)| + (?P[^/]+)/(?Pvideos|music)) + ''' + _TESTS = [{ + 'url': 'https://drooble.com/song/2858030', + 'md5': '5ffda90f61c7c318dc0c3df4179eb064', + 'info_dict': { + 'id': '2858030', + 'ext': 'mp3', + 'title': 'Skankocillin', + 'upload_date': '20200801', + 'timestamp': 1596241390, + 'uploader_id': '95894', + 'uploader': 'Bluebeat Shelter', + } + }, { + 'url': 'https://drooble.com/karl340758/videos/2859183', + 'info_dict': { + 'id': 'J6QCQY_I5Tk', + 'ext': 'mp4', + 'title': 'Skankocillin', + 'uploader_id': 'UCrSRoI5vVyeYihtWEYua7rg', + 'description': 'md5:ffc0bd8ba383db5341a86a6cd7d9bcca', + 'upload_date': '20200731', + 'uploader': 'Bluebeat Shelter', + } + }, { + 'url': 'https://drooble.com/karl340758/music/albums/2858031', + 'info_dict': { + 'id': '2858031', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://drooble.com/karl340758/music', + 'info_dict': { + 'id': 'karl340758', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://drooble.com/karl340758/videos', + 'info_dict': { + 'id': 'karl340758', + }, + 'playlist_mincount': 8, + }] + + def _call_api(self, method, video_id, data=None): + response = self._download_json( + f'https://drooble.com/api/dt/{method}', video_id, data=json.dumps(data).encode()) + if not response[0]: + raise ExtractorError('Unable to download JSON metadata') + return response[1] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + user = mobj.group('user') or mobj.group('user_2') + kind = mobj.group('kind') or mobj.group('kind_2') + display_id = mobj.group('id') or user + + if 
mobj.group('kind_2') == 'videos': + data = {'from_user': display_id, 'album': -1, 'limit': 18, 'offset': 0, 'order': 'new2old', 'type': 'video'} + elif kind in ('music/albums', 'music'): + data = {'user': user, 'public_only': True, 'individual_limit': {'singles': 1, 'albums': 1, 'playlists': 1}} + else: + data = {'url_slug': display_id, 'children': 10, 'order': 'old2new'} + + method = 'getMusicOverview' if kind in ('music/albums', 'music') else 'getElements' + json_data = self._call_api(method, display_id, data=data) + if kind in ('music/albums', 'music'): + json_data = json_data['singles']['list'] + + entites = [] + for media in json_data: + url = media.get('external_media_url') or media.get('link') + if url.startswith('https://www.youtube.com'): + entites.append({ + '_type': 'url', + 'url': url, + 'ie_key': 'Youtube' + }) + continue + is_audio = (media.get('type') or '').lower() == 'audio' + entites.append({ + 'url': url, + 'id': media['id'], + 'title': media['title'], + 'duration': int_or_none(media.get('duration')), + 'timestamp': int_or_none(media.get('timestamp')), + 'album': try_get(media, lambda x: x['album']['title']), + 'uploader': try_get(media, lambda x: x['creator']['display_name']), + 'uploader_id': try_get(media, lambda x: x['creator']['id']), + 'thumbnail': media.get('image_comment'), + 'like_count': int_or_none(media.get('likes')), + 'vcodec': 'none' if is_audio else None, + 'ext': 'mp3' if is_audio else None, + }) + + if len(entites) > 1: + return self.playlist_result(entites, display_id) + + return entites[0] diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py new file mode 100644 index 0000000000..a7442d8f0d --- /dev/null +++ b/yt_dlp/extractor/dropout.py @@ -0,0 +1,212 @@ +# coding: utf-8 +from .common import InfoExtractor +from .vimeo import VHXEmbedIE +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + get_element_by_id, + get_elements_by_class, + int_or_none, + join_nonempty, + unified_strdate, 
+ urlencode_postdata, +) + + +class DropoutIE(InfoExtractor): + _LOGIN_URL = 'https://www.dropout.tv/login' + _NETRC_MACHINE = 'dropout' + + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?:[^/]+/)*videos/(?P[^/]+)/?$' + _TESTS = [ + { + 'url': 'https://www.dropout.tv/game-changer/season:2/videos/yes-or-no', + 'note': 'Episode in a series', + 'md5': '5e000fdfd8d8fa46ff40456f1c2af04a', + 'info_dict': { + 'id': '738153', + 'display_id': 'yes-or-no', + 'ext': 'mp4', + 'title': 'Yes or No', + 'description': 'Ally, Brennan, and Zac are asked a simple question, but is there a correct answer?', + 'release_date': '20200508', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/351e3f24-c4a3-459a-8b79-dc80f1e5b7fd.jpg', + 'series': 'Game Changer', + 'season_number': 2, + 'season': 'Season 2', + 'episode_number': 6, + 'episode': 'Yes or No', + 'duration': 1180, + 'uploader_id': 'user80538407', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos' + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'] + }, + { + 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1/videos/episode-1', + 'note': 'Episode in a series (missing release_date)', + 'md5': '712caf7c191f1c47c8f1879520c2fa5c', + 'info_dict': { + 'id': '320562', + 'display_id': 'episode-1', + 'ext': 'mp4', + 'title': 'The Beginning Begins', + 'description': 'The cast introduces their PCs, including a neurotic elf, a goblin PI, and a corn-worshipping cleric.', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/4421ed0d-f630-4c88-9004-5251b2b8adfa.jpg', + 'series': 'Dimension 20: Fantasy High', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'The Beginning Begins', + 'duration': 6838, + 'uploader_id': 'user80538407', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos' + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'] + }, + { + 'url': 
'https://www.dropout.tv/videos/misfits-magic-holiday-special', + 'note': 'Episode not in a series', + 'md5': 'c30fa18999c5880d156339f13c953a26', + 'info_dict': { + 'id': '1915774', + 'display_id': 'misfits-magic-holiday-special', + 'ext': 'mp4', + 'title': 'Misfits & Magic Holiday Special', + 'description': 'The magical misfits spend Christmas break at Gowpenny, with an unwelcome visitor.', + 'release_date': '20211215', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/d91ea8a6-b250-42ed-907e-b30fb1c65176-8e24b8e5.jpg', + 'duration': 11698, + 'uploader_id': 'user80538407', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos' + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'] + } + ] + + def _get_authenticity_token(self, display_id): + signin_page = self._download_webpage( + self._LOGIN_URL, display_id, note='Getting authenticity token') + return self._html_search_regex( + r'name=["\']authenticity_token["\'] value=["\'](.+?)["\']', + signin_page, 'authenticity_token') + + def _login(self, display_id): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required(method='password') + + response = self._download_webpage( + self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({ + 'email': username, + 'password': password, + 'authenticity_token': self._get_authenticity_token(display_id), + 'utf8': True + })) + + user_has_subscription = self._search_regex( + r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none') + if user_has_subscription.lower() == 'true': + return response + elif user_has_subscription.lower() == 'false': + raise ExtractorError('Account is not subscribed') + else: + raise ExtractorError('Incorrect username/password') + + def _real_extract(self, url): + display_id = self._match_id(url) + try: + self._login(display_id) + webpage = self._download_webpage(url, display_id, 
note='Downloading video webpage') + finally: + self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out') + + embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') + thumbnail = self._og_search_thumbnail(webpage) + watch_info = get_element_by_id('watch-info', webpage) or '' + + title = clean_html(get_element_by_class('video-title', watch_info)) + season_episode = get_element_by_class( + 'site-font-secondary-color', get_element_by_class('text', watch_info)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', season_episode or '', 'episode', default=None)) + + return { + '_type': 'url_transparent', + 'ie_key': VHXEmbedIE.ie_key(), + 'url': embed_url, + 'id': self._search_regex(r'embed.vhx.tv/videos/(.+?)\?', embed_url, 'id'), + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'thumbnail': thumbnail.split('?')[0] if thumbnail else None, # Ignore crop/downscale + 'series': clean_html(get_element_by_class('series-title', watch_info)), + 'episode_number': episode_number, + 'episode': title if episode_number else None, + 'season_number': int_or_none(self._search_regex( + r'Season (\d+),', season_episode or '', 'season', default=None)), + 'release_date': unified_strdate(self._search_regex( + r'data-meta-field-name=["\']release_dates["\'] data-meta-field-value=["\'](.+?)["\']', + watch_info, 'release date', default=None)), + } + + +class DropoutSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)' + _TESTS = [ + { + 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1', + 'note': 'Multi-season series with the season in the url', + 'playlist_count': 17, + 'info_dict': { + 'id': 'dimension-20-fantasy-high-season-1', + 'title': 'Dimension 20 Fantasy High - Season 1' + } + }, + { + 'url': 'https://www.dropout.tv/dimension-20-fantasy-high', + 'note': 
'Multi-season series with the season not in the url', + 'playlist_count': 17, + 'info_dict': { + 'id': 'dimension-20-fantasy-high-season-1', + 'title': 'Dimension 20 Fantasy High - Season 1' + } + }, + { + 'url': 'https://www.dropout.tv/dimension-20-shriek-week', + 'note': 'Single-season series', + 'playlist_count': 4, + 'info_dict': { + 'id': 'dimension-20-shriek-week-season-1', + 'title': 'Dimension 20 Shriek Week - Season 1' + } + } + ] + + def _real_extract(self, url): + season_id = self._match_id(url) + season_title = season_id.replace('-', ' ').title() + webpage = self._download_webpage(url, season_id) + + entries = [ + self.url_result( + url=self._search_regex(r']+selected>([^<]+)', + seasons, 'current_season', default='').strip() + + return { + '_type': 'playlist', + 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')), + 'title': join_nonempty(season_title, current_season, delim=' - '), + 'entries': entries + } diff --git a/yt_dlp/extractor/drtv.py b/yt_dlp/extractor/drtv.py index 7bb15f8d4c..70134204c5 100644 --- a/yt_dlp/extractor/drtv.py +++ b/yt_dlp/extractor/drtv.py @@ -321,7 +321,7 @@ def _real_extract(self, url): channel_data = self._download_json( 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, channel_id) - title = self._live_title(channel_data['Title']) + title = channel_data['Title'] formats = [] for streaming_server in channel_data.get('StreamingServers', []): diff --git a/yt_dlp/extractor/dvtv.py b/yt_dlp/extractor/dvtv.py index de7f6d6701..08663cffb8 100644 --- a/yt_dlp/extractor/dvtv.py +++ b/yt_dlp/extractor/dvtv.py @@ -8,6 +8,7 @@ determine_ext, ExtractorError, int_or_none, + join_nonempty, js_to_json, mimetype2ext, try_get, @@ -139,13 +140,9 @@ def _parse_video_metadata(self, js, video_id, timestamp): label = video.get('label') height = self._search_regex( r'^(\d+)[pP]', label or '', 'height', default=None) - format_id = ['http'] - for f in (ext, label): - if f: - format_id.append(f) formats.append({ 
'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) self._sort_formats(formats) diff --git a/yt_dlp/extractor/dw.py b/yt_dlp/extractor/dw.py index d740652f17..6eaee07b47 100644 --- a/yt_dlp/extractor/dw.py +++ b/yt_dlp/extractor/dw.py @@ -5,6 +5,7 @@ from ..utils import ( int_or_none, unified_strdate, + url_or_none, ) from ..compat import compat_urlparse @@ -15,13 +16,13 @@ class DWIE(InfoExtractor): _TESTS = [{ # video 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', - 'md5': '7372046e1815c5a534b43f3c3c36e6e9', + 'md5': 'fb9dfd9520811d3ece80f04befd73428', 'info_dict': { 'id': '19112290', 'ext': 'mp4', 'title': 'Intelligent light', 'description': 'md5:90e00d5881719f2a6a5827cb74985af1', - 'upload_date': '20160311', + 'upload_date': '20160605', } }, { # audio @@ -55,15 +56,16 @@ def _real_extract(self, url): title = hidden_inputs['media_title'] media_id = hidden_inputs.get('media_id') or media_id - if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': + direct_url = url_or_none(hidden_inputs.get('file_name')) + if direct_url: + formats = [{'url': hidden_inputs['file_name']}] + else: formats = self._extract_smil_formats( 'http://www.dw.com/smil/v-%s' % media_id, media_id, transform_source=lambda s: s.replace( 'rtmp://tv-od.dw.de/flash/', 'http://tv-download.dw.de/dwtv_video/flv/')) - self._sort_formats(formats) - else: - formats = [{'url': hidden_inputs['file_name']}] + self._sort_formats(formats) upload_date = hidden_inputs.get('display_date') if not upload_date: diff --git a/yt_dlp/extractor/egghead.py b/yt_dlp/extractor/egghead.py index f6b50e7c2f..b6b86768c2 100644 --- a/yt_dlp/extractor/egghead.py +++ b/yt_dlp/extractor/egghead.py @@ -86,7 +86,6 @@ class EggheadLessonIE(EggheadBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 
'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', diff --git a/yt_dlp/extractor/epicon.py b/yt_dlp/extractor/epicon.py new file mode 100644 index 0000000000..cd19325bc7 --- /dev/null +++ b/yt_dlp/extractor/epicon.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class EpiconIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?epicon\.in/(?:documentaries|movies|tv-shows/[^/?#]+/[^/?#]+)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.epicon.in/documentaries/air-battle-of-srinagar', + 'info_dict': { + 'id': 'air-battle-of-srinagar', + 'ext': 'mp4', + 'title': 'Air Battle of Srinagar', + 'description': 'md5:c4de2013af9bc05ae4392e4115d518d7', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.epicon.in/movies/krit', + 'info_dict': { + 'id': 'krit', + 'ext': 'mp4', + 'title': 'Krit', + 'description': 'md5:c12b35dad915d48ccff7f013c79bab4a', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.epicon.in/tv-shows/paapnaashini-ganga/season-1/vardaan', + 'info_dict': { + 'id': 'vardaan', + 'ext': 'mp4', + 'title': 'Paapnaashini Ganga - Season 1 - Ep 1 - VARDAAN', + 'description': 'md5:f517058c3d0402398eefa6242f4dd6ae', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.epicon.in/movies/jayadev', + 'info_dict': { + 'id': 'jayadev', + 'ext': 'mp4', + 'title': 'Jayadev', + 'description': 'md5:09e349eecd8e585a3b6466904f19df6c', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + cid = self._search_regex(r'class=\"mylist-icon\ iconclick\"\ id=\"(\d+)', webpage, 'cid') + headers = {'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'} + data = f'cid={cid}&action=st&type=video'.encode() + data_json = 
self._parse_json(self._download_json('https://www.epicon.in/ajaxplayer/', id, headers=headers, data=data), id) + + if not data_json['success']: + raise ExtractorError(data_json['message'], expected=True) + + title = self._search_regex(r'setplaytitle=\"([^\"]+)', webpage, 'title') + description = self._og_search_description(webpage) or None + thumbnail = self._og_search_thumbnail(webpage) or None + formats = self._extract_m3u8_formats(data_json['url']['video_url'], id) + self._sort_formats(formats) + + subtitles = {} + for subtitle in data_json.get('subtitles', []): + sub_url = subtitle.get('file') + if not sub_url: + continue + subtitles.setdefault(subtitle.get('lang', 'English'), []).append({ + 'url': self._proto_relative_url(sub_url), + }) + + return { + 'id': id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'subtitles': subtitles, + } + + +class EpiconSeriesIE(InfoExtractor): + _VALID_URL = r'(?!.*season)https?://(?:www\.)?epicon\.in/tv-shows/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.epicon.in/tv-shows/1-of-something', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '1-of-something', + }, + }, { + 'url': 'https://www.epicon.in/tv-shows/eco-india-english', + 'playlist_mincount': 76, + 'info_dict': { + 'id': 'eco-india-english', + }, + }, { + 'url': 'https://www.epicon.in/tv-shows/s/', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 's', + }, + }, { + 'url': 'https://www.epicon.in/tv-shows/ekaant', + 'playlist_mincount': 38, + 'info_dict': { + 'id': 'ekaant', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + episodes = re.findall(r'ct-tray-url=\"(tv-shows/%s/[^\"]+)' % id, webpage) + entries = [self.url_result('https://www.epicon.in/%s' % episode, ie=EpiconIE.ie_key()) for episode in episodes] + return self.playlist_result(entries, playlist_id=id) diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 
d4a66c29ff..dc50f3b8b5 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -7,7 +7,9 @@ from ..compat import compat_str from ..utils import ( determine_ext, + dict_get, int_or_none, + unified_strdate, unified_timestamp, ) @@ -236,3 +238,44 @@ def _real_extract(self, url): webpage, 'embed url') return self.url_result(embed_url, 'AbcNewsVideo') + + +class ESPNCricInfoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', + 'info_dict': { + 'id': '1289135', + 'ext': 'mp4', + 'title': 'Finch: Chasing comes with \'risks\' despite World Cup trend', + 'description': 'md5:ea32373303e25efbb146efdfc8a37829', + 'upload_date': '20211113', + 'duration': 96, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={id}', id)['video'] + formats, subtitles = [], {} + for item in data_json.get('playbacks') or []: + if item.get('type') == 'HLS' and item.get('url'): + m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif item.get('type') == 'AUDIO' and item.get('url'): + formats.append({ + 'url': item['url'], + 'vcodec': 'none', + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('summary'), + 'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))), + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/euscreen.py b/yt_dlp/extractor/euscreen.py new file mode 100644 index 0000000000..2759e7436f --- /dev/null +++ b/yt_dlp/extractor/euscreen.py @@ -0,0 +1,64 
@@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + parse_duration, + js_to_json, +) + + +class EUScreenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?euscreen\.eu/item.html\?id=(?P[^&?$/]+)' + + _TESTS = [{ + 'url': 'https://euscreen.eu/item.html?id=EUS_0EBCBF356BFC4E12A014023BA41BD98C', + 'info_dict': { + 'id': 'EUS_0EBCBF356BFC4E12A014023BA41BD98C', + 'ext': 'mp4', + 'title': "L'effondrement du stade du Heysel", + 'alt_title': 'Collapse of the Heysel Stadium', + 'duration': 318.0, + 'description': 'md5:f0ffffdfce6821139357a1b8359d6152', + 'series': 'JA2 DERNIERE', + 'episode': '-', + 'uploader': 'INA / France', + 'thumbnail': 'http://images3.noterik.com/domain/euscreenxl/user/eu_ina/video/EUS_0EBCBF356BFC4E12A014023BA41BD98C/image.jpg' + }, + 'params': {'skip_download': True} + }] + + _payload = b'-1Win32MozillaNetscape5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36true784758undefinedSat, 07 Oct 2021 08:56:50 GMT1633769810758' + + def _real_extract(self, url): + id = self._match_id(url) + args_for_js_request = self._download_webpage( + 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem', + id, data=self._payload, query={'actionlist': 'itempage', 'id': id}) + info_js = self._download_webpage( + 'https://euscreen.eu/lou/LouServlet/domain/euscreenxl/html5application/euscreenxlitem', + id, data=args_for_js_request.replace('screenid', 'screenId').encode()) + video_json = self._parse_json( + self._search_regex(r'setVideo\(({.+})\)\(\$end\$\)put', info_js, 'Video JSON'), + id, transform_source=js_to_json) + meta_json = self._parse_json( + self._search_regex(r'setData\(({.+})\)\(\$end\$\)', info_js, 'Metadata JSON'), + id, transform_source=js_to_json) + formats = [{ + 'url': 
source['src'], + } for source in video_json.get('sources', [])] + self._sort_formats(formats) + + return { + 'id': id, + 'title': meta_json.get('originalTitle'), + 'alt_title': meta_json.get('title'), + 'duration': parse_duration(meta_json.get('duration')), + 'description': '%s\n%s' % (meta_json.get('summaryOriginal', ''), meta_json.get('summaryEnglish', '')), + 'series': meta_json.get('series') or meta_json.get('seriesEnglish'), + 'episode': meta_json.get('episodeNumber'), + 'uploader': meta_json.get('provider'), + 'thumbnail': meta_json.get('screenshot') or video_json.get('screenshot'), + 'formats': formats, + } diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index a58a5001cf..1b32efc47b 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -4,6 +4,7 @@ from .abc import ( ABCIE, ABCIViewIE, + ABCIViewShowSeriesIE, ) from .abcnews import ( AbcNewsIE, @@ -50,6 +51,7 @@ AnimeLabIE, AnimeLabShowsIE, ) +from .amazon import AmazonStoreIE from .americastestkitchen import ( AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, @@ -147,6 +149,8 @@ BilibiliAudioAlbumIE, BiliBiliPlayerIE, BilibiliChannelIE, + BiliIntlIE, + BiliIntlSeriesIE, ) from .biobiochiletv import BioBioChileTVIE from .bitchute import ( @@ -163,6 +167,7 @@ BleacherReportIE, BleacherReportCMSIE, ) +from .blogger import BloggerIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE @@ -175,6 +180,7 @@ ) from .bravotv import BravoTVIE from .breakcom import BreakIE +from .breitbart import BreitBartIE from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, @@ -183,13 +189,15 @@ from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE +from .cableav import CableAVIE +from .cam4 import CAM4IE from .camdemy import ( CamdemyIE, CamdemyFolderIE ) from .cammodels import CamModelsIE -from .camtube import CamTubeIE from .camwithher import CamWithHerIE +from .canalalpha import 
CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import ( @@ -206,9 +214,9 @@ from .cbc import ( CBCIE, CBCPlayerIE, - CBCWatchVideoIE, - CBCWatchIE, - CBCOlympicsIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, ) from .cbs import CBSIE from .cbslocal import ( @@ -233,14 +241,16 @@ from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE +from .cgtn import CGTNIE from .channel9 import Channel9IE from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE +from .chingari import ( + ChingariIE, + ChingariUserIE, +) from .chirbit import ( ChirbitIE, ChirbitProfileIE, @@ -251,6 +261,7 @@ CiscoLiveSessionIE, CiscoLiveSearchIE, ) +from .ciscowebex import CiscoWebexIE from .cjsw import CJSWIE from .cliphunter import CliphunterIE from .clippit import ClippitIE @@ -285,12 +296,15 @@ from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE from .crunchyroll import ( CrunchyrollIE, - CrunchyrollShowPlaylistIE + CrunchyrollShowPlaylistIE, + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE @@ -299,7 +313,8 @@ from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( CuriosityStreamIE, - CuriosityStreamCollectionIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, ) from .cwtv import CWTVIE from .dailymail import DailyMailIE @@ -308,6 +323,10 @@ DailymotionPlaylistIE, DailymotionUserIE, ) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) from .daum import ( DaumIE, DaumClipIE, @@ -324,12 +343,7 @@ from .dfb import DFBIE from .dhm import DHMIE from .digg import DiggIE -from 
.discoveryplusindia import ( - DiscoveryPlusIndiaIE, - DiscoveryPlusIndiaShowIE, -) from .dotsub import DotsubIE -from .douyin import DouyinIE from .douyutv import ( DouyuShowIE, DouyuTVIE, @@ -338,7 +352,14 @@ DPlayIE, DiscoveryPlusIE, HGTVDeIE, - ScienceChannelIE + ScienceChannelIE, + DIYNetworkIE, + AnimalPlanetIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE @@ -360,12 +381,15 @@ DiscoveryGoIE, DiscoveryGoPlaylistIE, ) -from .discoverynetworks import DiscoveryNetworksDeIE from .discoveryvr import DiscoveryVRIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .doodstream import DoodStreamIE from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) from .dw import ( DWIE, DWArticleIE, @@ -390,6 +414,10 @@ from .elpais import ElPaisIE from .embedly import EmbedlyIE from .engadget import EngadgetIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) from .eporner import EpornerIE from .eroprofile import ( EroProfileIE, @@ -400,9 +428,11 @@ ESPNIE, ESPNArticleIE, FiveThirtyEightIE, + ESPNCricInfoIE, ) from .esri import EsriVideoIE from .europa import EuropaIE +from .euscreen import EUScreenIE from .expotv import ExpoTVIE from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE @@ -410,6 +440,7 @@ from .facebook import ( FacebookIE, FacebookPluginsVideoIE, + FacebookRedirectURLIE, ) from .fancode import ( FancodeVodIE, @@ -457,12 +488,7 @@ from .francetv import ( FranceTVIE, FranceTVSiteIE, - FranceTVEmbedIE, FranceTVInfoIE, - FranceTVInfoSportIE, - FranceTVJeunesseIE, - GenerationWhatIE, - CultureboxIE, ) from .freesound import FreesoundIE from .freespeech import FreespeechIE @@ -480,9 +506,20 @@ ) from .funk import FunkIE from .fusion import FusionIE -from .gab import GabTVIE +from .gab import ( + GabTVIE, + GabIE, +) from .gaia import 
GaiaIE from .gameinformer import GameInformerIE +from .gamejolt import ( + GameJoltIE, + GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) from .gamespot import GameSpotIE from .gamestar import GameStarIE from .gaskrank import GaskrankIE @@ -490,6 +527,7 @@ from .gdcvault import GDCVaultIE from .gedidigital import GediDigitalIE from .generic import GenericIE +from .gettr import GettrIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE @@ -500,6 +538,7 @@ ) from .go import GoIE from .godtube import GodTubeIE +from .gofile import GofileIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googlepodcasts import ( @@ -507,8 +546,11 @@ GooglePodcastsFeedIE, ) from .googlesearch import GoogleSearchIE +from .gopro import GoProIE from .goshgay import GoshgayIE +from .gotostage import GoToStageIE from .gputechconf import GPUTechConfIE +from .gronkh import GronkhIE from .groupon import GrouponIE from .hbo import HBOIE from .hearthisat import HearThisAtIE @@ -536,6 +578,10 @@ HRTiIE, HRTiPlaylistIE, ) +from .hse import ( + HSEShowIE, + HSEProductIE, +) from .huajiao import HuajiaoIE from .huffpost import HuffPostIE from .hungama import ( @@ -544,6 +590,10 @@ HungamaAlbumPlaylistIE, ) from .hypem import HypemIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) from .ign import ( IGNIE, IGNVideoIE, @@ -568,12 +618,17 @@ from .infoq import InfoQIE from .instagram import ( InstagramIE, + InstagramIOSIE, InstagramUserIE, InstagramTagIE, + InstagramStoryIE, ) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE from .itv import ( @@ -610,6 +665,7 @@ from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .konserthusetplay import 
KonserthusetPlayIE +from .koo import KooIE from .krasview import KrasViewIE from .ku6 import Ku6IE from .kusi import KUSIIE @@ -672,6 +728,7 @@ LineLiveChannelIE, ) from .linkedin import ( + LinkedInIE, LinkedInLearningIE, LinkedInLearningCourseIE, ) @@ -718,7 +775,12 @@ from .matchtv import MatchTVIE from .mdr import MDRIE from .medaltv import MedalTVIE -from .mediaset import MediasetIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) from .mediasite import ( MediasiteIE, MediasiteCatalogIE, @@ -734,6 +796,7 @@ from .mgoon import MgoonIE from .mgtv import MGTVIE from .miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, @@ -757,6 +820,7 @@ ) from .mit import TechTVMITIE, OCWMITIE from .mitele import MiTeleIE +from .mixch import MixchIE from .mixcloud import ( MixcloudIE, MixcloudUserIE, @@ -766,6 +830,7 @@ MLBIE, MLBVideoIE, ) +from .mlssoccer import MLSSoccerIE from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( @@ -793,6 +858,7 @@ MTVItaliaProgrammaIE, ) from .muenchentv import MuenchenTVIE +from .musescore import MuseScoreIE from .mwave import MwaveIE, MwaveMeetGreetIE from .mxplayer import ( MxplayerIE, @@ -807,6 +873,14 @@ ) from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) from .nationalgeographic import ( NationalGeographicVideoIE, NationalGeographicTVIE, @@ -840,7 +914,10 @@ NJoyEmbedIE, ) from .ndtv import NDTVIE -from .nebula import NebulaIE +from .nebula import ( + NebulaIE, + NebulaCollectionIE, +) from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( @@ -855,6 +932,7 @@ from .newgrounds import ( NewgroundsIE, NewgroundsPlaylistIE, + NewgroundsUserIE, ) from 
.newstube import NewstubeIE from .nextmedia import ( @@ -884,8 +962,19 @@ NickNightIE, NickRuIE, ) -from .niconico import NiconicoIE, NiconicoPlaylistIE, NiconicoUserIE -from .ninecninemedia import NineCNineMediaIE + +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, +) +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE @@ -900,6 +989,7 @@ NovaEmbedIE, NovaIE, ) +from .novaplay import NovaPlayIE from .nowness import ( NownessIE, NownessPlaylistIE, @@ -940,11 +1030,15 @@ NYTimesCookingIE, ) from .nuvid import NuvidIE +from .nzherald import NZHeraldIE from .nzz import NZZIE from .odatv import OdaTVIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE from .onet import ( OnetIE, OnetChannelIE, @@ -956,6 +1050,10 @@ OoyalaIE, OoyalaExternalIE, ) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) from .openrec import ( OpenRecIE, OpenRecCaptureIE, @@ -995,10 +1093,21 @@ ) from .parliamentliveuk import ParliamentLiveUKIE from .parlview import ParlviewIE -from .patreon import PatreonIE +from .patreon import ( + PatreonIE, + PatreonUserIE +) from .pbs import PBSIE from .pearvideo import PearVideoIE -from .peertube import PeerTubeIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peertv import PeerTVIE +from .peloton import ( + PelotonIE, + PelotonLiveIE +) from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( @@ -1018,7 +1127,12 @@ PinterestIE, PinterestCollectionIE, ) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) from .pladform import PladformIE +from .planetmarathi import 
PlanetMarathiIE from .platzi import ( PlatziIE, PlatziCourseIE, @@ -1040,9 +1154,14 @@ PokemonIE, PokemonWatchIE, ) +from .polsatgo import PolsatGoIE from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE @@ -1065,6 +1184,7 @@ PuhuTVSerieIE, ) from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE from .prosiebensat1 import ProSiebenSat1IE from .puls4 import Puls4IE from .pyvideo import PyvideoIE @@ -1079,6 +1199,7 @@ R7IE, R7ArticleIE, ) +from .radiko import RadikoIE, RadikoRadioIE from .radiocanada import ( RadioCanadaIE, RadioCanadaAudioVideoIE, @@ -1087,11 +1208,23 @@ from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) from .rai import ( RaiPlayIE, RaiPlayLiveIE, RaiPlayPlaylistIE, RaiIE, + RaiPlayRadioIE, + RaiPlayRadioPlaylistIE, ) from .raywenderlich import ( RayWenderlichIE, @@ -1115,9 +1248,11 @@ RedBullTVRrnContentIE, RedBullIE, ) -from .reddit import ( - RedditIE, - RedditRIE, +from .reddit import RedditIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, ) from .redtube import RedTubeIE from .regiotv import RegioTVIE @@ -1132,7 +1267,7 @@ from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE -from .roosterteeth import RoosterTeethIE +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rozhlas import RozhlasIE @@ -1145,12 +1280,22 @@ RTL2YouSeriesIE, ) from .rtp import RTPIE +from .rtrfm import RTRFMIE from .rts 
import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE +from .rtve import ( + RTVEALaCartaIE, + RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE -from .rumble import RumbleEmbedIE +from .rumble import ( + RumbleEmbedIE, + RumbleChannelIE, +) from .rutube import ( RutubeIE, RutubeChannelIE, @@ -1158,6 +1303,7 @@ RutubeMovieIE, RutubePersonIE, RutubePlaylistIE, + RutubeTagsIE, ) from .rutv import RUTVIE from .ruutu import RuutuIE @@ -1183,7 +1329,7 @@ SCTECourseIE, ) from .seeker import SeekerIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE from .sevenplus import SevenPlusIE @@ -1209,6 +1355,7 @@ ) from .sina import SinaIE from .sixplay import SixPlayIE +from .skeb import SkebIE from .skyit import ( SkyItPlayerIE, SkyItVideoIE, @@ -1224,8 +1371,10 @@ SkyNewsArabiaIE, SkyNewsArabiaArticleIE, ) +from .skynewsau import SkyNewsAUIE from .sky import ( SkyNewsIE, + SkyNewsStoryIE, SkySportsIE, SkySportsNewsIE, ) @@ -1242,6 +1391,7 @@ SoundcloudEmbedIE, SoundcloudIE, SoundcloudSetIE, + SoundcloudRelatedIE, SoundcloudUserIE, SoundcloudTrackStationIE, SoundcloudPlaylistIE, @@ -1258,6 +1408,10 @@ SouthParkEsIE, SouthParkNlIE ) +from .sovietscloset import ( + SovietsClosetIE, + SovietsClosetPlaylistIE +) from .spankbang import ( SpankBangIE, SpankBangPlaylistIE, @@ -1293,6 +1447,7 @@ ) from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE from .steam import SteamIE from .storyfire import ( StoryFireIE, @@ -1300,10 +1455,13 @@ StoryFireSeriesIE, ) from .streamable import StreamableIE +from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamff import StreamFFIE from .streetvoice import StreetVoiceIE 
from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE from .stv import STVPlayerIE from .sunporno import SunPornoIE from .sverigesradio import ( @@ -1319,10 +1477,7 @@ from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import ( - TagesschauPlayerIE, - TagesschauIE, -) +from .tagesschau import TagesschauIE from .tass import TassIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE @@ -1368,14 +1523,26 @@ from .thescene import TheSceneIE from .thestar import TheStarIE from .thesun import TheSunIE +from .theta import ( + ThetaVideoIE, + ThetaStreamIE, +) from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) from .threeqsdn import ThreeQSDNIE from .tiktok import ( TikTokIE, TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + DouyinIE, ) from .tinypic import TinyPicIE from .tmz import TMZIE @@ -1389,6 +1556,13 @@ ToggleIE, MeWatchIE, ) +from .toggo import ( + ToggoIE, +) +from .tokentube import ( + TokentubeIE, + TokentubeChannelIE +) from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE from .toutv import TouTvIE @@ -1398,7 +1572,10 @@ from .trovo import ( TrovoIE, TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, ) +from .trueid import TrueIDIE from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE @@ -1426,7 +1603,10 @@ TV2DKIE, TV2DKBornholmPlayIE, ) -from .tv2hu import TV2HuIE +from .tv2hu import ( + TV2HuIE, + TV2HuSeriesIE, +) from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE from .tv5unis import ( @@ -1462,6 +1642,7 @@ from .tvp import ( TVPEmbedIE, TVPIE, + TVPStreamIE, TVPWebsiteIE, ) from .tvplay import ( @@ -1511,6 +1692,7 @@ DLiveVODIE, DLiveStreamIE, ) +from .drooble import DroobleIE from .umg import 
UMGDeIE from .unistra import UnistraIE from .unity import UnityIE @@ -1532,6 +1714,7 @@ from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE +from .veo import VeoIE from .veoh import VeohIE from .vesti import VestiIE from .vevo import ( @@ -1566,11 +1749,6 @@ VidioLiveIE ) from .vidlii import VidLiiIE -from .vidme import ( - VidmeIE, - VidmeUserIE, - VidmeUserLikesIE, -) from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, @@ -1641,6 +1819,7 @@ from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE +from .vupload import VuploadIE from .vvvvid import ( VVVVIDIE, VVVVIDShowIE, @@ -1675,6 +1854,7 @@ WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .willow import WillowIE from .wimtv import WimTVIE from .whowatch import WhoWatchIE from .wistia import ( @@ -1682,6 +1862,10 @@ WistiaPlaylistIE, ) from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) from .wsj import ( WSJIE, WSJArticleIE, @@ -1727,7 +1911,11 @@ YandexMusicArtistTracksIE, YandexMusicArtistAlbumsIE, ) -from .yandexvideo import YandexVideoIE +from .yandexvideo import ( + YandexVideoIE, + ZenYandexIE, + ZenYandexChannelIE, +) from .yapfiles import YapFilesIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE @@ -1747,6 +1935,7 @@ from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, + YoutubeClipIE, YoutubeFavouritesIE, YoutubeHistoryIE, YoutubeTabIE, diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index e5bdb335a8..6dbcd690d7 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -23,9 +23,11 @@ merge_dicts, network_exceptions, parse_count, + parse_qs, qualities, sanitized_Request, try_get, + url_or_none, urlencode_postdata, urljoin, ) @@ -35,7 +37,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/ + 
(?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/ (?:[^#]*?\#!/)? (?: (?: @@ -226,7 +228,7 @@ class FacebookIE(InfoExtractor): 'only_matching': True, }, { # data.video - 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', + 'url': 'https://www.facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd.onion/video.php?v=274175099429670', 'only_matching': True, }, { # no title @@ -479,7 +481,7 @@ def process_formats(formats): for f in formats: f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - self._sort_formats(formats) + self._sort_formats(formats, ('res', 'quality')) def extract_relay_data(_filter): return self._parse_json(self._search_regex( @@ -687,13 +689,14 @@ def parse_attachment(attachment, key='media'): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: - preference = -10 if format_id == 'progressive' else 0 + preference = -10 if format_id == 'progressive' else -1 if quality == 'hd': preference += 5 formats.append({ 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 'url': src, 'quality': preference, + 'height': 720 if quality == 'hd' else None }) extract_dash_manifest(f[0], formats) subtitles_src = f[0].get('subtitles_src') @@ -745,3 +748,42 @@ def _real_extract(self, url): return self.url_result( compat_urllib_parse_unquote(self._match_id(url)), FacebookIE.ie_key()) + + +class FacebookRedirectURLIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]' + _TESTS = [{ + 'url': 'https://www.facebook.com/flx/warn/?h=TAQHsoToz&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:2d713ccbb45b686a1888397b2c77ca6b', + 'channel_id': 'UCGBpxWJr9FNOcFYA5GkKrMg', + 'playable_in_embed': True, + 'categories': ['Music'], 
+ 'channel': 'Boiler Room', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + 'tags': 'count:11', + 'duration': 3332, + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg', + 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg', + 'availability': 'public', + 'uploader_url': 'http://www.youtube.com/user/brtvofficial', + 'upload_date': '20150917', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + }, + 'add_ie': ['Youtube'], + 'params': {'skip_download': 'Youtube'}, + }] + + def _real_extract(self, url): + redirect_url = url_or_none(parse_qs(url).get('u', [None])[-1]) + if not redirect_url: + raise ExtractorError('Invalid facebook redirect URL', expected=True) + return self.url_result(redirect_url) diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py index fd84a6e508..978df31fff 100644 --- a/yt_dlp/extractor/fancode.py +++ b/yt_dlp/extractor/fancode.py @@ -21,7 +21,6 @@ class FancodeVodIE(InfoExtractor): 'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi', 'params': { 'skip_download': True, - 'format': 'bestvideo' }, 'info_dict': { 'id': '6249806281001', @@ -42,7 +41,7 @@ class FancodeVodIE(InfoExtractor): _ACCESS_TOKEN = None _NETRC_MACHINE = 'fancode' - _LOGIN_HINT = 'Use "--user refresh --password " to login using a refresh token' + _LOGIN_HINT = 'Use "--username refresh --password " to login using a refresh token' headers = { 'content-type': 'application/json', @@ -173,7 +172,7 @@ def _real_extract(self, url): match_info = try_get(info_json, lambda x: x['data']['match']) - if match_info.get('status') != "LIVE": + if match_info.get('streamingStatus') != "STARTED": raise ExtractorError('The stream can\'t be accessed', expected=True) self._check_login_required(match_info.get('isUserEntitled'), True) # all live streams are premium only diff --git a/yt_dlp/extractor/filmon.py b/yt_dlp/extractor/filmon.py index f775fe0bae..7b43ecc0f9 100644 --- 
a/yt_dlp/extractor/filmon.py +++ b/yt_dlp/extractor/filmon.py @@ -170,7 +170,7 @@ def _real_extract(self, url): return { 'id': channel_id, 'display_id': channel_data.get('alias'), - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': channel_data.get('description'), 'thumbnails': thumbnails, 'formats': formats, diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 41910cefb1..877c5c0556 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -4,19 +4,12 @@ from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( - clean_html, determine_ext, ExtractorError, - int_or_none, - parse_duration, + format_field, + parse_iso8601, parse_qs, - try_get, - url_or_none, - urljoin, ) from .dailymotion import DailymotionIE @@ -89,97 +82,81 @@ def _extract_video(self, video_id, catalogue=None): # Videos are identified by idDiffusion so catalogue part is optional. # However when provided, some extra formats may be returned so we pass # it if available. 
- info = self._download_json( - 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', - video_id, 'Downloading video JSON', query={ - 'idDiffusion': video_id, - 'catalogue': catalogue or '', - }) - - if info.get('status') == 'NOK': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, info['message']), - expected=True) - allowed_countries = info['videos'][0].get('geoblocage') - if allowed_countries: - georestricted = True - geo_info = self._download_json( - 'http://geo.francetv.fr/ws/edgescape.json', video_id, - 'Downloading geo restriction info') - country = geo_info['reponse']['geo_info']['country_code'] - if country not in allowed_countries: - raise ExtractorError( - 'The video is not available from your location', - expected=True) - else: - georestricted = False - - def sign(manifest_url, manifest_id): - for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): - signed_url = url_or_none(self._download_webpage( - 'https://%s/esi/TA' % host, video_id, - 'Downloading signed %s manifest URL' % manifest_id, - fatal=False, query={ - 'url': manifest_url, - })) - if signed_url: - return signed_url - return manifest_url - is_live = None - videos = [] + title = None + subtitle = None + image = None + duration = None + timestamp = None + spritesheets = None - for video in (info.get('videos') or []): - if video.get('statut') != 'ONLINE': + for device_type in ('desktop', 'mobile'): + dinfo = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if not dinfo: continue - if not video.get('url'): - continue - videos.append(video) - if not videos: - for device_type in ['desktop', 'mobile']: - fallback_info = self._download_json( - 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, - video_id, 'Downloading fallback %s video JSON' % 
device_type, query={ - 'device_type': device_type, - 'browser': 'chrome', - }, fatal=False) + video = dinfo.get('video') + if video: + videos.append(video) + if duration is None: + duration = video.get('duration') + if is_live is None: + is_live = video.get('is_live') + if spritesheets is None: + spritesheets = video.get('spritesheets') - if fallback_info and fallback_info.get('video'): - videos.append(fallback_info['video']) + meta = dinfo.get('meta') + if meta: + if title is None: + title = meta.get('title') + # XXX: what is meta['pre_title']? + if subtitle is None: + subtitle = meta.get('additional_title') + if image is None: + image = meta.get('image_url') + if timestamp is None: + timestamp = parse_iso8601(meta.get('broadcasted_at')) formats = [] subtitles = {} for video in videos: - video_url = video.get('url') - if not video_url: - continue - if is_live is None: - is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True - or video.get('is_live') is True - or '/live.francetv.fr/' in video_url) format_id = video.get('format') + + video_url = None + if video.get('workflow') == 'token-akamai': + token_url = video.get('token') + if token_url: + token_json = self._download_json( + token_url, video_id, + 'Downloading signed %s manifest URL' % format_id) + if token_json: + video_url = token_json.get('url') + if not video_url: + video_url = video.get('url') + ext = determine_ext(video_url) if ext == 'f4m': - if georestricted: - # See https://github.com/ytdl-org/youtube-dl/issues/3963 - # m3u8 urls work fine - continue formats.extend(self._extract_f4m_formats( - sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id=format_id, fatal=False)) + video_url, video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( - sign(video_url, format_id), video_id, 'mp4', + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, 
video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) - formats.extend(m3u8_fmts) - subtitles = self._merge_subtitles(subtitles, m3u8_subs) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, mpd_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -193,28 +170,43 @@ def sign(manifest_url, manifest_id): 'format_id': format_id, }) + # XXX: what is video['captions']? + + for f in formats: + if f.get('acodec') != 'none' and f.get('language') in ('qtz', 'qad'): + f['language_preference'] = -10 + f['format_note'] = 'audio description%s' % format_field(f, 'format_note', ', %s') + + if spritesheets: + formats.append({ + 'format_id': 'spritesheets', + 'format_note': 'storyboard', + 'acodec': 'none', + 'vcodec': 'none', + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'url': 'about:invalid', + 'fragments': [{ + 'path': sheet, + # XXX: not entirely accurate; each spritesheet seems to be + # a 10×10 grid of thumbnails corresponding to approximately + # 2 seconds of the video; the last spritesheet may be shorter + 'duration': 200, + } for sheet in spritesheets] + }) + self._sort_formats(formats) - title = info['titre'] - subtitle = info.get('sous_titre') if subtitle: title += ' - %s' % subtitle title = title.strip() - subtitles.setdefault('fr', []).extend( - [{ - 'url': subformat['url'], - 'ext': subformat.get('format'), - } for subformat in info.get('subtitles', []) if subformat.get('url')] - ) - return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info.get('synopsis')), - 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', 
info.get('image')), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), - 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), + 'title': title, + 'thumbnail': image, + 'duration': duration, + 'timestamp': timestamp, 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, @@ -308,35 +300,6 @@ def _real_extract(self, url): return self._make_url_result(video_id, catalogue) -class FranceTVEmbedIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P[^&]+)' - - _TESTS = [{ - 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', - 'info_dict': { - 'id': 'NI_983319', - 'ext': 'mp4', - 'title': 'Le Pen Reims', - 'upload_date': '20170505', - 'timestamp': 1493981780, - 'duration': 16, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, - video_id) - - return self._make_url_result(video['video_id'], video.get('catalog')) - - class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&.]+)' @@ -426,139 +389,3 @@ def _real_extract(self, url): webpage, 'video id') return self._make_url_result(video_id) - - -class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): - IE_NAME = 'sport.francetvinfo.fr' - _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', - 'info_dict': { - 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', - 'ext': 'mp4', - 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', - 'timestamp': 1523639962, - 'upload_date': '20180413', - }, - 'params': { - 'skip_download': True, 
- }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') - return self._make_url_result(video_id, 'Sport-web') - - -class GenerationWhatIE(InfoExtractor): - IE_NAME = 'france2.fr:generation-what' - _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms', - 'info_dict': { - 'id': 'wtvKYUG45iw', - 'ext': 'mp4', - 'title': 'Generation What - Garde à vous - FRA', - 'uploader': 'Generation What', - 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w', - 'upload_date': '20160411', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://generation-what.francetv.fr/europe/video/present-arms', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - youtube_id = self._search_regex( - r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';", - webpage, 'youtube id') - - return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id) - - -class CultureboxIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689', - 'info_dict': { - 'id': 'EV_134885', - 'ext': 'mp4', - 'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7', - 'description': 'md5:19c44af004b88219f4daa50fa9a351d4', - 'upload_date': '20180206', - 'timestamp': 1517945220, - 'duration': 5981, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - display_id = 
self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - if ">Ce live n'est plus disponible en replay<" in webpage: - raise ExtractorError( - 'Video %s is not available' % display_id, expected=True) - - video_id, catalogue = self._search_regex( - r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', - webpage, 'video id').split('@') - - return self._make_url_result(video_id, catalogue) - - -class FranceTVJeunesseIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P[^/?#&]+))' - - _TESTS = [{ - 'url': 'https://www.zouzous.fr/heros/simon', - 'info_dict': { - 'id': 'simon', - }, - 'playlist_count': 9, - }, { - 'url': 'https://www.ludo.fr/heros/ninjago', - 'info_dict': { - 'id': 'ninjago', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.zouzous.fr/heros/simon?abc', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - playlist_id = mobj.group('id') - - playlist = self._download_json( - '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id) - - if not playlist.get('count'): - raise ExtractorError( - '%s is not available' % playlist_id, expected=True) - - entries = [] - for item in playlist['items']: - identity = item.get('identity') - if identity and isinstance(identity, compat_str): - entries.append(self._make_url_result(identity)) - - return self.playlist_result(entries, playlist_id) diff --git a/yt_dlp/extractor/freshlive.py b/yt_dlp/extractor/freshlive.py index 72a8459453..ad19b81094 100644 --- a/yt_dlp/extractor/freshlive.py +++ b/yt_dlp/extractor/freshlive.py @@ -59,9 +59,6 @@ def _real_extract(self, url): stream_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - if is_live: - title = self._live_title(title) - return { 'id': video_id, 'formats': formats, diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py index a02a943742..1cea62609f 100644 --- a/yt_dlp/extractor/fujitv.py +++ b/yt_dlp/extractor/fujitv.py @@ 
-5,19 +5,32 @@ class FujiTVFODPlus7IE(InfoExtractor): - _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P[0-9a-z]+)' + _VALID_URL = r'https?://fod\.fujitv\.co\.jp/title/[0-9a-z]{4}/(?P[0-9a-z]+)' _BASE_URL = 'http://i.fod.fujitv.co.jp/' _BITRATE_MAP = { 300: (320, 180), 800: (640, 360), 1200: (1280, 720), 2000: (1280, 720), + 4000: (1920, 1080), } + _TESTS = [{ + 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40810075', + 'info_dict': { + 'id': '5d40810075', + 'title': '5d40810075', + 'ext': 'mp4', + 'format_id': '4000', + 'thumbnail': 'http://i.fod.fujitv.co.jp/pc/image/wbtn/wbtn_5d40810075.jpg' + }, + 'skip': 'Expires after a week' + }] + def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') + self._BASE_URL + 'abr/tv_android/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 4d95f1c7c4..96dad2ca34 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -2,25 +2,62 @@ from __future__ import unicode_literals import random +import re import string from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( determine_ext, - dict_get, int_or_none, + join_nonempty, js_to_json, + orderedSet, + qualities, str_or_none, + traverse_obj, try_get, urlencode_postdata, ExtractorError, ) -class FunimationPageIE(InfoExtractor): +class FunimationBaseIE(InfoExtractor): + _NETRC_MACHINE = 'funimation' + _REGION = None + _TOKEN = None + + def _get_region(self): + region_cookie = self._get_cookies('https://www.funimation.com').get('region') + region = region_cookie.value if region_cookie else self.get_param('geo_bypass_country') + return region or traverse_obj( + self._download_json( + 'https://geo-service.prd.funimationsvc.com/geo/v1/region/check', 
None, fatal=False, + note='Checking geo-location', errnote='Unable to fetch geo-location information'), + 'region') or 'US' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + try: + data = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + })) + return data['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['error'] + raise ExtractorError(error, expected=True) + raise + + +class FunimationPageIE(FunimationBaseIE): IE_NAME = 'funimation:page' - _VALID_URL = r'(?Phttps?://(?:www\.)?funimation(?:\.com|now\.uk))/(?P[^/]+/)?(?Pshows/(?P[^/]+/[^/?#&]+).*$)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:(?P[^/]+)/)?(?:shows|v)/(?P[^/]+)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', @@ -45,38 +82,34 @@ class FunimationPageIE(InfoExtractor): }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, + }, { + 'url': 'https://www.funimation.com/v/a-certain-scientific-railgun/super-powered-level-5', + 'only_matching': True, }] + def _real_initialize(self): + if not self._REGION: + FunimationBaseIE._REGION = self._get_region() + if not self._TOKEN: + FunimationBaseIE._TOKEN = self._login() + def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id = mobj.group('id').replace('/', '_') - if not mobj.group('lang'): - url = '%s/en/%s' % (mobj.group('origin'), mobj.group('path')) + locale, show, episode = self._match_valid_url(url).group('lang', 'show', 'episode') - webpage = self._download_webpage(url, display_id) - title_data = self._parse_json(self._search_regex( - r'TITLE_DATA\s*=\s*({[^}]+})', - webpage, 'title 
data', default=''), - display_id, js_to_json, fatal=False) or {} + video_id = traverse_obj(self._download_json( + f'https://title-api.prd.funimationsvc.com/v1/shows/{show}/episodes/{episode}', + f'{show}_{episode}', query={ + 'deviceType': 'web', + 'region': self._REGION, + 'locale': locale or 'en' + }), ('videoList', ..., 'id'), get_all=False) - video_id = ( - title_data.get('id') - or self._search_regex( - (r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", r']+src="/player/(\d+)'), - webpage, 'video_id', default=None) - or self._search_regex( - r'/player/(\d+)', - self._html_search_meta(['al:web:url', 'og:video:url', 'og:video:secure_url'], webpage, fatal=True), - 'video id')) return self.url_result(f'https://www.funimation.com/player/{video_id}', FunimationIE.ie_key(), video_id) -class FunimationIE(InfoExtractor): +class FunimationIE(FunimationBaseIE): _VALID_URL = r'https?://(?:www\.)?funimation\.com/player/(?P\d+)' - _NETRC_MACHINE = 'funimation' - _TOKEN = None - _TESTS = [{ 'url': 'https://www.funimation.com/player/210051', 'info_dict': { @@ -92,7 +125,7 @@ class FunimationIE(InfoExtractor): 'season_number': 99, 'series': 'Attack on Titan: Junior High', 'description': '', - 'duration': 154, + 'duration': 155, }, 'params': { 'skip_download': 'm3u8', @@ -113,7 +146,7 @@ class FunimationIE(InfoExtractor): 'season_number': 99, 'series': 'Attack on Titan: Junior High', 'description': '', - 'duration': 154, + 'duration': 155, }, 'params': { 'skip_download': 'm3u8', @@ -121,26 +154,9 @@ class FunimationIE(InfoExtractor): }, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - try: - data = self._download_json( - 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', - None, 'Logging in', data=urlencode_postdata({ - 'username': username, - 'password': password, - })) - self._TOKEN = data['token'] - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = 
self._parse_json(e.cause.read().decode(), None)['error'] - raise ExtractorError(error, expected=True) - raise - def _real_initialize(self): - self._login() + if not self._TOKEN: + FunimationBaseIE._TOKEN = self._login() @staticmethod def _get_experiences(episode): @@ -180,6 +196,8 @@ def _real_extract(self, url): formats, subtitles, thumbnails, duration = [], {}, [], 0 requested_languages, requested_versions = self._configuration_arg('language'), self._configuration_arg('version') + language_preference = qualities((requested_languages or [''])[::-1]) + source_preference = qualities((requested_versions or ['uncut', 'simulcast'])[::-1]) only_initial_experience = 'seperate-video-versions' in self.get_param('compat_opts', []) for lang, version, fmt in self._get_experiences(episode): @@ -227,10 +245,15 @@ def _real_extract(self, url): }) for f in current_formats: # TODO: Convert language to code - f.update({'language': lang, 'format_note': version}) + f.update({ + 'language': lang, + 'format_note': version, + 'source_preference': source_preference(version.lower()), + 'language_preference': language_preference(lang.lower()), + }) formats.extend(current_formats) self._remove_duplicate_formats(formats) - self._sort_formats(formats) + self._sort_formats(formats, ('lang', 'source')) return { 'id': initial_experience_id if only_initial_experience else episode_id, @@ -253,7 +276,7 @@ def _real_extract(self, url): def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name): if isinstance(episode, str): webpage = self._download_webpage( - f'https://www.funimation.com/player/{experience_id}', display_id, + f'https://www.funimation.com/player/{experience_id}/', display_id, fatal=False, note=f'Downloading player webpage for {format_name}') episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False) @@ -266,16 +289,17 @@ def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_n sub_type = sub_type if sub_type != 'FULL' 
else None current_sub = { 'url': text_track['src'], - 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type))) + 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' ') } - lang = '_'.join(filter(None, ( - text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type))) + lang = join_nonempty(text_track.get('language', 'und'), + version if version != 'Simulcast' else None, + sub_type, delim='_') if current_sub not in subtitles.get(lang, []): subtitles.setdefault(lang, []).append(current_sub) return subtitles -class FunimationShowIE(FunimationIE): +class FunimationShowIE(FunimationBaseIE): IE_NAME = 'funimation:show' _VALID_URL = r'(?Phttps?://(?:www\.)?funimation(?:\.com|now\.uk)/(?P[^/]+)?/?shows/(?P[^/?#&]+))/?(?:[?#]|$)' @@ -302,24 +326,29 @@ class FunimationShowIE(FunimationIE): }, }] + def _real_initialize(self): + if not self._REGION: + FunimationBaseIE._REGION = self._get_region() + def _real_extract(self, url): base_url, locale, display_id = self._match_valid_url(url).groups() show_info = self._download_json( - 'https://title-api.prd.funimationsvc.com/v2/shows/%s?region=US&deviceType=web&locale=%s' - % (display_id, locale or 'en'), display_id) - items = self._download_json( + 'https://title-api.prd.funimationsvc.com/v2/shows/%s?region=%s&deviceType=web&locale=%s' + % (display_id, self._REGION, locale or 'en'), display_id) + items_info = self._download_json( 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s' - % show_info.get('id'), display_id).get('items') - vod_items = map(lambda k: dict_get(k, ('mostRecentSvod', 'mostRecentAvod')).get('item'), items) + % show_info.get('id'), display_id) + + vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item')) return { '_type': 'playlist', 'id': show_info['id'], 'title': show_info['name'], - 'entries': [ + 'entries': orderedSet( self.url_result( '%s/%s' % 
(base_url, vod_item.get('episodeSlug')), FunimationPageIE.ie_key(), vod_item.get('episodeId'), vod_item.get('episodeName')) - for vod_item in sorted(vod_items, key=lambda x: x.get('episodeOrder'))], + for vod_item in sorted(vod_items, key=lambda x: x.get('episodeOrder', -1))), } diff --git a/yt_dlp/extractor/gab.py b/yt_dlp/extractor/gab.py index 25b5cb0664..9ba0b1ca19 100644 --- a/yt_dlp/extractor/gab.py +++ b/yt_dlp/extractor/gab.py @@ -6,12 +6,16 @@ from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, + parse_codecs, + parse_duration, str_to_int, + unified_timestamp ) class GabTVIE(InfoExtractor): - _VALID_URL = r'(?:https?://)tv.gab.com/channel/[^/]+/view/(?P[a-z0-9-]+)' + _VALID_URL = r'https?://tv\.gab\.com/channel/[^/]+/view/(?P[a-z0-9-]+)' _TESTS = [{ 'url': 'https://tv.gab.com/channel/wurzelroot/view/why-was-america-in-afghanistan-61217eacea5665de450d0488', 'info_dict': { @@ -32,8 +36,10 @@ def _real_extract(self, url): channel_name = self._search_regex(r'data-channel-name=\"(?P[^\"]+)', webpage, 'channel_name') title = self._search_regex(r'data-episode-title=\"(?P[^\"]+)', webpage, 'title') view_key = self._search_regex(r'data-view-key=\"(?P[^\"]+)', webpage, 'view_key') - description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None - available_resolutions = re.findall(r'[^\"]+)' % id, webpage) + description = clean_html( + self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None + available_resolutions = re.findall(r'[^\"]+)' % id, + webpage) formats = [] for resolution in available_resolutions: @@ -62,3 +68,80 @@ def _real_extract(self, url): 'uploader_id': channel_id, 'thumbnail': f'https://tv.gab.com/image/{id}', } + + +class GabIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gab\.com/[^/]+/posts/(?P\d+)' + _TESTS = [{ + 'url': 'https://gab.com/SomeBitchIKnow/posts/107163961867310434', + 'md5': 
'8ca34fb00f1e1033b5c5988d79ec531d', + 'info_dict': { + 'id': '107163961867310434-0', + 'ext': 'mp4', + 'title': 'L on Gab', + 'uploader_id': '946600', + 'uploader': 'SomeBitchIKnow', + 'description': 'md5:204055fafd5e1a519f5d6db953567ca3', + 'timestamp': 1635192289, + 'upload_date': '20211025', + } + }, { + 'url': 'https://gab.com/TheLonelyProud/posts/107045884469287653', + 'md5': 'f9cefcfdff6418e392611a828d47839d', + 'info_dict': { + 'id': '107045884469287653-0', + 'ext': 'mp4', + 'title': 'Jody Sadowski on Gab', + 'uploader_id': '1390705', + 'timestamp': 1633390571, + 'upload_date': '20211004', + 'uploader': 'TheLonelyProud', + } + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + json_data = self._download_json(f'https://gab.com/api/v1/statuses/{post_id}', post_id) + + entries = [] + for idx, media in enumerate(json_data['media_attachments']): + if media.get('type') not in ('video', 'gifv'): + continue + metadata = media['meta'] + format_metadata = { + 'acodec': parse_codecs(metadata.get('audio_encode')).get('acodec'), + 'asr': int_or_none((metadata.get('audio_bitrate') or '').split(' ')[0]), + 'fps': metadata.get('fps'), + } + + formats = [{ + 'url': url, + 'width': f.get('width'), + 'height': f.get('height'), + 'tbr': int_or_none(f.get('bitrate'), scale=1000), + **format_metadata, + } for url, f in ((media.get('url'), metadata.get('original') or {}), + (media.get('source_mp4'), metadata.get('playable') or {})) if url] + + self._sort_formats(formats) + + author = json_data.get('account') or {} + entries.append({ + 'id': f'{post_id}-{idx}', + 'title': f'{json_data["account"]["display_name"]} on Gab', + 'timestamp': unified_timestamp(json_data.get('created_at')), + 'formats': formats, + 'description': clean_html(json_data.get('content')), + 'duration': metadata.get('duration') or parse_duration(metadata.get('length')), + 'like_count': json_data.get('favourites_count'), + 'comment_count': json_data.get('replies_count'), + 'repost_count': 
json_data.get('reblogs_count'), + 'uploader': author.get('username'), + 'uploader_id': author.get('id'), + 'uploader_url': author.get('url'), + }) + + if len(entries) > 1: + return self.playlist_result(entries, post_id) + + return entries[0] diff --git a/yt_dlp/extractor/gamejolt.py b/yt_dlp/extractor/gamejolt.py new file mode 100644 index 0000000000..7f2f6f3e1d --- /dev/null +++ b/yt_dlp/extractor/gamejolt.py @@ -0,0 +1,540 @@ +# coding: utf-8 +import itertools +import json +import math + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + determine_ext, + int_or_none, + str_or_none, + traverse_obj, + try_get +) + + +class GameJoltBaseIE(InfoExtractor): + _API_BASE = 'https://gamejolt.com/site-api/' + + def _call_api(self, endpoint, *args, **kwargs): + kwargs.setdefault('headers', {}).update({'Accept': 'image/webp,*/*'}) + return self._download_json(self._API_BASE + endpoint, *args, **kwargs)['payload'] + + def _parse_content_as_text(self, content): + outer_contents, joined_contents = content.get('content') or [], [] + for outer_content in outer_contents: + if outer_content.get('type') != 'paragraph': + joined_contents.append(self._parse_content_as_text(outer_content)) + continue + inner_contents, inner_content_text = outer_content.get('content') or [], '' + for inner_content in inner_contents: + if inner_content.get('text'): + inner_content_text += inner_content['text'] + elif inner_content.get('type') == 'hardBreak': + inner_content_text += '\n' + joined_contents.append(inner_content_text) + + return '\n'.join(joined_contents) + + def _get_comments(self, post_num_id, post_hash_id): + sort_by, scroll_id = self._configuration_arg('comment_sort', ['hot'], ie_key=GameJoltIE.ie_key())[0], -1 + is_scrolled = sort_by in ('new', 'you') + for page in itertools.count(1): + comments_data = self._call_api( + 'comments/Fireside_Post/%s/%s?%s=%d' % ( + post_num_id, sort_by, + 'scroll_id' if is_scrolled else 'page', 
scroll_id if is_scrolled else page), + post_hash_id, note='Downloading comments list page %d' % page) + if not comments_data.get('comments'): + break + for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict, default=[]): + yield { + 'id': comment['id'], + 'text': self._parse_content_as_text( + self._parse_json(comment['comment_content'], post_hash_id)), + 'timestamp': int_or_none(comment.get('posted_on'), scale=1000), + 'like_count': comment.get('votes'), + 'author': traverse_obj(comment, ('user', ('display_name', 'name')), expected_type=str_or_none, get_all=False), + 'author_id': traverse_obj(comment, ('user', 'username'), expected_type=str_or_none), + 'author_thumbnail': traverse_obj(comment, ('user', 'image_avatar'), expected_type=str_or_none), + 'parent': comment.get('parent_id') or None, + } + scroll_id = int_or_none(comments_data['comments'][-1].get('posted_on')) + + def _parse_post(self, post_data): + post_id = post_data['hash'] + lead_content = self._parse_json(post_data.get('lead_content') or '{}', post_id, fatal=False) or {} + description, full_description = post_data.get('leadStr') or self._parse_content_as_text( + self._parse_json(post_data.get('lead_content'), post_id)), None + if post_data.get('has_article'): + article_content = self._parse_json( + post_data.get('article_content') + or self._call_api(f'web/posts/article/{post_data.get("id", post_id)}', post_id, + note='Downloading article metadata', errnote='Unable to download article metadata', fatal=False).get('article'), + post_id, fatal=False) + full_description = self._parse_content_as_text(article_content) + + user_data = post_data.get('user') or {} + info_dict = { + 'extractor_key': GameJoltIE.ie_key(), + 'extractor': 'GameJolt', + 'webpage_url': str_or_none(post_data.get('url')) or f'https://gamejolt.com/p/{post_id}', + 'id': post_id, + 'title': description, + 'description': full_description or description, + 'display_id': post_data.get('slug'), + 
'uploader': user_data.get('display_name') or user_data.get('name'), + 'uploader_id': user_data.get('username'), + 'uploader_url': 'https://gamejolt.com' + user_data['url'] if user_data.get('url') else None, + 'categories': [try_get(category, lambda x: '%s - %s' % (x['community']['name'], x['channel'].get('display_title') or x['channel']['title'])) + for category in post_data.get('communities' or [])], + 'tags': traverse_obj( + lead_content, ('content', ..., 'content', ..., 'marks', ..., 'attrs', 'tag'), expected_type=str_or_none), + 'like_count': int_or_none(post_data.get('like_count')), + 'comment_count': int_or_none(post_data.get('comment_count'), default=0), + 'timestamp': int_or_none(post_data.get('added_on'), scale=1000), + 'release_timestamp': int_or_none(post_data.get('published_on'), scale=1000), + '__post_extractor': self.extract_comments(post_data.get('id'), post_id) + } + + # TODO: Handle multiple videos/embeds? + video_data = traverse_obj(post_data, ('videos', ...), expected_type=dict, get_all=False) or {} + formats, subtitles, thumbnails = [], {}, [] + for media in video_data.get('media') or []: + media_url, mimetype, ext, media_id = media['img_url'], media.get('filetype', ''), determine_ext(media['img_url']), media.get('type') + if mimetype == 'application/vnd.apple.mpegurl' or ext == 'm3u8': + hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(media_url, post_id, 'mp4', m3u8_id=media_id) + formats.extend(hls_formats) + subtitles.update(hls_subs) + elif mimetype == 'application/dash+xml' or ext == 'mpd': + dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles(media_url, post_id, mpd_id=media_id) + formats.extend(dash_formats) + subtitles.update(dash_subs) + elif 'image' in mimetype: + thumbnails.append({ + 'id': media_id, + 'url': media_url, + 'width': media.get('width'), + 'height': media.get('height'), + 'filesize': media.get('filesize'), + }) + else: + formats.append({ + 'format_id': media_id, + 'url': media_url, + 
'width': media.get('width'), + 'height': media.get('height'), + 'filesize': media.get('filesize'), + 'acodec': 'none' if 'video-card' in media_url else None, + }) + + if formats: + return { + **info_dict, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'view_count': int_or_none(video_data.get('view_count')), + } + + gif_entries = [] + for media in post_data.get('media', []): + if determine_ext(media['img_url']) != 'gif' or 'gif' not in media.get('filetype', ''): + continue + gif_entries.append({ + 'id': media['hash'], + 'title': media['filename'].split('.')[0], + 'formats': [{ + 'format_id': url_key, + 'url': media[url_key], + 'width': media.get('width') if url_key == 'img_url' else None, + 'height': media.get('height') if url_key == 'img_url' else None, + 'filesize': media.get('filesize') if url_key == 'img_url' else None, + 'acodec': 'none', + } for url_key in ('img_url', 'mediaserver_url', 'mediaserver_url_mp4', 'mediaserver_url_webm') if media.get(url_key)] + }) + if gif_entries: + return { + '_type': 'playlist', + **info_dict, + 'entries': gif_entries, + } + + embed_url = traverse_obj(post_data, ('embeds', ..., 'url'), expected_type=str_or_none, get_all=False) + if embed_url: + return self.url_result(embed_url) + return info_dict + + +class GameJoltIE(GameJoltBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/p/(?:[\w-]*-)?(?P\w{8})' + _TESTS = [{ + # No audio + 'url': 'https://gamejolt.com/p/introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu', + 'md5': 'cd5f733258f6678b0ce500dd88166d86', + 'info_dict': { + 'id': 'c6achnzu', + 'ext': 'mp4', + 'display_id': 'introducing-ramses-jackson-some-fnf-himbo-i-ve-been-animating-fo-c6achnzu', + 'title': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin', + 'description': 'Introducing Ramses Jackson, some FNF himbo I’ve been animating for the past few days, hehe.\n#fnfmod #fridaynightfunkin', 
+ 'uploader': 'Jakeneutron', + 'uploader_id': 'Jakeneutron', + 'uploader_url': 'https://gamejolt.com/@Jakeneutron', + 'categories': ['Friday Night Funkin\' - Videos'], + 'tags': ['fnfmod', 'fridaynightfunkin'], + 'timestamp': 1633499590, + 'upload_date': '20211006', + 'release_timestamp': 1633499655, + 'release_date': '20211006', + 'thumbnail': 're:^https?://.+wgch9mhq.png$', + 'like_count': int, + 'comment_count': int, + 'view_count': int, + } + }, { + # YouTube embed + 'url': 'https://gamejolt.com/p/hey-hey-if-there-s-anyone-who-s-looking-to-get-into-learning-a-n6g4jzpq', + 'md5': '79a931ff500a5c783ef6c3bda3272e32', + 'info_dict': { + 'id': 'XsNA_mzC0q4', + 'title': 'Adobe Animate CC 2021 Tutorial || Part 1 - The Basics', + 'description': 'md5:9d1ab9e2625b3fe1f42b2a44c67fdd13', + 'uploader': 'Jakeneutron', + 'uploader_id': 'Jakeneutron', + 'uploader_url': 'http://www.youtube.com/user/Jakeneutron', + 'ext': 'mp4', + 'duration': 1749, + 'tags': ['Adobe Animate CC', 'Tutorial', 'Animation', 'The Basics', 'For Beginners'], + 'like_count': int, + 'playable_in_embed': True, + 'categories': ['Education'], + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/XsNA_mzC0q4/maxresdefault.webp', + 'age_limit': 0, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UC6_L7fnczNalFZyBthUE9oA', + 'channel': 'Jakeneutron', + 'channel_id': 'UC6_L7fnczNalFZyBthUE9oA', + 'upload_date': '20211015', + 'view_count': int, + 'chapters': 'count:18', + } + }, { + # Article + 'url': 'https://gamejolt.com/p/i-fuckin-broke-chaos-d56h3eue', + 'md5': '786c1ccf98fde02c03a2768acb4258d0', + 'info_dict': { + 'id': 'd56h3eue', + 'ext': 'mp4', + 'display_id': 'i-fuckin-broke-chaos-d56h3eue', + 'title': 'I fuckin broke Chaos.', + 'description': 'I moved my tab durning the cutscene so now it\'s stuck like this.', + 'uploader': 'Jeff____________', + 'uploader_id': 'The_Nyesh_Man', + 'uploader_url': 'https://gamejolt.com/@The_Nyesh_Man', + 'categories': 
['Friday Night Funkin\' - Videos'], + 'timestamp': 1639800264, + 'upload_date': '20211218', + 'release_timestamp': 1639800330, + 'release_date': '20211218', + 'thumbnail': 're:^https?://.+euksy8bd.png$', + 'like_count': int, + 'comment_count': int, + 'view_count': int, + } + }, { + # Single GIF + 'url': 'https://gamejolt.com/p/hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8', + 'info_dict': { + 'id': 'vs4gdrd8', + 'display_id': 'hello-everyone-i-m-developing-a-pixel-art-style-mod-for-fnf-and-i-vs4gdrd8', + 'title': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9', + 'description': 'md5:cc3d8b031d9bc7ec2ec5a9ffc707e1f9', + 'uploader': 'Quesoguy', + 'uploader_id': 'CheeseguyDev', + 'uploader_url': 'https://gamejolt.com/@CheeseguyDev', + 'categories': ['Game Dev - General', 'Arts n\' Crafts - Creations', 'Pixel Art - showcase', + 'Friday Night Funkin\' - Mods', 'Newgrounds - Friday Night Funkin (13+)'], + 'timestamp': 1639517122, + 'release_timestamp': 1639519966, + 'like_count': int, + 'comment_count': int, + }, + 'playlist': [{ + 'info_dict': { + 'id': 'dszyjnwi', + 'ext': 'webm', + 'title': 'gif-presentacion-mejorado-dszyjnwi', + 'n_entries': 1, + } + }] + }, { + # Multiple GIFs + 'url': 'https://gamejolt.com/p/gif-yhsqkumq', + 'playlist_count': 35, + 'info_dict': { + 'id': 'yhsqkumq', + 'display_id': 'gif-yhsqkumq', + 'title': 'GIF', + 'description': 'GIF', + 'uploader': 'DaniilTvman', + 'uploader_id': 'DaniilTvman', + 'uploader_url': 'https://gamejolt.com/@DaniilTvman', + 'categories': ['Five Nights At The AGK Studio Comunity - NEWS game'], + 'timestamp': 1638721559, + 'release_timestamp': 1638722276, + 'like_count': int, + 'comment_count': int, + }, + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + post_data = self._call_api( + f'web/posts/view/{post_id}', post_id)['post'] + return self._parse_post(post_data) + + +class GameJoltPostListBaseIE(GameJoltBaseIE): + def _entries(self, endpoint, list_id, note='Downloading post 
list', errnote='Unable to download post list', initial_items=[]): + page_num, scroll_id = 1, None + items = initial_items or self._call_api(endpoint, list_id, note=note, errnote=errnote)['items'] + while items: + for item in items: + yield self._parse_post(item['action_resource_model']) + scroll_id = items[-1]['scroll_id'] + page_num += 1 + items = self._call_api( + endpoint, list_id, note=f'{note} page {page_num}', errnote=errnote, data=json.dumps({ + 'scrollDirection': 'from', + 'scrollId': scroll_id, + }).encode('utf-8')).get('items') + + +class GameJoltUserIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/@(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://gamejolt.com/@BlazikenSuperStar', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '6116784', + 'title': 'S. Blaze', + 'description': 'md5:5ba7fbbb549e8ea2545aafbfe22eb03a', + }, + 'params': { + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + user_data = self._call_api( + f'web/profile/@{user_id}', user_id, note='Downloading user info', errnote='Unable to download user info')['user'] + bio = self._parse_content_as_text( + self._parse_json(user_data.get('bio_content', '{}'), user_id, fatal=False) or {}) + return self.playlist_result( + self._entries(f'web/posts/fetch/user/@{user_id}?tab=active', user_id, 'Downloading user posts', 'Unable to download user posts'), + str_or_none(user_data.get('id')), user_data.get('display_name') or user_data.get('name'), bio) + + +class GameJoltGameIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/games/[\w-]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://gamejolt.com/games/Friday4Fun/655124', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '655124', + 'title': 'Friday Night Funkin\': Friday 4 Fun', + 'description': 'md5:576a7dd87912a2dcf33c50d2bd3966d3' + }, + 
'params': { + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }] + + def _real_extract(self, url): + game_id = self._match_id(url) + game_data = self._call_api( + f'web/discover/games/{game_id}', game_id, note='Downloading game info', errnote='Unable to download game info')['game'] + description = self._parse_content_as_text( + self._parse_json(game_data.get('description_content', '{}'), game_id, fatal=False) or {}) + return self.playlist_result( + self._entries(f'web/posts/fetch/game/{game_id}', game_id, 'Downloading game posts', 'Unable to download game posts'), + game_id, game_data.get('title'), description) + + +class GameJoltGameSoundtrackIE(GameJoltBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/get/soundtrack(?:\?|\#!?)(?:.*?[&;])??game=(?P(?:\d+)+)' + _TESTS = [{ + 'url': 'https://gamejolt.com/get/soundtrack?foo=bar&game=657899', + 'info_dict': { + 'id': '657899', + 'title': 'Friday Night Funkin\': Vs Oswald', + 'n_entries': None, + }, + 'playlist': [{ + 'info_dict': { + 'id': '184434', + 'ext': 'mp3', + 'title': 'Gettin\' Lucky (Menu Music)', + 'url': r're:^https://.+vs-oswald-menu-music\.mp3$', + 'release_timestamp': 1635190816, + 'release_date': '20211025', + 'n_entries': 3, + } + }, { + 'info_dict': { + 'id': '184435', + 'ext': 'mp3', + 'title': 'Rabbit\'s Luck (Extended Version)', + 'url': r're:^https://.+rabbit-s-luck--full-version-\.mp3$', + 'release_timestamp': 1635190841, + 'release_date': '20211025', + 'n_entries': 3, + } + }, { + 'info_dict': { + 'id': '185228', + 'ext': 'mp3', + 'title': 'Last Straw', + 'url': r're:^https://.+last-straw\.mp3$', + 'release_timestamp': 1635881104, + 'release_date': '20211102', + 'n_entries': 3, + } + }] + }] + + def _real_extract(self, url): + game_id = self._match_id(url) + game_overview = self._call_api( + f'web/discover/games/overview/{game_id}', game_id, note='Downloading soundtrack info', 
errnote='Unable to download soundtrack info') + return self.playlist_result([{ + 'id': str_or_none(song.get('id')), + 'title': str_or_none(song.get('title')), + 'url': str_or_none(song.get('url')), + 'release_timestamp': int_or_none(song.get('posted_on'), scale=1000), + } for song in game_overview.get('songs') or []], game_id, traverse_obj( + game_overview, ('microdata', 'name'), (('twitter', 'fb'), 'title'), expected_type=str_or_none, get_all=False)) + + +class GameJoltCommunityIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/c/(?P(?P[\w-]+)(?:/(?P[\w-]+))?)(?:(?:\?|\#!?)(?:.*?[&;])??sort=(?P\w+))?' + _TESTS = [{ + 'url': 'https://gamejolt.com/c/fnf/videos', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'fnf/videos', + 'title': 'Friday Night Funkin\' - Videos', + 'description': 'md5:6d8c06f27460f7d35c1554757ffe53c8' + }, + 'params': { + 'playlistend': 50, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }, { + 'url': 'https://gamejolt.com/c/youtubers', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'youtubers/featured', + 'title': 'Youtubers - featured', + 'description': 'md5:53e5582c93dcc467ab597bfca4db17d4' + }, + 'params': { + 'playlistend': 50, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }] + + def _real_extract(self, url): + display_id, community_id, channel_id, sort_by = self._match_valid_url(url).group('id', 'community', 'channel', 'sort') + channel_id, sort_by = channel_id or 'featured', sort_by or 'new' + + community_data = self._call_api( + f'web/communities/view/{community_id}', display_id, + note='Downloading community info', errnote='Unable to download community info')['community'] + channel_data = traverse_obj(self._call_api( + f'web/communities/view-channel/{community_id}/{channel_id}', display_id, + 
note='Downloading channel info', errnote='Unable to download channel info', fatal=False), 'channel') or {} + + title = f'{community_data.get("name") or community_id} - {channel_data.get("display_title") or channel_id}' + description = self._parse_content_as_text( + self._parse_json(community_data.get('description_content') or '{}', display_id, fatal=False) or {}) + return self.playlist_result( + self._entries( + f'web/posts/fetch/community/{community_id}?channels[]={sort_by}&channels[]={channel_id}', + display_id, 'Downloading community posts', 'Unable to download community posts'), + f'{community_id}/{channel_id}', title, description) + + +class GameJoltSearchIE(GameJoltPostListBaseIE): + _VALID_URL = r'https?://(?:www\.)?gamejolt\.com/search(?:/(?Pcommunities|users|games))?(?:\?|\#!?)(?:.*?[&;])??q=(?P(?:[^&#]+)+)' + _URL_FORMATS = { + 'users': 'https://gamejolt.com/@{username}', + 'communities': 'https://gamejolt.com/c/{path}', + 'games': 'https://gamejolt.com/games/{slug}/{id}', + } + _TESTS = [{ + 'url': 'https://gamejolt.com/search?foo=bar&q=%23fnf', + 'playlist_mincount': 50, + 'info_dict': { + 'id': '#fnf', + 'title': '#fnf', + }, + 'params': { + 'playlistend': 50, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['skipping format', 'No video formats found', 'Requested format is not available'], + }, { + 'url': 'https://gamejolt.com/search/communities?q=cookie%20run', + 'playlist_mincount': 10, + 'info_dict': { + 'id': 'cookie run', + 'title': 'cookie run', + }, + }, { + 'url': 'https://gamejolt.com/search/users?q=mlp', + 'playlist_mincount': 278, + 'info_dict': { + 'id': 'mlp', + 'title': 'mlp', + }, + }, { + 'url': 'https://gamejolt.com/search/games?q=roblox', + 'playlist_mincount': 688, + 'info_dict': { + 'id': 'roblox', + 'title': 'roblox', + }, + }] + + def _search_entries(self, query, filter_mode, display_query): + initial_search_data = self._call_api( + f'web/search/{filter_mode}?q={query}', display_query, + note=f'Downloading 
{filter_mode} list', errnote=f'Unable to download {filter_mode} list') + entries_num = traverse_obj(initial_search_data, 'count', f'{filter_mode}Count') + if not entries_num: + return + for page in range(1, math.ceil(entries_num / initial_search_data['perPage']) + 1): + search_results = self._call_api( + f'web/search/{filter_mode}?q={query}&page={page}', display_query, + note=f'Downloading {filter_mode} list page {page}', errnote=f'Unable to download {filter_mode} list') + for result in search_results[filter_mode]: + yield self.url_result(self._URL_FORMATS[filter_mode].format(**result)) + + def _real_extract(self, url): + filter_mode, query = self._match_valid_url(url).group('filter', 'id') + display_query = compat_urllib_parse_unquote(query) + return self.playlist_result( + self._search_entries(query, filter_mode, display_query) if filter_mode else self._entries( + f'web/posts/fetch/search/{query}', display_query, initial_items=self._call_api( + f'web/search?q={query}', display_query, + note='Downloading initial post list', errnote='Unable to download initial post list')['posts']), + display_query, display_query) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index d08f8f30de..5dafef2837 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -56,7 +56,7 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -135,6 +135,8 @@ from .medialaan import MedialaanIE from .simplecast import SimplecastIE from .wimtv import WimTVIE +from .tvp import TVPEmbedIE +from .blogger import BloggerIE class GenericIE(InfoExtractor): @@ -359,9 +361,6 @@ class GenericIE(InfoExtractor): 'formats': 'mincount:9', 'upload_date': '20130904', }, - 'params': { - 'format': 'bestvideo', - }, }, # m3u8 served with Content-Type: audio/x-mpegURL; 
charset=utf-8 { @@ -1188,6 +1187,21 @@ class GenericIE(InfoExtractor): }, 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', }, + # jwplayer with only the json URL + { + 'url': 'https://www.hollywoodreporter.com/news/general-news/dunkirk-team-reveals-what-christopher-nolan-said-oscar-win-meet-your-oscar-winner-1092454', + 'info_dict': { + 'id': 'TljWkvWH', + 'ext': 'mp4', + 'upload_date': '20180306', + 'title': 'md5:91eb1862f6526415214f62c00b453936', + 'description': 'md5:73048ae50ae953da10549d1d2fe9b3aa', + 'timestamp': 1520367225, + }, + 'params': { + 'skip_download': True, + }, + }, # Complex jwplayer { 'url': 'http://www.indiedb.com/games/king-machine/videos', @@ -1215,14 +1229,13 @@ class GenericIE(InfoExtractor): }, { # JWPlatform iframe - 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', - 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', 'info_dict': { - 'id': 'O0c5JcKT', + 'id': 'AG26UQXM', 'ext': 'mp4', - 'upload_date': '20171122', - 'timestamp': 1511366290, - 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', + 'upload_date': '20160719', + 'timestamp': 468923808, + 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', }, 'add_ie': [JWPlatformIE.ie_key()], }, @@ -2161,6 +2174,17 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # blogger embed + 'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'title': 'Blogger', + 'thumbnail': r're:^https?://.*', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2320,12 +2344,55 @@ class GenericIE(InfoExtractor): 'thumbnail': 
'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', } }, + { + # KVS Player (for sites that serve kt_player.js via non-https urls) + 'url': 'http://www.camhub.world/embed/389508', + 'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32', + 'info_dict': { + 'id': '389508', + 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', + 'ext': 'mp4', + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', + } + }, + { + # Reddit-hosted video that will redirect and be processed by RedditIE + # Redirects to https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '87f5f02f6c1582654146f830f21f8662', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'timestamp': 1501941939.0, + 'title': 'That small heart attack.', + 'upload_date': '20170805', + 'uploader': 'Antw87' + } + }, + { + # 1080p Reddit-hosted video that will redirect and be processed by RedditIE + 'url': 'https://v.redd.it/33hgok7dfbz71/', + 'md5': '7a1d587940242c9bb3bd6eb320b39258', + 'info_dict': { + 'id': '33hgok7dfbz71', + 'ext': 'mp4', + 'title': "The game Didn't want me to Knife that Guy I guess", + 'uploader': 'paraf1ve', + 'timestamp': 1636788683.0, + 'upload_date': '20211113' + } + } + # ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) + def report_detected(self, name): + self._downloader.write_debug(f'Identified a {name}') + def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -2541,10 +2608,13 @@ def _real_extract(self, url): content_type = head_response.headers.get('Content-Type', '').lower() m = 
re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: + self.report_detected('direct video link') format_id = compat_str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: @@ -2581,6 +2651,7 @@ def _real_extract(self, url): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): + self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._sort_formats(info_dict['formats']) return info_dict @@ -2611,16 +2682,20 @@ def _real_extract(self, url): except compat_xml_parse_error: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': + self.report_detected('RSS feed') return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) + self.report_detected('ISM manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) + self.report_detected('SMIL file') self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': + self.report_detected('XSPF playlist') return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, @@ -2631,10 +2706,12 @@ def _real_extract(self, url): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + self.report_detected('DASH manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', 
doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self.report_detected('F4M manifest') self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: @@ -2643,6 +2720,7 @@ def _real_extract(self, url): # Is it a Camtasia project? camtasia_res = self._extract_camtasia(url, video_id, webpage) if camtasia_res is not None: + self.report_detected('Camtasia video') return camtasia_res # Sometimes embedded video player is hidden behind percent encoding @@ -2693,6 +2771,8 @@ def _real_extract(self, url): 'age_limit': age_limit, }) + self._downloader.write_debug('Looking for video embeds') + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -2756,12 +2836,6 @@ def _real_extract(self, url): if vhx_url: return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') - # Invidious Instances # https://github.com/yt-dlp/yt-dlp/issues/195 # https://github.com/iv-org/invidious/pull/1730 @@ -3196,6 +3270,11 @@ def _real_extract(self, url): if onionstudios_url: return self.url_result(onionstudios_url) + # Look for Blogger embeds + blogger_urls = BloggerIE._extract_urls(webpage) + if blogger_urls: + return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) + # Look for ViewLift embeds viewlift_url = ViewLiftEmbedIE._extract_url(webpage) if viewlift_url: @@ -3489,9 +3568,14 @@ def _real_extract(self, url): return self.playlist_from_matches( rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) + tvp_urls = TVPEmbedIE._extract_urls(webpage) + if tvp_urls: + return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, 
webpage, video_id, m3u8_id='hls') if entries: + self.report_detected('HTML5 media') if len(entries) == 1: entries[0].update({ 'id': video_id, @@ -3510,9 +3594,18 @@ def _real_extract(self, url): jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: + if isinstance(jwplayer_data.get('playlist'), str): + self.report_detected('JW Player playlist') + return { + **info_dict, + '_type': 'url', + 'ie_key': JWPlatformIE.ie_key(), + 'url': jwplayer_data['playlist'], + } try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) + self.report_detected('JW Player data') return merge_dicts(info, info_dict) except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 @@ -3562,15 +3655,20 @@ def _real_extract(self, url): }, }) if formats or subtitles: + self.report_detected('video.js embed') self._sort_formats(formats) info_dict['formats'] = formats info_dict['subtitles'] = subtitles return info_dict # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') + json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url'): + self.report_detected('JSON LD') + if determine_ext(json_ld.get('url')) == 'm3u8': + json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( + json_ld['url'], video_id, 'mp4') + json_ld.pop('url') return merge_dicts(json_ld, info_dict) def check_video(vurl): @@ -3587,7 +3685,9 @@ def filter_video(urls): # Start with something easy: JW Player in SWFObject found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: + if found: + self.report_detected('JW Player in SFWObject') + else: # Look for gorilla-vid style embedding found = filter_video(re.findall(r'''(?sx) (?: @@ -3597,10 +3697,13 @@ def filter_video(urls): ) .*? 
['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) + if found: + self.report_detected('JW Player embed') if not found: # Look for generic KVS player - found = re.search(r'', webpage) @@ -3646,10 +3749,14 @@ def filter_video(urls): if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) + if found: + self.report_detected('video file') if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + if found: + self.report_detected('JW Player JS loader') if not found: # Flow player found = filter_video(re.findall(r'''(?xs) @@ -3658,10 +3765,14 @@ def filter_video(urls): \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage)) + if found: + self.report_detected('Flow Player') if not found: # Cinerama player found = re.findall( r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) + if found: + self.report_detected('Cinerama player') if not found: # Try to find twitter cards info # twitter:player:stream should be checked before twitter:player since @@ -3669,6 +3780,8 @@ def filter_video(urls): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'[a-z0-9]+)' + _MEDIA_BASE_URL = 'https://media.gettr.com/' + + _TESTS = [{ + 'url': 'https://www.gettr.com/post/pcf6uv838f', + 'info_dict': { + 'id': 'pcf6uv838f', + 'title': 'md5:9086a646bbd06c41c4fe8e52b3c93454', + 'description': 'md5:be0577f1e4caadc06de4a002da2bf287', + 'ext': 'mp4', + 'uploader': 'EpochTV', + 'uploader_id': 'epochtv', + 'thumbnail': r're:^https?://.+/out\.jpg', + 'timestamp': 1632782451058, + 'duration': 58.5585, + } + }, { + 'url': 'https://gettr.com/post/p4iahp', + 'info_dict': { + 'id': 'p4iahp', + 'title': 'md5:b03c07883db6fbc1aab88877a6c3b149', + 'description': 
'md5:741b7419d991c403196ed2ea7749a39d', + 'ext': 'mp4', + 'uploader': 'Neues Forum Freiheit', + 'uploader_id': 'nf_freiheit', + 'thumbnail': r're:^https?://.+/out\.jpg', + 'timestamp': 1626594455017, + 'duration': 23, + } + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + webpage = self._download_webpage(url, post_id) + + api_data = self._download_json( + 'https://api.gettr.com/u/post/%s?incl="poststats|userinfo"' % post_id, post_id) + + post_data = try_get(api_data, lambda x: x['result']['data']) + user_data = try_get(api_data, lambda x: x['result']['aux']['uinf'][post_data['uid']]) or {} + + if post_data.get('nfound'): + raise ExtractorError(post_data.get('txt'), expected=True) + + title = description = str_or_none( + post_data.get('txt') or self._og_search_description(webpage)) + + uploader = str_or_none( + user_data.get('nickname') + or remove_end(self._og_search_title(webpage), ' on GETTR')) + if uploader: + title = '%s - %s' % (uploader, title) + + if not dict_get(post_data, ['vid', 'ovid']): + raise ExtractorError('There\'s no video in this post.') + + vid = post_data.get('vid') + ovid = post_data.get('ovid') + + formats = self._extract_m3u8_formats( + urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') if vid else [] + + if ovid: + formats.append({ + 'url': urljoin(self._MEDIA_BASE_URL, ovid), + 'format_id': 'ovid', + 'ext': 'mp4', + 'width': int_or_none(post_data.get('vid_wid')), + 'height': int_or_none(post_data.get('vid_hgt')), + 'source_preference': 1, + 'quality': 1, + }) + + self._sort_formats(formats) + + return { + 'id': post_id, + 'title': title, + 'description': description, + 'thumbnail': url_or_none( + urljoin(self._MEDIA_BASE_URL, post_data.get('main')) + or self._og_search_thumbnail(webpage)), + 'timestamp': int_or_none(post_data.get('cdate')), + 'uploader_id': str_or_none( + dict_get(user_data, ['_id', 'username']) + or post_data.get('uid')), + 'uploader': uploader, + 
'formats': formats, + 'duration': float_or_none(post_data.get('vid_dur')), + 'tags': post_data.get('htgs'), + } diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py index 18a30fe678..56a6dc03d3 100644 --- a/yt_dlp/extractor/gfycat.py +++ b/yt_dlp/extractor/gfycat.py @@ -24,9 +24,10 @@ class GfycatIE(InfoExtractor): 'duration': 10.4, 'view_count': int, 'like_count': int, - 'dislike_count': int, 'categories': list, 'age_limit': 0, + 'uploader_id': 'anonymous', + 'description': '', } }, { 'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa', @@ -40,9 +41,27 @@ class GfycatIE(InfoExtractor): 'duration': 3.52, 'view_count': int, 'like_count': int, - 'dislike_count': int, 'categories': list, 'age_limit': 0, + 'uploader_id': 'anonymous', + 'description': '', + } + }, { + 'url': 'https://gfycat.com/alienatedsolidgreathornedowl', + 'info_dict': { + 'id': 'alienatedsolidgreathornedowl', + 'ext': 'mp4', + 'upload_date': '20211226', + 'uploader_id': 'reactions', + 'timestamp': 1640536930, + 'like_count': int, + 'description': '', + 'title': 'Ingrid Michaelson, Zooey Deschanel - Merry Christmas Happy New Year', + 'categories': list, + 'age_limit': 0, + 'duration': 2.9583333333333335, + 'uploader': 'Reaction GIFs', + 'view_count': int, } }, { 'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish', @@ -74,7 +93,7 @@ def _real_extract(self, url): title = gfy.get('title') or gfy['gfyName'] description = gfy.get('description') timestamp = int_or_none(gfy.get('createDate')) - uploader = gfy.get('userName') + uploader = gfy.get('userName') or gfy.get('username') view_count = int_or_none(gfy.get('views')) like_count = int_or_none(gfy.get('likes')) dislike_count = int_or_none(gfy.get('dislikes')) @@ -114,7 +133,8 @@ def _real_extract(self, url): 'title': title, 'description': description, 'timestamp': timestamp, - 'uploader': uploader, + 'uploader': gfy.get('userDisplayName') or uploader, + 'uploader_id': uploader, 'duration': duration, 'view_count': view_count, 
'like_count': like_count, diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py index 0cb3aa31bf..a3f0241570 100644 --- a/yt_dlp/extractor/globo.py +++ b/yt_dlp/extractor/globo.py @@ -9,15 +9,14 @@ from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, ) from ..utils import ( ExtractorError, float_or_none, - int_or_none, orderedSet, str_or_none, + try_get, ) @@ -26,18 +25,19 @@ class GloboIE(InfoExtractor): _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', - 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', 'info_dict': { 'id': '3607726', 'ext': 'mp4', 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', 'duration': 103.204, - 'uploader': 'Globo.com', - 'uploader_id': '265', + 'uploader': 'G1', + 'uploader_id': '2015', + }, + 'params': { + 'skip_download': True, }, }, { 'url': 'http://globoplay.globo.com/v/4581987/', - 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff', 'info_dict': { 'id': '4581987', 'ext': 'mp4', @@ -46,6 +46,9 @@ class GloboIE(InfoExtractor): 'uploader': 'Rede Globo', 'uploader_id': '196', }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', 'only_matching': True, @@ -66,30 +69,6 @@ class GloboIE(InfoExtractor): 'only_matching': True, }] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return - - try: - glb_id = (self._download_json( - 'https://login.globo.com/api/authentication', None, data=json.dumps({ - 'payload': { - 'email': email, - 'password': password, - 'serviceId': 4654, - }, - }).encode(), headers={ - 'Content-Type': 'application/json; charset=utf-8', - }) or {}).get('glbId') - if glb_id: - self._set_cookie('.globo.com', 'GLBID', glb_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - 
resp = self._parse_json(e.cause.read(), None) - raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) - raise - def _real_extract(self, url): video_id = self._match_id(url) @@ -102,73 +81,67 @@ def _real_extract(self, url): title = video['title'] formats = [] + security = self._download_json( + 'https://playback.video.globo.com/v1/video-session', video_id, 'Downloading security hash for %s' % video_id, + headers={'content-type': 'application/json'}, data=json.dumps({ + "player_type": "desktop", + "video_id": video_id, + "quality": "max", + "content_protection": "widevine", + "vsid": "581b986b-4c40-71f0-5a58-803e579d5fa2", + "tz": "-3.0:00" + }).encode()) + + security_hash = security['source']['token'] + if not security_hash: + message = security.get('message') + if message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, message), expected=True) + + hash_code = security_hash[:2] + padding = '%010d' % random.randint(1, 10000000000) + if hash_code in ('04', '14'): + received_time = security_hash[3:13] + received_md5 = security_hash[24:] + hash_prefix = security_hash[:23] + elif hash_code in ('02', '12', '03', '13'): + received_time = security_hash[2:12] + received_md5 = security_hash[22:] + padding += '1' + hash_prefix = '05' + security_hash[:22] + + padded_sign_time = compat_str(int(received_time) + 86400) + padding + md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() + signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') + signed_hash = hash_prefix + padded_sign_time + signed_md5 + source = security['source']['url_parts'] + resource_url = source['scheme'] + '://' + source['domain'] + source['path'] + signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') + + formats.extend(self._extract_m3u8_formats( + signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) 
+ subtitles = {} for resource in video['resources']: - resource_id = resource.get('_id') - resource_url = resource.get('url') - resource_type = resource.get('type') - if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'): - continue - - if resource_type == 'subtitle': + if resource.get('type') == 'subtitle': subtitles.setdefault(resource.get('language') or 'por', []).append({ - 'url': resource_url, + 'url': resource.get('url'), }) - continue - - security = self._download_json( - 'http://security.video.globo.com/videos/%s/hash' % video_id, - video_id, 'Downloading security hash for %s' % resource_id, query={ - 'player': 'desktop', - 'version': '5.19.1', - 'resource_id': resource_id, + subs = try_get(security, lambda x: x['source']['subtitles'], expected_type=dict) or {} + for sub_lang, sub_url in subs.items(): + if sub_url: + subtitles.setdefault(sub_lang or 'por', []).append({ + 'url': sub_url, }) - - security_hash = security.get('hash') - if not security_hash: - message = security.get('message') - if message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, message), expected=True) - continue - - hash_code = security_hash[:2] - padding = '%010d' % random.randint(1, 10000000000) - if hash_code in ('04', '14'): - received_time = security_hash[3:13] - received_md5 = security_hash[24:] - hash_prefix = security_hash[:23] - elif hash_code in ('02', '12', '03', '13'): - received_time = security_hash[2:12] - received_md5 = security_hash[22:] - padding += '1' - hash_prefix = '05' + security_hash[:22] - - padded_sign_time = compat_str(int(received_time) + 86400) + padding - md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode() - signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_prefix + padded_sign_time + signed_md5 - signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') 
else 'A', security.get('user') or '') - - if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats( - signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats( - signed_url, resource_id, mpd_id='dash', fatal=False)) - elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'): - formats.extend(self._extract_ism_formats( - signed_url, resource_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': signed_url, - 'format_id': 'http-%s' % resource_id, - 'height': int_or_none(resource.get('height')), + subs = try_get(security, lambda x: x['source']['subtitles_webvtt'], expected_type=dict) or {} + for sub_lang, sub_url in subs.items(): + if sub_url: + subtitles.setdefault(sub_lang or 'por', []).append({ + 'url': sub_url, }) - self._sort_formats(formats) - duration = float_or_none(video.get('duration'), 1000) uploader = video.get('channel') uploader_id = str_or_none(video.get('channel_id')) diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py new file mode 100644 index 0000000000..62d778cfec --- /dev/null +++ b/yt_dlp/extractor/gofile.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get +) + + +class GofileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gofile\.io/d/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://gofile.io/d/AMZyDw', + 'info_dict': { + 'id': 'AMZyDw', + }, + 'playlist_mincount': 2, + 'playlist': [{ + 'info_dict': { + 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31', + 'filesize': 928116, + 'ext': 'mp4', + 'title': 'nuuh' + } + }] + }, { # URL to test mixed file types + 'url': 'https://gofile.io/d/avt34h', + 'info_dict': { + 'id': 'avt34h', + }, + 'playlist_mincount': 1, + }, { # URL to test no video/audio error + 'url': 
'https://gofile.io/d/aB03lZ', + 'info_dict': { + 'id': 'aB03lZ', + }, + 'playlist_count': 0, + 'skip': 'No video/audio found at provided URL.', + }] + _TOKEN = None + + def _real_initialize(self): + token = self._get_cookies('https://gofile.io/').get('accountToken') + if token: + self._TOKEN = token.value + return + + account_data = self._download_json( + 'https://api.gofile.io/createAccount', None, note='Getting a new guest account') + self._TOKEN = account_data['data']['token'] + self._set_cookie('gofile.io', 'accountToken', self._TOKEN) + + def _entries(self, file_id): + files = self._download_json( + f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true', + 'Gofile', note='Getting filelist') + + status = files['status'] + if status != 'ok': + raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) + + found_files = False + for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values(): + file_type, file_format = file.get('mimetype').split('/', 1) + if file_type not in ('video', 'audio') and file_format != 'vnd.mts': + continue + + found_files = True + file_url = file.get('directLink') + if file_url: + yield { + 'id': file['id'], + 'title': file['name'].rsplit('.', 1)[0], + 'url': file_url, + 'filesize': file.get('size'), + 'release_timestamp': file.get('createTime') + } + + if not found_files: + raise ExtractorError('No video/audio found at provided URL.', expected=True) + + def _real_extract(self, url): + file_id = self._match_id(url) + return self.playlist_result(self._entries(file_id), playlist_id=file_id) diff --git a/yt_dlp/extractor/googlesearch.py b/yt_dlp/extractor/googlesearch.py index 5279fa807f..f605c0c35f 100644 --- a/yt_dlp/extractor/googlesearch.py +++ b/yt_dlp/extractor/googlesearch.py @@ -11,6 +11,7 @@ class GoogleSearchIE(SearchInfoExtractor): _MAX_RESULTS = 1000 IE_NAME = 'video.google:search' _SEARCH_KEY = 'gvsearch' + _WORKING = False _TEST = { 
'url': 'gvsearch15:python language', 'info_dict': { @@ -20,16 +21,7 @@ class GoogleSearchIE(SearchInfoExtractor): 'playlist_count': 15, } - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - entries = [] - res = { - '_type': 'playlist', - 'id': query, - 'title': query, - } - + def _search_results(self, query): for pagenum in itertools.count(): webpage = self._download_webpage( 'http://www.google.com/search', @@ -44,16 +36,8 @@ def _get_n_results(self, query, n): for hit_idx, mobj in enumerate(re.finditer( r'

    = n) or not re.search(r'id="pnnext"', webpage): - res['entries'] = entries[:n] - return res + if not re.search(r'id="pnnext"', webpage): + return diff --git a/yt_dlp/extractor/gopro.py b/yt_dlp/extractor/gopro.py new file mode 100644 index 0000000000..10cc1aec1d --- /dev/null +++ b/yt_dlp/extractor/gopro.py @@ -0,0 +1,110 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class GoProIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?gopro\.com/v/(?P[A-Za-z0-9]+)' + + _TESTS = [{ + 'url': 'https://gopro.com/v/ZNVvED8QDzR5V', + 'info_dict': { + 'id': 'ZNVvED8QDzR5V', + 'title': 'My GoPro Adventure - 9/19/21', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + 'timestamp': 1632072947, + 'upload_date': '20210919', + 'uploader_id': 'fireydive30018', + 'duration': 396062, + } + }, { + 'url': 'https://gopro.com/v/KRm6Vgp2peg4e', + 'info_dict': { + 'id': 'KRm6Vgp2peg4e', + 'title': 'じゃがいも カリカリ オーブン焼き', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + 'timestamp': 1607231125, + 'upload_date': '20201206', + 'uploader_id': 'dc9bcb8b-47d2-47c6-afbc-4c48f9a3769e', + 'duration': 45187, + 'track': 'The Sky Machine', + } + }, { + 'url': 'https://gopro.com/v/kVrK9wlJvBMwn', + 'info_dict': { + 'id': 'kVrK9wlJvBMwn', + 'title': 'DARKNESS', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + 'timestamp': 1594183735, + 'upload_date': '20200708', + 'uploader_id': '闇夜乃皇帝', + 'duration': 313075, + 'track': 'Battery (Live)', + 'artist': 'Metallica', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + metadata = self._parse_json( + self._html_search_regex(r'window\.__reflectData\s*=\s*([^;]+)', webpage, 'metadata'), video_id) + + video_info = metadata['collectionMedia'][0] + media_data = self._download_json( + 
'https://api.gopro.com/media/%s/download' % video_info['id'], video_id) + + formats = [] + for fmt in try_get(media_data, lambda x: x['_embedded']['variations']) or []: + format_url = url_or_none(fmt.get('url')) + if not format_url: + continue + formats.append({ + 'url': format_url, + 'format_id': str_or_none(fmt.get('quality')), + 'format_note': str_or_none(fmt.get('label')), + 'ext': str_or_none(fmt.get('type')), + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + }) + + self._sort_formats(formats) + + title = str_or_none( + try_get(metadata, lambda x: x['collection']['title']) + or self._html_search_meta(['og:title', 'twitter:title'], webpage) + or remove_end(self._html_search_regex( + r']*>([^<]+)', webpage, 'title', fatal=False), ' | GoPro')) + if title: + title = title.replace('\n', ' ') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': url_or_none( + self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'timestamp': unified_timestamp( + try_get(metadata, lambda x: x['collection']['created_at'])), + 'uploader_id': str_or_none( + try_get(metadata, lambda x: x['account']['nickname'])), + 'duration': int_or_none( + video_info.get('source_duration')), + 'artist': str_or_none( + video_info.get('music_track_artist')), + 'track': str_or_none( + video_info.get('music_track_name')), + } diff --git a/yt_dlp/extractor/gotostage.py b/yt_dlp/extractor/gotostage.py new file mode 100644 index 0000000000..6aa96106a6 --- /dev/null +++ b/yt_dlp/extractor/gotostage.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + try_get, + url_or_none +) + +import json + + +class GoToStageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gotostage\.com/channel/[a-z0-9]+/recording/(?P[a-z0-9]+)/watch' + _TESTS = [{ + 'url': 
'https://www.gotostage.com/channel/8901680603948959494/recording/60bb55548d434f21b9ce4f0e225c4895/watch', + 'md5': 'ca72ce990cdcd7a2bd152f7217e319a2', + 'info_dict': { + 'id': '60bb55548d434f21b9ce4f0e225c4895', + 'ext': 'mp4', + 'title': 'What is GoToStage?', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 93.924711 + } + }, { + 'url': 'https://www.gotostage.com/channel/bacc3d3535b34bafacc3f4ef8d4df78a/recording/831e74cd3e0042be96defba627b6f676/watch?source=HOMEPAGE', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json( + 'https://api.gotostage.com/contents?ids=%s' % video_id, + video_id, + note='Downloading video metadata', + errnote='Unable to download video metadata')[0] + + registration_data = { + 'product': metadata['product'], + 'resourceType': metadata['contentType'], + 'productReferenceKey': metadata['productRefKey'], + 'firstName': 'foo', + 'lastName': 'bar', + 'email': 'foobar@example.com' + } + + registration_response = self._download_json( + 'https://api-registrations.logmeininc.com/registrations', + video_id, + data=json.dumps(registration_data).encode(), + expected_status=409, + headers={'Content-Type': 'application/json'}, + note='Register user', + errnote='Unable to register user') + + content_response = self._download_json( + 'https://api.gotostage.com/contents/%s/asset' % video_id, + video_id, + headers={'x-registrantkey': registration_response['registrationKey']}, + note='Get download url', + errnote='Unable to get download url') + + return { + 'id': video_id, + 'title': try_get(metadata, lambda x: x['title'], compat_str), + 'url': try_get(content_response, lambda x: x['cdnLocation'], compat_str), + 'ext': 'mp4', + 'thumbnail': url_or_none(try_get(metadata, lambda x: x['thumbnail']['location'])), + 'duration': try_get(metadata, lambda x: x['duration'], float), + 'categories': [try_get(metadata, lambda x: x['category'], compat_str)], + 'is_live': False + } diff 
class GronkhIE(InfoExtractor):
    """Extractor for gronkh.tv stream episodes."""

    # The named group ``id`` is what ``_match_id`` reads back in _real_extract.
    _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://gronkh.tv/stream/536',
        'info_dict': {
            'id': '536',
            'ext': 'mp4',
            'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv',
            'view_count': 19491,
            'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg',
            'upload_date': '20211001'
        },
        'params': {'skip_download': True}
    }, {
        'url': 'https://gronkh.tv/watch/stream/546',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the episode addressed by *url*.

        Metadata comes from the ``video/info`` endpoint and the HLS master
        playlist URL from ``video/playlist``; an optional ``vtt_url`` from
        the metadata is appended to the subtitles under the ``en`` key.
        """
        # Named video_id (not ``id``) so the builtin is not shadowed
        video_id = self._match_id(url)
        data_json = self._download_json(
            f'https://api.gronkh.tv/v1/video/info?episode={video_id}', video_id)
        m3u8_url = self._download_json(
            f'https://api.gronkh.tv/v1/video/playlist?episode={video_id}', video_id)['playlist_url']
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
        if data_json.get('vtt_url'):
            subtitles.setdefault('en', []).append({
                'url': data_json['vtt_url'],
                'ext': 'vtt',
            })
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': data_json.get('title'),
            'view_count': data_json.get('views'),
            'thumbnail': data_json.get('preview_url'),
            'upload_date': unified_strdate(data_json.get('created_at')),
            'formats': formats,
            'subtitles': subtitles,
        }
def _call_api(self, video_id, title, key, data=None, **kwargs):
    """POST to the HIDIVE ``play/settings`` endpoint and return its JSON.

    ``Title``, ``Key`` and the fixed ``PlayerId`` are always sent; *data*
    may contribute additional form fields. Extra keyword arguments are
    forwarded to ``_download_json``. A falsy response is returned as ``{}``.
    """
    # ``data`` defaults to None rather than a shared mutable dict
    payload = {
        **(data or {}),
        'Title': title,
        'Key': key,
        'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783',
    }
    return self._download_json(
        'https://www.hidive.com/play/settings', video_id,
        data=urlencode_postdata(payload), **kwargs) or {}


def _extract_subtitles_from_rendition(self, rendition, subtitles, parsed_urls):
    """Add the ``ccFiles`` of *rendition* to *subtitles* (modified in place).

    Every URL that gets added is recorded in *parsed_urls*, and URLs already
    present there are skipped, so the same file is never listed twice.
    """
    # ``or []`` also covers an explicit null, which ``.get(key, [])`` would not
    for cc_file in rendition.get('ccFiles') or []:
        cc_url = url_or_none(try_get(cc_file, lambda x: x[2]))
        # The name (index 1) is preferred over the language code (index 0)
        # since subtitles sharing a language code can't be told apart otherwise
        cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str)
        if cc_url not in parsed_urls and cc_lang:
            parsed_urls.add(cc_url)
            subtitles.setdefault(cc_lang, []).append({'url': cc_url})


def _get_subtitles(self, url, video_id, title, key, parsed_urls):
    """Collect subtitles for every ``data-captions`` value found on the page.

    The settings API is queried once per distinct caption value; failures of
    the page download or of an individual API call are non-fatal.
    """
    webpage = self._download_webpage(url, video_id, fatal=False) or ''
    subtitles = {}
    for caption in set(re.findall(r'data-captions=\"([^\"]+)\"', webpage)):
        renditions = self._call_api(
            video_id, title, key, {'Captions': caption}, fatal=False,
            note=f'Downloading {caption} subtitle information').get('renditions') or {}
        for rendition in renditions.values():
            self._extract_subtitles_from_rendition(rendition, subtitles, parsed_urls)
    return subtitles


def _real_extract(self, url):
    """Return the info dict for the HIDIVE stream addressed by *url*.

    Raises ExtractorError (expected) when the API reports a restriction
    other than a region restriction, which is raised as geo-restricted.
    """
    video_id, title, key = self._match_valid_url(url).group('id', 'title', 'key')
    settings = self._call_api(video_id, title, key)

    restriction = settings.get('restrictionReason')
    if restriction == 'RegionRestricted':
        self.raise_geo_restricted()
    if restriction and restriction != 'None':
        raise ExtractorError(
            '%s said: %s' % (self.IE_NAME, restriction), expected=True)

    # Seeding with None makes renditions without a usable HLS URL skip themselves
    formats, parsed_urls = [], {None}
    for rendition_id, rendition in settings['renditions'].items():
        # First underscore-separated part is used as the audio language, the
        # remainder as a note; tolerate ids that don't have exactly three parts
        audio, *variant = rendition_id.split('_')
        m3u8_url = url_or_none(try_get(rendition, lambda x: x['bitrates']['hls']))
        if m3u8_url in parsed_urls:
            continue
        parsed_urls.add(m3u8_url)
        frmt = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=rendition_id, fatal=False)
        for f in frmt:
            f['language'] = audio
            f['format_note'] = ', '.join(variant) or None
        formats.extend(frmt)
    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': video_id,
        'subtitles': self.extract_subtitles(url, video_id, title, key, parsed_urls),
        'formats': formats,
        'series': title,
        'season_number': int_or_none(
            self._search_regex(r's(\d+)', key, 'season number', default=None)),
        'episode_number': int_or_none(
            self._search_regex(r'e(\d+)', key, 'episode number', default=None)),
        'http_headers': {'Referer': url}
    }
playback_sets: if not isinstance(playback_set, dict): continue + dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue format_url = re.sub( r'(?<=//staragvod)(\d)', r'web\1', format_url) tags = str_or_none(playback_set.get('tagsCombination')) or '' - if tags and 'encryption:plain' not in tags: - continue ext = determine_ext(format_url) + current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': - hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles( + current_formats, current_subs = self._extract_m3u8_formats_and_subtitles( format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', headers=headers) - formats.extend(hls_formats) - subs = self._merge_subtitles(subs, hls_subs) + m3u8_id=f'{dr}-hls', headers=headers) elif 'package:dash' in tags or ext == 'mpd': - dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id='dash', headers=headers) - formats.extend(dash_formats) - subs = self._merge_subtitles(subs, dash_subs) + current_formats, current_subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) elif ext == 'f4m': # produce broken files pass else: - formats.append({ + current_formats = [{ 'url': format_url, 'width': int_or_none(playback_set.get('width')), 'height': int_or_none(playback_set.get('height')), - }) + }] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: geo_restricted = True continue + if tags and 'encryption:plain' not in tags: + for f in current_formats: + f['has_drm'] = True + if tags and 'language' in tags: + lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang') + for f in current_formats: + if not f.get('langauge'): + f['language'] = lang + formats.extend(current_formats) + subs = self._merge_subtitles(subs, 
current_subs) if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) self._sort_formats(formats) @@ -254,6 +260,9 @@ def _real_extract(self, url): 'season_id': video_data.get('seasonId'), 'episode': title, 'episode_number': int_or_none(video_data.get('episodeNo')), + 'http_headers': { + 'Referer': 'https://www.hotstar.com/in', + } } @@ -287,7 +296,7 @@ def _real_extract(self, url): class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?:https?://)(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -309,7 +318,7 @@ class HotStarSeriesIE(HotStarBaseIE): }] def _real_extract(self, url): - series_id = self._match_id(url) + url, series_id = self._match_valid_url(url).groups() headers = { 'x-country-code': 'IN', 'x-platform-code': 'PCTV', @@ -321,7 +330,7 @@ def _real_extract(self, url): video_id=series_id, headers=headers) entries = [ self.url_result( - 'hotstar:episode:%d' % video['contentId'], + '%s/ignoreme/%d' % (url, video['contentId']), ie=HotStarIE.ie_key(), video_id=video['contentId']) for video in item_json['body']['results']['items'] if video.get('contentId')] diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py new file mode 100644 index 0000000000..9144ff8dcc --- /dev/null +++ b/yt_dlp/extractor/hse.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, +) + + +class HSEShowBaseInfoExtractor(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_redux_data(self, url, video_id): + webpage = self._download_webpage(url, video_id) + redux = self._html_search_regex( + r'window\.__REDUX_DATA__\s*=\s*({.*});?', webpage, 'redux data') + return self._parse_json(redux.replace('\n', ''), video_id) + + def 
class HSEShowIE(HSEShowBaseInfoExtractor):
    """Extractor for hse.de TV show pages (``/dpl/c/tv-shows/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://www.hse.de/dpl/c/tv-shows/505350',
        'info_dict': {
            'id': '505350',
            'ext': 'mp4',
            'title': 'Pfeffinger Mode & Accessoires',
            'timestamp': 1638810000,
            'upload_date': '20211206',
            'channel': 'HSE24',
            'uploader': 'Arina Pirayesh'
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Return the info dict built from the page's redux data.

        Formats and subtitles come from ``tvShowPage.tvShowVideo.sources``;
        the remaining metadata is read from ``tvShowPage.tvShow``.
        """
        video_id = self._match_id(url)
        json_data = self._extract_redux_data(url, video_id)
        formats, subtitles = self._extract_formats_and_subtitles(
            traverse_obj(json_data, ('tvShowPage', 'tvShowVideo', 'sources')), video_id)

        show = traverse_obj(json_data, ('tvShowPage', 'tvShow')) or {}
        return {
            'id': video_id,
            'title': show.get('title') or video_id,
            'formats': formats,
            'timestamp': unified_timestamp(f'{show.get("date")} {show.get("hour")}:00'),
            # The sources live under tvShowPage.tvShowVideo, so look for the
            # poster next to them first; the top-level path is kept as fallback.
            # NOTE(review): confirm against a live page which path is populated
            'thumbnail': traverse_obj(
                json_data, ('tvShowPage', 'tvShowVideo', 'poster'), ('tvShowVideo', 'poster')),
            # Third argument is the field name used in the warning message
            'channel': self._search_regex(
                r'tvShow \| ([A-Z0-9]+)_', show.get('actionFieldText') or '', 'channel', fatal=False),
            'uploader': show.get('presenter'),
            'subtitles': subtitles,
        }
'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._extract_redux_data(url, video_id) + video = traverse_obj(json_data, ('productContent', 'productContent', 'videos', 0)) or {} + formats, subtitles = self._extract_formats_and_subtitles(video.get('sources'), video_id) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('productDetail', 'product', 'name', 'short')) or video_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': video.get('poster'), + 'uploader': traverse_obj(json_data, ('productDetail', 'product', 'brand', 'brandName')), + } diff --git a/yt_dlp/extractor/ichinanalive.py b/yt_dlp/extractor/ichinanalive.py new file mode 100644 index 0000000000..cb39f821c6 --- /dev/null +++ b/yt_dlp/extractor/ichinanalive.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate +from ..compat import compat_str + + +class IchinanaLiveIE(InfoExtractor): + IE_NAME = '17live' + _VALID_URL = r'https?://(?:www\.)?17\.live/(?:[^/]+/)*(?:live|profile/r)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://17.live/live/3773096', + 'info_dict': { + 'id': '3773096', + 'title': '萠珈☕🤡🍫moka', + 'is_live': True, + 'uploader': '萠珈☕🤡🍫moka', + 'uploader_id': '3773096', + 'like_count': 366, + 'view_count': 18121, + 'timestamp': 1630569012, + }, + 'skip': 'running as of writing, but may be ended as of testing', + }, { + 'note': 'nothing except language differs', + 'url': 'https://17.live/ja/live/3773096', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return not IchinanaLiveClipIE.suitable(url) and super(IchinanaLiveIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + url = 'https://17.live/live/%s' % video_id + + enter = self._download_json( + 'https://api-dsa.17app.co/api/v1/lives/%s/enter' 
class IchinanaLiveClipIE(InfoExtractor):
    """Extractor for clips on 17.live (``/profile/r/<uploader>/clip/<id>``)."""

    IE_NAME = '17live:clip'
    _VALID_URL = r'https?://(?:www\.)?17\.live/(?:[^/]+/)*profile/r/(?P<uploader_id>\d+)/clip/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'https://17.live/profile/r/1789280/clip/1bHQSK8KUieruFXaCH4A4upCzlN',
        'info_dict': {
            'id': '1bHQSK8KUieruFXaCH4A4upCzlN',
            'title': 'マチコ先生🦋Class💋',
            'description': 'マチ戦隊 第一次 バスターコール\n総額200万coin!\n動画制作@うぉーかー🌱Walker🎫',
            'uploader_id': '1789280',
        },
    }, {
        'url': 'https://17.live/ja/profile/r/1789280/clip/1bHQSK8KUieruFXaCH4A4upCzlN',
        'only_matching': True,
    }]

    # (format_id, key in the API response, quality)
    _FORMAT_SOURCES = (
        ('video', 'videoURL', -1),
        ('transcode', 'transcodeURL', -1),
        # highest quality
        ('srcVideo', 'srcVideoURL', 1),
    )

    def _real_extract(self, url):
        """Return the info dict for the clip addressed by *url*.

        The URL is first rebuilt without any language prefix and used as the
        Referer for both the API request and the media downloads.
        """
        uploader_id, video_id = self._match_valid_url(url).groups()
        url = 'https://17.live/profile/r/%s/clip/%s' % (uploader_id, video_id)

        view_data = self._download_json(
            'https://api-dsa.17app.co/api/v1/clips/%s' % video_id, video_id,
            headers={'Referer': url})

        uploader = traverse_obj(
            view_data, ('userInfo', 'displayName'), ('userInfo', 'name'))

        formats = []
        for format_id, source_key, quality in self._FORMAT_SOURCES:
            format_url = view_data.get(source_key)
            if not format_url:
                continue
            formats.append({
                # Format dicts are identified by 'format_id'; 'id' is not a format field
                'format_id': format_id,
                'url': format_url,
                'quality': quality,
                'ext': 'mp4',
                'protocol': 'https',
                'vcodec': 'h264',
                'acodec': 'aac',
                'http_headers': {'Referer': url},
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': uploader or video_id,
            'formats': formats,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'like_count': view_data.get('likeCount'),
            'view_count': view_data.get('viewCount'),
            'thumbnail': view_data.get('imageURL'),
            'duration': view_data.get('duration'),
            'description': view_data.get('caption'),
            'upload_date': unified_strdate(str_or_none(view_data.get('createdAt'))),
        }
class InstagramBaseIE(InfoExtractor):
    """Shared login and node-parsing helpers for the Instagram extractors."""

    _NETRC_MACHINE = 'instagram'
    # Stored on the class so that a successful login is remembered by every
    # extractor deriving from this base
    _IS_LOGGED_IN = False

    def _login(self):
        """Log in with the configured credentials, if any.

        Does nothing when no username is available or a login has already
        succeeded. Raises ExtractorError when the login page lacks the
        required tokens or when the server does not report the session as
        authenticated.
        """
        username, password = self._get_login_info()
        if username is None or self._IS_LOGGED_IN:
            return

        login_webpage = self._download_webpage(
            'https://www.instagram.com/accounts/login/', None,
            note='Downloading login webpage', errnote='Failed to download login webpage')

        shared_data = self._parse_json(
            self._search_regex(
                r'window\._sharedData\s*=\s*({.+?});',
                login_webpage, 'shared data', default='{}'),
            None)

        # The regex above falls back to '{}', so the tokens may be absent;
        # fail with a readable error instead of a bare KeyError
        csrf_token = traverse_obj(shared_data, ('config', 'csrf_token'))
        rollout_hash = shared_data.get('rollout_hash')
        if not csrf_token or not rollout_hash:
            raise ExtractorError('Unable to login: could not find the login tokens in the login webpage')

        login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
            'Accept': '*/*',
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': '0',
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': csrf_token,
            'X-Instagram-AJAX': rollout_hash,
            'Referer': 'https://www.instagram.com/',
        }, data=urlencode_postdata({
            'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
            'username': username,
            'queryParams': '{}',
            'optIntoOneTap': 'false',
            'stopDeletionNonce': '',
            'trustedDeviceRecords': '{}',
        }))

        if not login.get('authenticated'):
            if login.get('message'):
                raise ExtractorError(f'Unable to login: {login["message"]}')
            elif login.get('user'):
                raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True)
            elif login.get('user') is False:
                raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True)
            raise ExtractorError('Unable to login')
        InstagramBaseIE._IS_LOGGED_IN = True

    def _real_initialize(self):
        """Attempt the login before any extraction takes place."""
        self._login()

    def _get_count(self, media, kind, *keys):
        """Return the first available count for *kind* from *media*.

        ``media[kind]['count']`` is tried first, then
        ``media['edge_media_<key>']['count']`` for each of *keys* in order.
        """
        return traverse_obj(
            media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
            expected_type=int_or_none)

    def _get_dimension(self, name, media, webpage=None):
        """Return ``media['dimensions'][name]`` or the matching og:video meta tag value."""
        return (
            traverse_obj(media, ('dimensions', name), expected_type=int_or_none)
            or int_or_none(self._html_search_meta(
                (f'og:video:{name}', f'video:{name}'), webpage or '', default=None)))

    def _extract_nodes(self, nodes, is_direct=False):
        """Yield one entry per video node in *nodes*.

        With *is_direct* the entry carries the node's own ``video_url``;
        otherwise it is a ``url`` result pointing at the post page, and nodes
        without a shortcode are skipped. Non-video nodes are always skipped.
        """
        for idx, node in enumerate(nodes, start=1):
            if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
                continue

            video_id = node.get('shortcode')

            if is_direct:
                info = {
                    'id': video_id or node['id'],
                    'url': node.get('video_url'),
                    'width': self._get_dimension('width', node),
                    'height': self._get_dimension('height', node),
                    'http_headers': {
                        'Referer': 'https://www.instagram.com/',
                    }
                }
            elif not video_id:
                continue
            else:
                info = {
                    '_type': 'url',
                    'ie_key': 'Instagram',
                    'id': video_id,
                    'url': f'https://instagram.com/p/{video_id}',
                }

            yield {
                **info,
                'title': node.get('title') or (f'Video {idx}' if is_direct else None),
                'description': traverse_obj(
                    node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str),
                'thumbnail': traverse_obj(
                    node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none),
                'duration': float_or_none(node.get('video_duration')),
                'timestamp': int_or_none(node.get('taken_at_timestamp')),
                'view_count': int_or_none(node.get('video_view_count')),
                'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
                'like_count': self._get_count(node, 'likes', 'preview_like'),
            }
def _real_extract(self, url):
    """Return the info dict (or a playlist for multi-video posts) for *url*.

    When the request ends up on the login page, the embed page is downloaded
    instead. Media data is looked up in ``window._sharedData`` first and in
    ``window.__additionalDataLoaded`` second; values missing from it are
    filled in from the webpage where a fallback exists.
    """
    video_id, url = self._match_valid_url(url).group('id', 'url')
    webpage, urlh = self._download_webpage_handle(url, video_id)
    if 'www.instagram.com/accounts/login' in urlh.geturl():
        self.report_warning('Main webpage is locked behind the login page. '
                            'Retrying with embed webpage (Note that some metadata might be missing)')
        webpage = self._download_webpage(
            'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage')

    shared_data = self._parse_json(
        self._search_regex(
            r'window\._sharedData\s*=\s*({.+?});',
            webpage, 'shared data', default='{}'),
        video_id, fatal=False)
    media = traverse_obj(
        shared_data,
        ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'),
        ('entry_data', 'PostPage', 0, 'media'),
        expected_type=dict)

    # _sharedData.entry_data.PostPage is empty when authenticated (see
    # https://github.com/ytdl-org/youtube-dl/pull/22880)
    if not media:
        additional_data = self._parse_json(
            self._search_regex(
                r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
                webpage, 'additional data', default='{}'),
            video_id, fatal=False)
        media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}

    if not media and 'www.instagram.com/accounts/login' in urlh.geturl():
        self.raise_login_required('You need to log in to access this content')

    uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex(
        r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False)

    description = (
        traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str)
        or media.get('caption'))
    if not description:
        description = self._search_regex(
            r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
        if description is not None:
            description = lowercase_escape(description)

    video_url = media.get('video_url')
    if not video_url:
        # A post without its own video_url may be a sidecar holding several videos
        nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or []
        if nodes:
            return self.playlist_result(
                self._extract_nodes(nodes, True), video_id,
                'Post by %s' % uploader_id if uploader_id else None, description)

        video_url = self._og_search_video_url(webpage, secure=False)

    formats = [{
        'url': video_url,
        'width': self._get_dimension('width', media, webpage),
        'height': self._get_dimension('height', media, webpage),
    }]
    dash = traverse_obj(media, ('dash_info', 'video_dash_manifest'))
    if dash:
        formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash'))
    self._sort_formats(formats)

    comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))
    comments = [{
        'author': traverse_obj(comment_dict, ('node', 'owner', 'username')),
        'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')),
        'id': traverse_obj(comment_dict, ('node', 'id')),
        'text': traverse_obj(comment_dict, ('node', 'text')),
        'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none),
    } for comment_dict in comment_data] if comment_data else None

    # Only keep candidates that actually carry a URL; a list of empty
    # placeholders would be truthy and make the og:image fallback unreachable
    display_resources = media.get('display_resources') or [
        {'src': media[key]} for key in ('display_src', 'display_url') if media.get(key)]
    if not display_resources:
        display_resources = [{'src': self._og_search_thumbnail(webpage)}]
    thumbnails = [{
        'url': thumbnail['src'],
        'width': thumbnail.get('config_width'),
        'height': thumbnail.get('config_height'),
    } for thumbnail in display_resources if thumbnail.get('src')]

    return {
        'id': video_id,
        'formats': formats,
        'title': media.get('title') or 'Video by %s' % uploader_id,
        'description': description,
        'duration': float_or_none(media.get('video_duration')),
        'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none),
        'uploader_id': uploader_id,
        'uploader': traverse_obj(media, ('owner', 'full_name')),
        'like_count': self._get_count(media, 'likes', 'preview_like') or str_to_int(self._search_regex(
            r'data-log-event="likeCountClick"[^>]*>[^\d]*([\d,\.]+)', webpage, 'like count', fatal=False)),
        'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'),
        'comments': comments,
        'thumbnails': thumbnails,
        'http_headers': {
            'Referer': 'https://www.instagram.com/',
        }
    }
- def get_count(suffix): - return int_or_none(try_get( - node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' @@ -350,55 +454,14 @@ def get_count(suffix): continue raise - edges = media.get('edges') - if not edges or not isinstance(edges, list): + nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or [] + if not nodes: break + yield from self._extract_nodes(nodes) - for edge in edges: - node = edge.get('node') - if not node or not isinstance(node, dict): - continue - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info - - page_info = media.get('page_info') - if not page_info or not isinstance(page_info, dict): - break - - has_next_page = page_info.get('has_next_page') - if not has_next_page: - break - - cursor = page_info.get('end_cursor') - if not cursor or not isinstance(cursor, compat_str): + has_next_page = traverse_obj(media, ('page_info', 'has_next_page')) + cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str) + if not has_next_page or not cursor: break def 
_real_extract(self, url): @@ -412,11 +475,11 @@ def _real_extract(self, url): self._extract_graphql(data, url), user_or_tag, user_or_tag) -class InstagramUserIE(InstagramPlaylistIE): +class InstagramUserIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/porsche', 'info_dict': { 'id': 'porsche', @@ -428,7 +491,7 @@ class InstagramUserIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 5, } - } + }] _QUERY_HASH = '42323d64886122307be10013ad2dcc44', @@ -446,11 +509,11 @@ def _query_vars_for(data): } -class InstagramTagIE(InstagramPlaylistIE): +class InstagramTagIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' - IE_DESC = 'Instagram hashtag search' + IE_DESC = 'Instagram hashtag search URLs' IE_NAME = 'instagram:tag' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/explore/tags/lolcats', 'info_dict': { 'id': 'lolcats', @@ -462,7 +525,7 @@ class InstagramTagIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 50, } - } + }] _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', @@ -479,3 +542,77 @@ def _query_vars_for(data): 'tag_name': data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] } + + +class InstagramStoryIE(InstagramBaseIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)' + IE_NAME = 'instagram:story' + + _TESTS = [{ + 'url': 'https://www.instagram.com/stories/highlights/18090946048123978/', + 'info_dict': { + 'id': '18090946048123978', + 'title': 'Rare', + }, + 'playlist_mincount': 50 + }] + + def _real_extract(self, url): + username, story_id = self._match_valid_url(url).groups() + + story_info_url = f'{username}/{story_id}/?__a=1' if username == 'highlights' else f'{username}/?__a=1' + story_info = 
self._download_json(f'https://www.instagram.com/stories/{story_info_url}', story_id, headers={ + 'X-IG-App-ID': 936619743392459, + 'X-ASBD-ID': 198387, + 'X-IG-WWW-Claim': 0, + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }) + user_id = story_info['user']['id'] + highlight_title = traverse_obj(story_info, ('highlight', 'title')) + + story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' + videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={ + 'X-IG-App-ID': 936619743392459, + 'X-ASBD-ID': 198387, + 'X-IG-WWW-Claim': 0, + })['reels'] + entites = [] + + videos = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) + for video_info in videos: + formats = [] + if isinstance(video_info, list): + video_info = video_info[0] + vcodec = video_info.get('video_codec') + dash_manifest_raw = video_info.get('video_dash_manifest') + videos_list = video_info.get('video_versions') + if not (dash_manifest_raw or videos_list): + continue + for format in videos_list: + formats.append({ + 'url': format.get('url'), + 'width': format.get('width'), + 'height': format.get('height'), + 'vcodec': vcodec, + }) + if dash_manifest_raw: + formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, story_id), mpd_id='dash')) + self._sort_formats(formats) + thumbnails = [{ + 'url': thumbnail.get('url'), + 'width': thumbnail.get('width'), + 'height': thumbnail.get('height') + } for thumbnail in traverse_obj(video_info, ('image_versions2', 'candidates')) or []] + entites.append({ + 'id': video_info.get('id'), + 'title': f'Story by {username}', + 'timestamp': int_or_none(video_info.get('taken_at')), + 'uploader': traverse_obj(videos, ('user', 'full_name')), + 'duration': float_or_none(video_info.get('video_duration')), + 'uploader_id': user_id, + 'thumbnails': thumbnails, + 'formats': formats, + }) + + return self.playlist_result(entites, 
playlist_id=story_id, playlist_title=highlight_title) diff --git a/yt_dlp/extractor/internazionale.py b/yt_dlp/extractor/internazionale.py index 676e8e269c..45e2af6905 100644 --- a/yt_dlp/extractor/internazionale.py +++ b/yt_dlp/extractor/internazionale.py @@ -20,9 +20,6 @@ class InternazionaleIE(InfoExtractor): 'upload_date': '20150219', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi', 'md5': '9db8663704cab73eb972d1cee0082c79', @@ -36,9 +33,6 @@ class InternazionaleIE(InfoExtractor): 'upload_date': '20180829', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 28e6609723..347fec1d53 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -8,12 +8,19 @@ from ..utils import ( determine_ext, js_to_json, + urlencode_postdata, + ExtractorError, + parse_qs ) class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False + _NETRC_MACHINE = 'iprima' + _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login' + _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token' + access_token = None _TESTS = [{ 'url': 'https://prima.iprima.cz/particka/92-epizoda', @@ -22,16 +29,8 @@ class IPrimaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Partička (92)', 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'https://cnn.iprima.cz/videa/70-epizoda', - 'info_dict': { - 'id': 'p681554', - 'ext': 'mp4', - 'title': 'HLAVNÍ ZPRÁVY 3.5.2020', + 'upload_date': '20201103', + 'timestamp': 1604437480, }, 'params': { 'skip_download': True, # m3u8 download @@ -44,11 +43,9 @@ class 
IPrimaIE(InfoExtractor): 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', 'only_matching': True, }, { - # iframe api.play-backend.iprima.cz 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', 'only_matching': True, }, { - # iframe prima.iprima.cz 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', 'only_matching': True, }, { @@ -66,9 +63,127 @@ class IPrimaIE(InfoExtractor): }, { 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', 'only_matching': True, - }, { - 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', - 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + + if username is None or password is None: + self.raise_login_required('Login is required to access any iPrima content', method='password') + + login_page = self._download_webpage( + self._LOGIN_URL, None, note='Downloading login page', + errnote='Downloading login page failed') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + '_email': username, + '_password': password}) + + _, login_handle = self._download_webpage_handle( + self._LOGIN_URL, None, data=urlencode_postdata(login_form), + note='Logging in') + + code = parse_qs(login_handle.geturl()).get('code')[0] + if not code: + raise ExtractorError('Login failed', expected=True) + + token_request_data = { + 'scope': 'openid+email+profile+phone+address+offline_access', + 'client_id': 'prima_sso', + 'grant_type': 'authorization_code', + 'code': code, + 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'} + + token_data = self._download_json( + self._TOKEN_URL, None, + note='Downloading token', errnote='Downloading token failed', + data=urlencode_postdata(token_request_data)) + + self.access_token = token_data.get('access_token') + if self.access_token is None: + raise ExtractorError('Getting token failed', expected=True) + + def _raise_access_error(self, error_code): + if error_code 
== 'PLAY_GEOIP_DENIED': + self.raise_geo_restricted(countries=['CZ'], metadata_available=True) + elif error_code is not None: + self.raise_no_formats('Access to stream infos forbidden', expected=True) + + def _real_initialize(self): + if not self.access_token: + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title'], + webpage, 'title', default=None) + + video_id = self._search_regex(( + r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', + r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'), + webpage, 'real id', group='id') + + metadata = self._download_json( + f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play', + video_id, note='Getting manifest URLs', errnote='Failed to get manifest URLs', + headers={'X-OTT-Access-Token': self.access_token}, + expected_status=403) + + self._raise_access_error(metadata.get('errorCode')) + + stream_infos = metadata.get('streamInfos') + formats = [] + if stream_infos is None: + self.raise_no_formats('Reading stream infos failed', expected=True) + else: + for manifest in stream_infos: + manifest_type = manifest.get('type') + manifest_url = manifest.get('url') + ext = determine_ext(manifest_url) + if manifest_type == 'HLS' or ext == 'm3u8': + formats += self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif manifest_type == 'DASH' or ext == 'mpd': + formats += self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False) + self._sort_formats(formats) + + final_result = self._search_json_ld(webpage, video_id) or {} + final_result.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._html_search_meta( + ['thumbnail', 'og:image', 'twitter:image'], + webpage, 'thumbnail', default=None), + 'formats': formats, + 'description': self._html_search_meta( + ['description', 'og:description', 
'twitter:description'], + webpage, 'description', default=None)}) + + return final_result + + +class IPrimaCNNIE(InfoExtractor): + _VALID_URL = r'https?://cnn\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://cnn.iprima.cz/porady/strunc/24072020-koronaviru-mam-plne-zuby-strasit-druhou-vlnou-je-absurdni-rika-senatorka-dernerova', + 'info_dict': { + 'id': 'p716177', + 'ext': 'mp4', + 'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e', + }, + 'params': { + 'skip_download': 'm3u8' + } }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py index 4122ac880c..bdd6af6884 100644 --- a/yt_dlp/extractor/itv.py +++ b/yt_dlp/extractor/itv.py @@ -5,10 +5,14 @@ from .common import InfoExtractor from .brightcove import BrightcoveNewIE + +from ..compat import compat_str from ..utils import ( + base_url, clean_html, determine_ext, extract_attributes, + ExtractorError, get_element_by_class, JSON_LD_RE, merge_dicts, @@ -16,6 +20,8 @@ smuggle_url, try_get, url_or_none, + url_basename, + urljoin, ) @@ -23,15 +29,32 @@ class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' _GEO_COUNTRIES = ['GB'] _TESTS = [{ - 'url': 'https://www.itv.com/hub/liar/2a4547a0012', + 'url': 'https://www.itv.com/hub/plebs/2a1873a0002', 'info_dict': { - 'id': '2a4547a0012', + 'id': '2a1873a0002', 'ext': 'mp4', - 'title': 'Liar - Series 2 - Episode 6', - 'description': 'md5:d0f91536569dec79ea184f0a44cca089', - 'series': 'Liar', - 'season_number': 2, - 'episode_number': 6, + 'title': 'Plebs - The Orgy', + 'description': 'md5:4d7159af53ebd5b36e8b3ec82a41fdb4', + 'series': 'Plebs', + 'season_number': 1, + 'episode_number': 1, + 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.itv.com/hub/the-jonathan-ross-show/2a1166a0209', + 'info_dict': { + 'id': '2a1166a0209', + 'ext': 
'mp4', + 'title': 'The Jonathan Ross Show - Series 17 - Episode 8', + 'description': 'md5:3023dcdd375db1bc9967186cdb3f1399', + 'series': 'The Jonathan Ross Show', + 'episode_number': 8, + 'season_number': 17, + 'thumbnail': r're:https?://hubimages\.itv\.com/episode/2_1873_0002' }, 'params': { # m3u8 download @@ -51,22 +74,16 @@ class ITVIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - params = extract_attributes(self._search_regex( - r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) - - ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] - hmac = params['data-video-hmac'] - headers = self.geo_verification_headers() - headers.update({ + def _generate_api_headers(self, hmac): + return merge_dicts({ 'Accept': 'application/vnd.itv.vod.playlist.v2+json', 'Content-Type': 'application/json', 'hmac': hmac.upper(), - }) - ios_playlist = self._download_json( - ios_playlist_url, video_id, data=json.dumps({ + }, self.geo_verification_headers()) + + def _call_api(self, video_id, playlist_url, headers, platform_tag, featureset, fatal=True): + return self._download_json( + playlist_url, video_id, data=json.dumps({ 'user': { 'itvUserId': '', 'entitlements': [], @@ -87,15 +104,61 @@ def _real_extract(self, url): }, 'variantAvailability': { 'featureset': { - 'min': ['hls', 'aes', 'outband-webvtt'], - 'max': ['hls', 'aes', 'outband-webvtt'] + 'min': featureset, + 'max': featureset }, - 'platformTag': 'dotcom' + 'platformTag': platform_tag } - }).encode(), headers=headers) - video_data = ios_playlist['Playlist']['Video'] - ios_base_url = video_data.get('Base') + }).encode(), headers=headers, fatal=fatal) + def _get_subtitles(self, video_id, variants, ios_playlist_url, headers, *args, **kwargs): + subtitles = {} + # Prefer last matching featureset + # See: https://github.com/yt-dlp/yt-dlp/issues/986 + platform_tag_subs, featureset_subs = next( 
+ ((platform_tag, featureset) + for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets + if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'), + (None, None)) + + if platform_tag_subs and featureset_subs: + subs_playlist = self._call_api( + video_id, ios_playlist_url, headers, platform_tag_subs, featureset_subs, fatal=False) + subs = try_get(subs_playlist, lambda x: x['Playlist']['Video']['Subtitles'], list) or [] + for sub in subs: + if not isinstance(sub, dict): + continue + href = url_or_none(sub.get('Href')) + if not href: + continue + subtitles.setdefault('en', []).append({'url': href}) + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + params = extract_attributes(self._search_regex( + r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) + variants = self._parse_json( + try_get(params, lambda x: x['data-video-variants'], compat_str) or '{}', + video_id, fatal=False) + # Prefer last matching featureset + # See: https://github.com/yt-dlp/yt-dlp/issues/986 + platform_tag_video, featureset_video = next( + ((platform_tag, featureset) + for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets + if set(try_get(featureset, lambda x: x[:2]) or []) == {'aes', 'hls'}), + (None, None)) + if not platform_tag_video or not featureset_video: + raise ExtractorError('No downloads available', expected=True, video_id=video_id) + + ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] + headers = self._generate_api_headers(params['data-video-hmac']) + ios_playlist = self._call_api( + video_id, ios_playlist_url, headers, platform_tag_video, featureset_video) + + video_data = try_get(ios_playlist, lambda x: x['Playlist']['Video'], dict) or {} + ios_base_url = video_data.get('Base') formats = [] for media_file in (video_data.get('MediaFiles') or []): href = media_file.get('Href') @@ 
-113,20 +176,6 @@ def _real_extract(self, url): 'url': href, }) self._sort_formats(formats) - - subtitles = {} - subs = video_data.get('Subtitles') or [] - for sub in subs: - if not isinstance(sub, dict): - continue - href = url_or_none(sub.get('Href')) - if not href: - continue - subtitles.setdefault('en', []).append({ - 'url': href, - 'ext': determine_ext(href, 'vtt'), - }) - info = self._search_json_ld(webpage, video_id, default={}) if not info: json_ld = self._parse_json(self._search_regex( @@ -140,27 +189,54 @@ def _real_extract(self, url): info = self._json_ld(item, video_id, fatal=False) or {} break + thumbnails = [] + thumbnail_url = try_get(params, lambda x: x['data-video-posterframe'], compat_str) + if thumbnail_url: + thumbnails.extend([{ + 'url': thumbnail_url.format(width=1920, height=1080, quality=100, blur=0, bg='false'), + 'width': 1920, + 'height': 1080, + }, { + 'url': urljoin(base_url(thumbnail_url), url_basename(thumbnail_url)), + 'preference': -2 + }]) + + thumbnail_url = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + }) + self._remove_duplicate_formats(thumbnails) + return merge_dicts({ 'id': video_id, 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), 'formats': formats, - 'subtitles': subtitles, + 'subtitles': self.extract_subtitles(video_id, variants, ios_playlist_url, headers), 'duration': parse_duration(video_data.get('Duration')), 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), + 'thumbnails': thumbnails }, info) class ITVBTCCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:news|btcc)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action', 'info_dict': { 'id': 'btcc-2019-brands-hatch-gp-race-action', 'title': 
'BTCC 2019: Brands Hatch GP race action', }, 'playlist_count': 12, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + }, { + 'url': 'https://www.itv.com/news/2021-10-27/i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike', + 'info_dict': { + 'id': 'i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike', + 'title': 'md5:6ef054dd9f069330db3dcc66cb772d32' + }, + 'playlist_count': 4 + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): playlist_id = self._match_id(url) @@ -171,15 +247,15 @@ def _real_extract(self, url): '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id), lambda x: x['props']['pageProps']['article']['body']['content']) or [] - # Discard empty objects - video_ids = [] + entries = [] for video in json_map: - if video['data'].get('id'): - video_ids.append(video['data']['id']) - - entries = [ - self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { + if not any(video['data'].get(attr) == 'Brightcove' for attr in ('name', 'type')): + continue + video_id = video['data']['id'] + account_id = video['data']['accountId'] + player_id = video['data']['playerId'] + entries.append(self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), { # ITV does not like some GB IP ranges, so here are some # IP blocks it accepts 'geo_ip_blocks': [ @@ -187,8 +263,7 @@ def _real_extract(self, url): ], 'referrer': url, }), - ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in video_ids] + ie=BrightcoveNewIE.ie_key(), video_id=video_id)) title = self._og_search_title(webpage, fatal=False) diff --git a/yt_dlp/extractor/ivi.py b/yt_dlp/extractor/ivi.py index 5e1d89c9b3..5f8a046e08 100644 --- a/yt_dlp/extractor/ivi.py +++ b/yt_dlp/extractor/ivi.py @@ -3,7 +3,6 @@ import json 
import re -import sys from .common import InfoExtractor from ..utils import ( @@ -94,20 +93,21 @@ def _real_extract(self, url): ] }) - bundled = hasattr(sys, 'frozen') - for site in (353, 183): content_data = (data % site).encode() if site == 353: - if bundled: - continue try: from Cryptodome.Cipher import Blowfish from Cryptodome.Hash import CMAC - pycryptodomex_found = True + pycryptodome_found = True except ImportError: - pycryptodomex_found = False - continue + try: + from Crypto.Cipher import Blowfish + from Crypto.Hash import CMAC + pycryptodome_found = True + except ImportError: + pycryptodome_found = False + continue timestamp = (self._download_json( self._LIGHT_URL, video_id, @@ -140,14 +140,8 @@ def _real_extract(self, url): extractor_msg = 'Video %s does not exist' elif site == 353: continue - elif bundled: - raise ExtractorError( - 'This feature does not work from bundled exe. Run yt-dlp from sources.', - expected=True) - elif not pycryptodomex_found: - raise ExtractorError( - 'pycryptodomex not found. Please install', - expected=True) + elif not pycryptodome_found: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True) elif message: extractor_msg += ': ' + message raise ExtractorError(extractor_msg % video_id, expected=True) diff --git a/yt_dlp/extractor/ivideon.py b/yt_dlp/extractor/ivideon.py index 01e7b22d4c..44b2208468 100644 --- a/yt_dlp/extractor/ivideon.py +++ b/yt_dlp/extractor/ivideon.py @@ -75,7 +75,7 @@ def _real_extract(self, url): return { 'id': server_id, - 'title': self._live_title(camera_name or server_id), + 'title': camera_name or server_id, 'description': description, 'is_live': True, 'formats': formats, diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index 907d5fc8bb..254d986923 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse @@ -8,6 +9,8 @@ mimetype2ext, remove_end, url_or_none, + unified_strdate, + strip_or_none, ) @@ -21,6 +24,10 @@ class IwaraIE(InfoExtractor): 'ext': 'mp4', 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, + 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png', + 'uploader': 'Reimu丨Action', + 'upload_date': '20150828', + 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f', }, }, { 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', @@ -72,6 +79,19 @@ def _real_extract(self, url): title = remove_end(self._html_search_regex( r'<title>([^<]+)', webpage, 'title'), ' | Iwara') + thumbnail = self._html_search_regex( + r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) + + uploader = self._html_search_regex( + r'class="username">([^<]+)', webpage, 'uploader', fatal=False) + + upload_date = unified_strdate(self._html_search_regex( + r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False)) + + description = strip_or_none(self._search_regex( + r'

    (.+?(?=[^/&#$?]+)' + _TESTS = [{ # Test for video in the comments + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/946c4189-bc2d-4524-b95b-43f641e2adde', + 'info_dict': { + 'id': '946c4189-bc2d-4524-b95b-43f641e2adde', + 'ext': 'mp4', + 'title': 'test for video in comment', + 'description': 'md5:daa77dc214add4da8b6ea7d2226776e7', + 'timestamp': 1632215195, + 'uploader_id': 'ytdlpTestAccount', + 'uploader': 'yt-dlpTestAccount', + 'duration': 7000, + 'upload_date': '20210921' + }, + 'params': {'skip_download': True} + }, { # Test for koo with long title + 'url': 'https://www.kooapp.com/koo/laxman_kumarDBFEC/33decbf7-5e1e-4bb8-bfd7-04744a064361', + 'info_dict': { + 'id': '33decbf7-5e1e-4bb8-bfd7-04744a064361', + 'ext': 'mp4', + 'title': 'md5:47a71c2337295330c5a19a8af1bbf450', + 'description': 'md5:06a6a84e9321499486dab541693d8425', + 'timestamp': 1632106884, + 'uploader_id': 'laxman_kumarDBFEC', + 'uploader': 'Laxman Kumar 🇮🇳', + 'duration': 46000, + 'upload_date': '20210920' + }, + 'params': {'skip_download': True} + }, { # Test for audio + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a2a9c88e-ce4b-4d2d-952f-d06361c5b602', + 'info_dict': { + 'id': 'a2a9c88e-ce4b-4d2d-952f-d06361c5b602', + 'ext': 'mp4', + 'title': 'Test for audio', + 'description': 'md5:ecb9a2b6a5d34b736cecb53788cb11e8', + 'timestamp': 1632211634, + 'uploader_id': 'ytdlpTestAccount', + 'uploader': 'yt-dlpTestAccount', + 'duration': 214000, + 'upload_date': '20210921' + }, + 'params': {'skip_download': True} + }, { # Test for video + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1', + 'info_dict': { + 'id': 'a3e56c53-c1ed-4ac9-ac02-ed1630e6b1d1', + 'ext': 'mp4', + 'title': 'Test for video', + 'description': 'md5:7afc4eb839074ddeb2beea5dd6fe9500', + 'timestamp': 1632211468, + 'uploader_id': 'ytdlpTestAccount', + 'uploader': 'yt-dlpTestAccount', + 'duration': 14000, + 'upload_date': '20210921' + }, + 'params': {'skip_download': True} + }, { # Test 
for link + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/01bf5b94-81a5-4d8e-a387-5f732022e15a', + 'skip': 'No video/audio found at the provided url.', + 'info_dict': { + 'id': '01bf5b94-81a5-4d8e-a387-5f732022e15a', + 'title': 'Test for link', + 'ext': 'none', + }, + }, { # Test for images + 'url': 'https://www.kooapp.com/koo/ytdlpTestAccount/dc05d9cd-a61d-45fd-bb07-e8019d8ca8cb', + 'skip': 'No video/audio found at the provided url.', + 'info_dict': { + 'id': 'dc05d9cd-a61d-45fd-bb07-e8019d8ca8cb', + 'title': 'Test for images', + 'ext': 'none', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://www.kooapp.com/apiV1/ku/{id}?limit=20&offset=0&showSimilarKoos=true', id)['parentContent'] + item_json = next(content['items'][0] for content in data_json + if try_get(content, lambda x: x['items'][0]['id']) == id) + media_json = item_json['mediaMap'] + formats = [] + + mp4_url = media_json.get('videoMp4') + video_m3u8_url = media_json.get('videoHls') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'ext': 'mp4', + }) + if video_m3u8_url: + formats.extend(self._extract_m3u8_formats(video_m3u8_url, id, fatal=False, ext='mp4')) + if not formats: + self.raise_no_formats('No video/audio found at the provided url.', expected=True) + + self._sort_formats(formats) + return { + 'id': id, + 'title': clean_html(item_json.get('title')), + 'description': f'{clean_html(item_json.get("title"))}\n\n{clean_html(item_json.get("enTransliteration"))}', + 'timestamp': item_json.get('createdAt'), + 'uploader_id': item_json.get('handle'), + 'uploader': item_json.get('name'), + 'duration': media_json.get('duration'), + 'formats': formats, + } diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py index 363fbd6a51..de985e4508 100644 --- a/yt_dlp/extractor/la7.py +++ b/yt_dlp/extractor/la7.py @@ -7,8 +7,9 @@ from ..utils import ( determine_ext, float_or_none, + HEADRequest, + int_or_none, parse_duration, - smuggle_url, 
unified_strdate, ) @@ -25,19 +26,38 @@ class LA7IE(InfoExtractor): 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': '0_42j6wd36', + 'id': 'inccool8-02-10-2015-163722', 'ext': 'mp4', 'title': 'Inc.Cool8', 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', 'thumbnail': 're:^https?://.*', - 'uploader_id': 'kdla7pillole@iltrovatore.it', - 'timestamp': 1443814869, 'upload_date': '20151002', }, }, { 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', 'only_matching': True, }] + _HOST = 'https://awsvodpkg.iltrovatore.it' + + def _generate_mp4_url(self, quality, m3u8_formats): + for f in m3u8_formats: + if f['vcodec'] != 'none' and quality in f['url']: + http_url = '%s%s.mp4' % (self._HOST, quality) + + urlh = self._request_webpage( + HEADRequest(http_url), quality, + note='Check filesize', fatal=False) + if urlh: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'https-'), + 'url': http_url, + 'protocol': 'https', + 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)), + }) + return http_f + return None def _real_extract(self, url): video_id = self._match_id(url) @@ -46,22 +66,30 @@ def _real_extract(self, url): url = '%s//%s' % (self.http_scheme(), url) webpage = self._download_webpage(url, video_id) + video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path') - player_data = self._search_regex( - [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], - webpage, 'player data') - vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid') + formats = self._extract_mpd_formats( + f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd', + video_id, mpd_id='dash', fatal=False) + m3u8_formats = self._extract_m3u8_formats( + 
f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + + for q in filter(None, video_path.split(',')): + http_f = self._generate_mp4_url(q, m3u8_formats) + if http_f: + formats.append(http_f) + + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': smuggle_url('kaltura:103:%s' % vid, { - 'service_url': 'http://nkdam.iltrovatore.it', - }), 'id': video_id, 'title': self._og_search_title(webpage, default=None), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'ie_key': 'Kaltura', + 'formats': formats, + 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)) } diff --git a/yt_dlp/extractor/laola1tv.py b/yt_dlp/extractor/laola1tv.py index fa217365a3..b5d27c2f07 100644 --- a/yt_dlp/extractor/laola1tv.py +++ b/yt_dlp/extractor/laola1tv.py @@ -112,7 +112,7 @@ def get_flashvar(x, *args, **kwargs): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'upload_date': unified_strdate(_v('time_date')), 'uploader': _v('meta_organisation'), 'categories': categories, @@ -161,7 +161,7 @@ def _extract_video(self, url): return { 'id': video_id, 'display_id': display_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': video_data.get('description'), 'thumbnail': video_data.get('image'), 'categories': categories, diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 4289c51b81..1405ce0c72 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -28,14 +28,19 @@ class LBRYBaseIE(InfoExtractor): _SUPPORTED_STREAM_TYPES = ['video', 'audio'] def _call_api_proxy(self, method, display_id, params, resource): - return self._download_json( + response = self._download_json( 'https://api.lbry.tv/api/v1/proxy', display_id, 
'Downloading %s JSON metadata' % resource, headers={'Content-Type': 'application/json-rpc'}, data=json.dumps({ 'method': method, 'params': params, - }).encode())['result'] + }).encode()) + err = response.get('error') + if err: + raise ExtractorError( + f'{self.IE_NAME} said: {err.get("code")} - {err.get("message")}', expected=True) + return response['result'] def _resolve_url(self, url, display_id, resource): return self._call_api_proxy( @@ -179,28 +184,38 @@ def _real_extract(self, url): display_id = compat_urllib_parse_unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') - result_value = result['value'] - if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES: + if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + claim_id, is_live, headers = result['claim_id'], False, None + streaming_url = self._call_api_proxy( + 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + final_url = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info').geturl() + elif result.get('value_type') == 'stream': + claim_id, is_live = result['signing_channel']['claim_id'], True + headers = {'referer': 'https://player.odysee.live/'} + live_data = self._download_json( + f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id, + note='Downloading livestream JSON metadata')['data'] + if not live_data['live']: + raise ExtractorError('This stream is not live', expected=True) + streaming_url = final_url = live_data['url'] + else: raise ExtractorError('Unsupported URL', expected=True) - claim_id = result['claim_id'] - title = result_value['title'] - streaming_url = self._call_api_proxy( - 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + info = self._parse_stream(result, url) - urlh = self._request_webpage( - streaming_url, display_id, note='Downloading streaming redirect url info') - if determine_ext(urlh.geturl()) == 'm3u8': + 
if determine_ext(final_url) == 'm3u8': info['formats'] = self._extract_m3u8_formats( - urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) self._sort_formats(info['formats']) else: info['url'] = streaming_url - info.update({ + return { + **info, 'id': claim_id, - 'title': title, - }) - return info + 'title': result['value']['title'], + 'is_live': is_live, + 'http_headers': headers, + } class LBRYChannelIE(LBRYBaseIE): diff --git a/yt_dlp/extractor/lego.py b/yt_dlp/extractor/lego.py index b9d8b167c4..901f43bcf3 100644 --- a/yt_dlp/extractor/lego.py +++ b/yt_dlp/extractor/lego.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, qualities, ) @@ -102,12 +103,8 @@ def _real_extract(self, url): m3u8_id=video_source_format, fatal=False)) else: video_source_quality = video_source.get('Quality') - format_id = [] - for v in (video_source_format, video_source_quality): - if v: - format_id.append(v) f = { - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(video_source_format, video_source_quality), 'quality': q(video_source_quality), 'url': video_source_url, } diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py index d4bcae6c1c..e1d5f21e17 100644 --- a/yt_dlp/extractor/line.py +++ b/yt_dlp/extractor/line.py @@ -116,7 +116,7 @@ def _parse_broadcast_item(self, item): return { 'id': broadcast_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'thumbnails': thumbnails, 'timestamp': int_or_none(item.get('createdAt')), 'channel': channel.get('name'), diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index 6d54d638ac..bd76ae1664 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,52 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +from itertools import zip_longest import re from .common import 
InfoExtractor from ..utils import ( + clean_html, + extract_attributes, ExtractorError, float_or_none, + get_element_by_class, int_or_none, + srt_subtitles_timecode, + strip_or_none, + mimetype2ext, + try_get, urlencode_postdata, urljoin, ) -class LinkedInLearningBaseIE(InfoExtractor): +class LinkedInBaseIE(InfoExtractor): _NETRC_MACHINE = 'linkedin' - _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' - - def _call_api(self, course_slug, fields, video_slug=None, resolution=None): - query = { - 'courseSlug': course_slug, - 'fields': fields, - 'q': 'slugs', - } - sub = '' - if video_slug: - query.update({ - 'videoSlug': video_slug, - 'resolution': '_%s' % resolution, - }) - sub = ' %dp' % resolution - api_url = 'https://www.linkedin.com/learning-api/detailedCourses' - return self._download_json( - api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ - 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, - }, query=query)['elements'][0] - - def _get_urn_id(self, video_data): - urn = video_data.get('urn') - if urn: - mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) - if mobj: - return mobj.group(1) - - def _get_video_id(self, video_data, course_slug, video_slug): - return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) + _logged_in = False def _real_initialize(self): + if self._logged_in: + return email, password = self._get_login_info() if email is None: return @@ -69,6 +50,84 @@ def _real_initialize(self): login_submit_page, 'error', default=None) if error: raise ExtractorError(error, expected=True) + LinkedInBaseIE._logged_in = True + + +class LinkedInLearningBaseIE(LinkedInBaseIE): + _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' + + def _call_api(self, course_slug, fields, video_slug=None, resolution=None): + query = { + 'courseSlug': course_slug, + 'fields': fields, + 'q': 'slugs', + } + sub = '' + if video_slug: + query.update({ + 'videoSlug': video_slug, + 'resolution': '_%s' % 
resolution, + }) + sub = ' %dp' % resolution + api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + if not self._get_cookies(api_url).get('JSESSIONID'): + self.raise_login_required() + return self._download_json( + api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, + }, query=query)['elements'][0] + + def _get_urn_id(self, video_data): + urn = video_data.get('urn') + if urn: + mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) + if mobj: + return mobj.group(1) + + def _get_video_id(self, video_data, course_slug, video_slug): + return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) + + +class LinkedInIE(LinkedInBaseIE): + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P\d+)' + _TESTS = [{ + 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', + 'info_dict': { + 'id': '6850898786781339649', + 'ext': 'mp4', + 'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing', + 'description': 'md5:be125430bab1c574f16aeb186a4d5b19', + 'creator': 'Mishal K.' 
+ }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'([^<]+)', webpage, 'title') + description = clean_html(get_element_by_class('share-update-card__update-text', webpage)) + like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) + creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) + + sources = self._parse_json(extract_attributes(self._search_regex(r'(]+>)', webpage, 'video'))['data-sources'], video_id) + formats = [{ + 'url': source['src'], + 'ext': mimetype2ext(source.get('type')), + 'tbr': float_or_none(source.get('data-bitrate'), scale=1000), + } for source in sources] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'like_count': like_count, + 'creator': creator, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': description, + } class LinkedInLearningIE(LinkedInLearningBaseIE): @@ -86,10 +145,19 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): }, } + def json2srt(self, transcript_lines, duration=None): + srt_data = '' + for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])): + start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption'] + end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1 + srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time), + srt_subtitles_timecode(end_time), + caption) + return srt_data + def _real_extract(self, url): course_slug, video_slug = self._match_valid_url(url).groups() - video_data = None formats = [] for width, height in ((640, 360), (960, 540), (1280, 720)): video_data = self._call_api( @@ -101,6 +169,7 @@ def _real_extract(self, url): formats.append({ 'format_id': 'progressive-%dp' % height, 'url': 
progressive_url, + 'ext': 'mp4', 'height': height, 'width': width, 'source_preference': 1, @@ -128,6 +197,14 @@ def _real_extract(self, url): # However, unless someone can confirm this, the old # behaviour is being kept as-is self._sort_formats(formats, ('res', 'source_preference')) + subtitles = {} + duration = int_or_none(video_data.get('durationInSeconds')) + transcript_lines = try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list) + if transcript_lines: + subtitles['en'] = [{ + 'ext': 'srt', + 'data': self.json2srt(transcript_lines, duration) + }] return { 'id': self._get_video_id(video_data, course_slug, video_slug), @@ -135,7 +212,8 @@ def _real_extract(self, url): 'formats': formats, 'thumbnail': video_data.get('defaultThumbnail'), 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), - 'duration': int_or_none(video_data.get('durationInSeconds')), + 'duration': duration, + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py index f591289ecd..45bf26d26c 100644 --- a/yt_dlp/extractor/livestream.py +++ b/yt_dlp/extractor/livestream.py @@ -176,7 +176,7 @@ def _extract_stream_info(self, stream_info): return { 'id': broadcast_id, 'formats': formats, - 'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'], + 'title': stream_info['stream_title'], 'thumbnail': stream_info.get('thumbnail_url'), 'is_live': is_live, } @@ -344,7 +344,7 @@ def _real_extract(self, url): is_live = video_data.get('isLive') info.update({ 'id': content_id, - 'title': self._live_title(info['title']) if is_live else info['title'], + 'title': info['title'], 'formats': self._extract_video_formats(video_data, content_id), 'is_live': is_live, }) diff --git a/yt_dlp/extractor/mangomolo.py b/yt_dlp/extractor/mangomolo.py index acee370e93..68ce138b3e 100644 --- a/yt_dlp/extractor/mangomolo.py +++ b/yt_dlp/extractor/mangomolo.py @@ -33,7 +33,7 @@ def _real_extract(self, url): 
return { 'id': page_id, - 'title': self._live_title(page_id) if self._IS_LIVE else page_id, + 'title': page_id, 'uploader_id': hidden_inputs.get('userid'), 'duration': int_or_none(hidden_inputs.get('duration')), 'is_live': self._IS_LIVE, diff --git a/yt_dlp/extractor/matchtv.py b/yt_dlp/extractor/matchtv.py index bc9933a813..e003b8d259 100644 --- a/yt_dlp/extractor/matchtv.py +++ b/yt_dlp/extractor/matchtv.py @@ -49,7 +49,7 @@ def _real_extract(self, url): self._sort_formats(formats) return { 'id': video_id, - 'title': self._live_title('Матч ТВ - Прямой эфир'), + 'title': 'Матч ТВ - Прямой эфир', 'is_live': True, 'formats': formats, } diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 0bdd626930..3ca174c2b9 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -2,13 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, url_or_none, @@ -148,13 +146,9 @@ def _real_extract(self, url): abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) - format_id = [media_type] - if vbr or abr: - format_id.append(compat_str(vbr or abr)) - f = { 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(media_type, vbr or abr), 'filesize': filesize, 'abr': abr, 'vbr': vbr, diff --git a/yt_dlp/extractor/mediaite.py b/yt_dlp/extractor/mediaite.py new file mode 100644 index 0000000000..b670f0d615 --- /dev/null +++ b/yt_dlp/extractor/mediaite.py @@ -0,0 +1,93 @@ +from __future__ import unicode_literals + + +from .common import InfoExtractor + + +class MediaiteIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mediaite.com(?!/category)(?:/[\w-]+){2}' + _TESTS = [{ + 'url': 
'https://www.mediaite.com/sports/bill-burr-roasts-nfl-for-promoting-black-lives-matter-while-scheduling-more-games-after-all-the-sht-they-know-about-cte/', + 'info_dict': { + 'id': 'vPHKITzy', + 'ext': 'm4a', + 'title': 'Bill Burr On NFL And Black Lives Matter', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/vPHKITzy/poster.jpg?width=720', + 'duration': 55, + 'timestamp': 1631630185, + 'upload_date': '20210914', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/tv/joe-scarborough-goes-off-on-tax-breaks-for-super-wealthy-largest-income-redistribution-scam-in-american-history/', + 'info_dict': { + 'id': 'eeFcK4Xm', + 'ext': 'mp4', + 'title': 'Morning Joe-6_16_52 am - 6_21_10 am-2021-09-14.mp4', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/eeFcK4Xm/poster.jpg?width=720', + 'duration': 258, + 'timestamp': 1631618057, + 'upload_date': '20210914', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/politics/watch-rudy-giuliani-impersonates-queen-elizabeth-calls-mark-milley-an-asshle-in-bizarre-9-11-speech/', + 'info_dict': { + 'id': 'EiyiXKcr', + 'ext': 'mp4', + 'title': 'Giuliani 1', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/EiyiXKcr/poster.jpg?width=720', + 'duration': 39, + 'timestamp': 1631536476, + 'upload_date': '20210913', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/podcasts/clarissa-ward-says-she-decided-to-become-a-journalist-on-9-11/', + 'info_dict': { + 'id': 'TxavoRTx', + 'ext': 'mp4', + 'title': 'clarissa-ward-3.mp4', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/TxavoRTx/poster.jpg?width=720', + 'duration': 83, + 'timestamp': 1631311188, + 'upload_date': '20210910', + }, + 'params': {'skip_download': True} + }, { 
+ 'url': 'https://www.mediaite.com/opinion/mainstream-media-ignores-rose-mcgowans-bombshell-allegation-that-newsoms-wife-tried-to-silence-her-on-weinstein/', + 'info_dict': { + 'id': 'sEIWvKR7', + 'ext': 'mp4', + 'title': 'KTTV_09-13-2021_05.34.21', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/sEIWvKR7/poster.jpg?width=720', + 'duration': 52, + 'timestamp': 1631553328, + 'upload_date': '20210913', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.mediaite.com/news/watch-cnbcs-jim-cramer-says-nobody-wants-to-die-getting-infected-by-unvaccinated-coworker-even-for-22-an-hour/', + 'info_dict': { + 'id': 'nwpt1elX', + 'ext': 'mp4', + 'title': "CNBC's Jim Cramer Says Nobody Wants to Die Getting Infected by Unvaccinated Coworker 'Even for $22 an Hour'.mp4", + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nwpt1elX/poster.jpg?width=720', + 'duration': 60, + 'timestamp': 1633014214, + 'upload_date': '20210930', + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, None) + id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id') + data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id) + return self._parse_jwplayer_data(data_json) diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py new file mode 100644 index 0000000000..18ff3befae --- /dev/null +++ b/yt_dlp/extractor/mediaklikk.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..utils import ( + unified_strdate +) +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_str +) + + +class MediaKlikkIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:mediaklikk|m4sport|hirado|petofilive)\.hu/.*?(?:videok?|cikk)/ + (?:(?P[0-9]{4})/(?P[0-9]{1,2})/(?P[0-9]{1,2})/)? 
+ (?P[^/#?_]+)''' + + _TESTS = [{ + # mediaklikk. date in html. + 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', + 'info_dict': { + 'id': '4754129', + 'title': 'Hazajáró, DÉLNYUGAT-BÁCSKA – A Duna mentén Palánkától Doroszlóig', + 'ext': 'mp4', + 'upload_date': '20210901', + 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg' + } + }, { + # m4sport + 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', + 'info_dict': { + 'id': '4754999', + 'title': 'Gyémánt Liga, Párizs', + 'ext': 'mp4', + 'upload_date': '20210830', + 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg' + } + }, { + # m4sport with *video/ url and no date + 'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/', + 'info_dict': { + 'id': '4492099', + 'title': 'Real Madrid - Chelsea 1-1', + 'ext': 'mp4', + 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png' + } + }, { + # hirado + 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', + 'info_dict': { + 'id': '4760120', + 'title': 'Feltételeket szabott a főváros', + 'ext': 'mp4', + 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg' + } + }, { + # petofilive + 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', + 'info_dict': { + 'id': '4571948', + 'title': 'Tha Shudras az Akusztikban', + 'ext': 'mp4', + 'upload_date': '20210607', + 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg' + } + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = mobj.group('id') + webpage = self._download_webpage(url, display_id) + + player_data_str = self._html_search_regex( + r'mtva_player_manager\.player\(document.getElementById\(.*\),\s?(\{.*\}).*\);', webpage, 'player 
data') + player_data = self._parse_json(player_data_str, display_id, compat_urllib_parse_unquote) + video_id = compat_str(player_data['contentId']) + title = player_data.get('title') or self._og_search_title(webpage, fatal=False) or \ + self._html_search_regex(r']+\bclass="article_title">([^<]+)<', webpage, 'title') + + upload_date = unified_strdate( + '%s-%s-%s' % (mobj.group('year'), mobj.group('month'), mobj.group('day'))) + if not upload_date: + upload_date = unified_strdate(self._html_search_regex( + r']+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None)) + + player_data['video'] = player_data.pop('token') + player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) + playlist_url = self._proto_relative_url(compat_urllib_parse_unquote( + self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/')) + + formats = self._extract_wowza_formats( + playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'display_id': display_id, + 'formats': formats, + 'upload_date': upload_date, + 'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage) + } diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index d8f12dca6b..119b39997a 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -1,13 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .theplatform import ThePlatformBaseIE from ..utils import ( ExtractorError, int_or_none, + OnDemandPagedList, parse_qs, + try_get, + urljoin, update_url_query, ) @@ -44,7 +48,7 @@ class MediasetIE(ThePlatformBaseIE): }, }, { 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', - 'md5': '288532f0ad18307705b01e581304cd7b', + 'md5': 
'1276f966ac423d16ba255ce867de073e', 'info_dict': { 'id': 'F309013801000501', 'ext': 'mp4', @@ -58,6 +62,38 @@ class MediasetIE(ThePlatformBaseIE): 'uploader': 'Canale 5', 'uploader_id': 'C5', }, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801', + 'md5': 'd1650ac9ff944f185556126a736df148', + 'info_dict': { + 'id': 'F303843101017801', + 'ext': 'mp4', + 'title': 'Episodio 69 - Pezzo di luna', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 263.008, + 'upload_date': '20200902', + 'series': 'Camera Café 5', + 'timestamp': 1599064700, + 'uploader': 'Italia 1', + 'uploader_id': 'I1', + }, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601', + 'md5': '567e9ad375b7a27a0e370650f572a1e3', + 'info_dict': { + 'id': 'F303843107000601', + 'ext': 'mp4', + 'title': 'Episodio 51 - Tu chi sei?', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 367.021, + 'upload_date': '20200902', + 'series': 'Camera Café 5', + 'timestamp': 1599069817, + 'uploader': 'Italia 1', + 'uploader_id': 'I1', + }, }, { # clip 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', @@ -132,7 +168,7 @@ def _real_extract(self, url): formats = [] subtitles = {} first_e = None - asset_type = 'HD,browser,geoIT|SD,browser,geoIT|geoNo:HD,browser,geoIT|geoNo:SD,browser,geoIT|geoNo' + asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD' # TODO: fixup ISM+none manifest URLs for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): try: @@ -180,3 +216,81 @@ def _real_extract(self, url): 'subtitles': subtitles, }) return info + + +class MediasetShowIE(MediasetIE): + _VALID_URL = r'''(?x) + (?: + https?:// + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?: + (?:fiction|programmi-tv|serie-tv)/(?:.+?/)? 
+ (?:[a-z]+)_SE(?P\d{12}) + (?:,ST(?P\d{12}))? + (?:,sb(?P\d{9}))?$ + ) + ) + ''' + _TESTS = [{ + # TV Show webpage (with a single playlist) + 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556', + 'info_dict': { + 'id': '000000001556', + 'title': 'Fire Force', + }, + 'playlist_count': 1, + }, { + # TV Show webpage (with multiple playlists) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763', + 'info_dict': { + 'id': '000000002763', + 'title': 'Le Iene', + }, + 'playlist_count': 7, + }, { + # TV Show specific playlist (single page) + 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556,ST000000002738,sb100013107', + 'info_dict': { + 'id': '100013107', + 'title': 'Episodi', + }, + 'playlist_count': 4, + }, { + # TV Show specific playlist (with multiple pages) + 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375', + 'info_dict': { + 'id': '100013375', + 'title': 'I servizi', + }, + 'playlist_count': 53, + }] + + _BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d' + _PAGE_SIZE = 25 + + def _fetch_page(self, sb, page): + lower_limit = page * self._PAGE_SIZE + 1 + upper_limit = lower_limit + self._PAGE_SIZE - 1 + content = self._download_json( + self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb) + for entry in content.get('entries') or []: + yield self.url_result( + 'mediaset:' + entry['guid'], + playlist_title=entry['mediasetprogram$subBrandDescription']) + + def _real_extract(self, url): + playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb') + if not sb: + page = self._download_webpage(url, playlist_id) + entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url)) + for url in 
re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)] + title = (self._html_search_regex(r'(?s)]*>(.+?)

    ', page, 'title', default=None) + or self._og_search_title(page)) + return self.playlist_result(entries, st or playlist_id, title) + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, sb), + self._PAGE_SIZE) + title = try_get(entries, lambda x: x[0]['playlist_title']) + + return self.playlist_result(entries, sb, title) diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py new file mode 100644 index 0000000000..4d5a9df1ff --- /dev/null +++ b/yt_dlp/extractor/microsoftstream.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from base64 import b64decode + +from .common import InfoExtractor +from ..utils import ( + merge_dicts, + parse_iso8601, + parse_duration, + parse_resolution, + try_get, + url_basename, +) + + +class MicrosoftStreamIE(InfoExtractor): + IE_NAME = 'microsoftstream' + IE_DESC = 'Microsoft Stream' + _VALID_URL = r'https?://(?:web|www|msit)\.microsoftstream\.com/video/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://web.microsoftstream.com/video/6e51d928-4f46-4f1c-b141-369925e37b62?list=user&userId=f5491e02-e8fe-4e34-b67c-ec2e79a6ecc0', + 'only_matching': True, + }, { + 'url': 'https://msit.microsoftstream.com/video/b60f5987-aabd-4e1c-a42f-c559d138f2ca', + 'only_matching': True, + }] + + def _get_all_subtitles(self, api_url, video_id, headers): + subtitles = {} + automatic_captions = {} + text_tracks = self._download_json( + f'{api_url}/videos/{video_id}/texttracks', video_id, + note='Downloading subtitles JSON', fatal=False, headers=headers, + query={'api-version': '1.4-private'}).get('value') or [] + for track in text_tracks: + if not track.get('language') or not track.get('url'): + continue + sub_dict = automatic_captions if track.get('autoGenerated') else subtitles + sub_dict.setdefault(track['language'], []).append({ + 'ext': 'vtt', + 'url': track.get('url') + }) + return { + 'subtitles': subtitles, + 
'automatic_captions': automatic_captions + } + + def extract_all_subtitles(self, *args, **kwargs): + if (self.get_param('writesubtitles', False) + or self.get_param('writeautomaticsub', False) + or self.get_param('listsubtitles')): + return self._get_all_subtitles(*args, **kwargs) + return {} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + if 'Microsoft Stream' not in webpage: + self.raise_login_required(method='cookies') + + access_token = self._html_search_regex(r'"AccessToken":"(.+?)"', webpage, 'access token') + api_url = self._html_search_regex(r'"ApiGatewayUri":"(.+?)"', webpage, 'api url') + + headers = {'Authorization': f'Bearer {access_token}'} + + video_data = self._download_json( + f'{api_url}/videos/{video_id}', video_id, + headers=headers, query={ + '$expand': 'creator,tokens,status,liveEvent,extensions', + 'api-version': '1.4-private' + }) + video_id = video_data.get('id') or video_id + language = video_data.get('language') + + thumbnails = [] + for thumbnail_id in ('extraSmall', 'small', 'medium', 'large'): + thumbnail_url = try_get(video_data, lambda x: x['posterImage'][thumbnail_id]['url'], str) + if not thumbnail_url: + continue + thumb = { + 'id': thumbnail_id, + 'url': thumbnail_url, + } + thumb_name = url_basename(thumbnail_url) + thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) + thumb.update(parse_resolution(thumb_name)) + thumbnails.append(thumb) + + formats = [] + for playlist in video_data['playbackUrls']: + if playlist['mimeType'] == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + playlist['playbackUrl'], video_id, + ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False, headers=headers)) + elif playlist['mimeType'] == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + playlist['playbackUrl'], video_id, mpd_id='dash', + fatal=False, headers=headers)) + elif playlist['mimeType'] == 
'application/vnd.ms-sstr+xml': + formats.extend(self._extract_ism_formats( + playlist['playbackUrl'], video_id, ism_id='mss', + fatal=False, headers=headers)) + formats = [merge_dicts(f, {'language': language}) for f in formats] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['name'], + 'description': video_data.get('description'), + 'uploader': try_get(video_data, lambda x: x['creator']['name'], str), + 'uploader_id': try_get(video_data, (lambda x: x['creator']['mail'], + lambda x: x['creator']['id']), str), + 'thumbnails': thumbnails, + **self.extract_all_subtitles(api_url, video_id, headers), + 'timestamp': parse_iso8601(video_data.get('created')), + 'duration': parse_duration(try_get(video_data, lambda x: x['media']['duration'])), + 'webpage_url': f'https://web.microsoftstream.com/video/{video_id}', + 'view_count': try_get(video_data, lambda x: x['metrics']['views'], int), + 'like_count': try_get(video_data, lambda x: x['metrics']['likes'], int), + 'comment_count': try_get(video_data, lambda x: x['metrics']['comments'], int), + 'formats': formats, + } diff --git a/yt_dlp/extractor/minoto.py b/yt_dlp/extractor/minoto.py index dba82db5ff..603ce940ba 100644 --- a/yt_dlp/extractor/minoto.py +++ b/yt_dlp/extractor/minoto.py @@ -37,7 +37,7 @@ def _real_extract(self, url): 'filesize': int_or_none(fmt.get('filesize')), 'width': int_or_none(fmt.get('width')), 'height': int_or_none(fmt.get('height')), - 'codecs': parse_codecs(fmt.get('codecs')), + **parse_codecs(fmt.get('codecs')), }) self._sort_formats(formats) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py new file mode 100644 index 0000000000..a99ddd172e --- /dev/null +++ b/yt_dlp/extractor/mixch.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, +) + + +class MixchIE(InfoExtractor): + IE_NAME = 'mixch' + _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P\d+)' + 
+ TESTS = [{ + 'url': 'https://mixch.tv/u/16236849/live', + 'skip': 'don\'t know if this live persists', + 'info_dict': { + 'id': '16236849', + 'title': '24配信シェア⭕️投票🙏💦', + 'comment_count': 13145, + 'view_count': 28348, + 'timestamp': 1636189377, + 'uploader': '🦥伊咲👶🏻#フレアワ', + 'uploader_id': '16236849', + } + }, { + 'url': 'https://mixch.tv/u/16137876/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id) + + initial_js_state = self._parse_json(self._search_regex( + r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) + if not initial_js_state.get('liveInfo'): + raise ExtractorError('Livestream has ended.', expected=True) + + return { + 'id': video_id, + 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')), + 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')), + 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')), + 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')), + 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')), + 'uploader_id': video_id, + 'formats': [{ + 'format_id': 'hls', + 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id, + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'is_live': True, + } diff --git a/yt_dlp/extractor/mlssoccer.py b/yt_dlp/extractor/mlssoccer.py new file mode 100644 index 0000000000..1d6d4b8040 --- /dev/null +++ b/yt_dlp/extractor/mlssoccer.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MLSSoccerIE(InfoExtractor): + _VALID_DOMAINS = 
r'(?:(?:cfmontreal|intermiamicf|lagalaxy|lafc|houstondynamofc|dcunited|atlutd|mlssoccer|fcdallas|columbuscrew|coloradorapids|fccincinnati|chicagofirefc|austinfc|nashvillesc|whitecapsfc|sportingkc|soundersfc|sjearthquakes|rsl|timbers|philadelphiaunion|orlandocitysc|newyorkredbulls|nycfc)\.com|(?:torontofc)\.ca|(?:revolutionsoccer)\.net)' + _VALID_URL = r'https?://(?:www\.)?%s/video/#?(?P[^/&$#?]+)' % _VALID_DOMAINS + + _TESTS = [{ + 'url': 'https://www.mlssoccer.com/video/the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986#the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986', + 'info_dict': { + 'id': '6276033198001', + 'ext': 'mp4', + 'title': 'The Octagon | Can Alphonso Davies lead Canada to first World Cup since 1986?', + 'description': 'md5:f0a883ee33592a0221798f451a98be8f', + 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/static/5530036772001/1bbc44f6-c63c-4981-82fa-46b0c1f891e0/5c1ca44a-a033-4e98-b531-ff24c4947608/160x90/match/image.jpg', + 'duration': 350.165, + 'timestamp': 1633627291, + 'uploader_id': '5530036772001', + 'tags': ['club/canada'], + 'is_live': False, + 'upload_date': '20211007', + 'filesize_approx': 255193528.83200002 + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.whitecapsfc.com/video/highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021#highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021', + 'only_matching': True + }, { + 'url': 'https://www.torontofc.ca/video/highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733#highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733', + 'only_matching': True + }, { + 'url': 'https://www.sportingkc.com/video/post-match-press-conference-john-pulskamp-oct-27-2021#post-match-press-conference-john-pulskamp-oct-27-2021', + 'only_matching': True + }, { + 'url': 'https://www.soundersfc.com/video/highlights-seattle-sounders-fc-vs-sporting-kansas-city-october-23-2021', + 'only_matching': 
True + }, { + 'url': 'https://www.sjearthquakes.com/video/#highlights-austin-fc-vs-san-jose-earthquakes-june-19-2021', + 'only_matching': True + }, { + 'url': 'https://www.rsl.com/video/2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21#2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21', + 'only_matching': True + }, { + 'url': 'https://www.timbers.com/video/highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose#highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose', + 'only_matching': True + }, { + 'url': 'https://www.philadelphiaunion.com/video/highlights-torvphi', + 'only_matching': True + }, { + 'url': 'https://www.orlandocitysc.com/video/highlight-columbus-crew-vs-orlando-city-sc', + 'only_matching': True + }, { + 'url': 'https://www.newyorkredbulls.com/video/all-access-matchday-double-derby-week#all-access-matchday-double-derby-week', + 'only_matching': True + }, { + 'url': 'https://www.nycfc.com/video/highlights-nycfc-1-0-chicago-fire-fc#highlights-nycfc-1-0-chicago-fire-fc', + 'only_matching': True + }, { + 'url': 'https://www.revolutionsoccer.net/video/two-minute-highlights-revs-1-rapids-0-october-27-2021#two-minute-highlights-revs-1-rapids-0-october-27-2021', + 'only_matching': True + }, { + 'url': 'https://www.nashvillesc.com/video/goal-c-j-sapong-nashville-sc-92nd-minute', + 'only_matching': True + }, { + 'url': 'https://www.cfmontreal.com/video/faits-saillants-tor-v-mtl#faits-saillants-orl-v-mtl-x5645', + 'only_matching': True + }, { + 'url': 'https://www.intermiamicf.com/video/all-access-victory-vs-nashville-sc-by-ukg#all-access-victory-vs-nashville-sc-by-ukg', + 'only_matching': True + }, { + 'url': 'https://www.lagalaxy.com/video/#moment-of-the-month-presented-by-san-manuel-casino-rayan-raveloson-scores-his-se', + 'only_matching': True + }, { + 'url': 
'https://www.lafc.com/video/breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season#breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season', + 'only_matching': True + }, { + 'url': 'https://www.houstondynamofc.com/video/postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660#postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660', + 'only_matching': True + }, { + 'url': 'https://www.dcunited.com/video/tony-alfaro-my-family-pushed-me-to-believe-everything-was-possible', + 'only_matching': True + }, { + 'url': 'https://www.fcdallas.com/video/highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021#highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021', + 'only_matching': True + }, { + 'url': 'https://www.columbuscrew.com/video/match-rewind-columbus-crew-vs-new-york-red-bulls-october-23-2021', + 'only_matching': True + }, { + 'url': 'https://www.coloradorapids.com/video/postgame-reaction-robin-fraser-october-27#postgame-reaction-robin-fraser-october-27', + 'only_matching': True + }, { + 'url': 'https://www.fccincinnati.com/video/#keeping-cincy-chill-presented-by-coors-lite', + 'only_matching': True + }, { + 'url': 'https://www.chicagofirefc.com/video/all-access-fire-score-dramatic-road-win-in-cincy#all-access-fire-score-dramatic-road-win-in-cincy', + 'only_matching': True + }, { + 'url': 'https://www.austinfc.com/video/highlights-colorado-rapids-vs-austin-fc-september-29-2021#highlights-colorado-rapids-vs-austin-fc-september-29-2021', + 'only_matching': True + }, { + 'url': 'https://www.atlutd.com/video/goal-josef-martinez-scores-in-the-73rd-minute#goal-josef-martinez-scores-in-the-73rd-minute', + 'only_matching': True + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), id)['videoList'][0] + return { + 'id': id, + '_type': 
'url', + 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (data_json['accountId'], data_json['videoId']), + 'ie_key': 'BrightcoveNew', + } diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index ef1e081f20..111c7c5442 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -127,9 +127,9 @@ def _real_extract(self, url): comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( - r'"thumb-member-username">\s+\s+]+\bclass="username">([^<]+)'), + webpage, 'uploader_id', fatal=False) categories = self._html_search_meta('keywords', webpage, default=None) if categories: categories = [cat.strip() for cat in categories.split(',')] @@ -169,7 +169,18 @@ class MotherlessGroupIE(InfoExtractor): 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' 'any kind!' }, - 'playlist_mincount': 9, + 'playlist_mincount': 0, + 'expected_warnings': [ + 'This group has no videos.', + ] + }, { + 'url': 'https://motherless.com/g/beautiful_cock', + 'info_dict': { + 'id': 'beautiful_cock', + 'title': 'Beautiful Cock', + 'description': 'Group for lovely cocks yours, mine, a friends anything human', + }, + 'playlist_mincount': 2500, }] @classmethod @@ -209,11 +220,18 @@ def _real_extract(self, url): description = self._html_search_meta( 'description', webpage, fatal=False) page_count = self._int(self._search_regex( - r'(\d+)<(?:a|span)[^>]+>\s*NEXT', - webpage, 'page_count'), 'page_count') + r'(\d+)<(?:a|span)[^>]+rel="next">', + webpage, 'page_count', default=0), 'page_count') + if not page_count: + message = self._search_regex( + r'class="error-page"[^>]*>\s*]*>\s*(?P[^<]+)(?<=\S)\s*', + webpage, 'error_msg', default=None) or 'This group has no videos.' 
+ self.report_warning(message, group_id) PAGE_SIZE = 80 def _get_page(idx): + if not page_count: + return webpage = self._download_webpage( page_url, group_id, query={'page': idx + 1}, note='Downloading page %d/%d' % (idx + 1, page_count) diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index e446a955b9..be5de0a70c 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -15,6 +15,7 @@ float_or_none, HEADRequest, int_or_none, + join_nonempty, RegexNotFoundError, sanitized_Request, strip_or_none, @@ -44,7 +45,7 @@ def _remove_template_parameter(url): # Remove the templates, like &device={device} return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) - def _get_feed_url(self, uri): + def _get_feed_url(self, uri, url=None): return self._FEED_URL def _get_thumbnail_url(self, uri, itemdoc): @@ -99,9 +100,9 @@ def _extract_video_formats(self, mdoc, mtvn_id, video_id): formats.extend([{ 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, 'url': rtmp_video_url, - 'format_id': '-'.join(filter(None, [ + 'format_id': join_nonempty( 'rtmp' if rtmp_video_url.startswith('rtmp') else None, - rendition.get('bitrate')])), + rendition.get('bitrate')), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), }]) @@ -229,9 +230,9 @@ def _get_feed_query(self, uri): data['lang'] = self._LANG return data - def _get_videos_info(self, uri, use_hls=True): + def _get_videos_info(self, uri, use_hls=True, url=None): video_id = self._id_from_uri(uri) - feed_url = self._get_feed_url(uri) + feed_url = self._get_feed_url(uri, url) info_url = update_url_query(feed_url, self._get_feed_query(uri)) return self._get_videos_info_from_url(info_url, video_id, use_hls) @@ -311,7 +312,17 @@ def _extract_mgid(self, webpage): main_container = self._extract_child_with_type(data, 'MainContainer') ab_testing = self._extract_child_with_type(main_container, 'ABTesting') video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') - 
mgid = video_player['props']['media']['video']['config']['uri'] + if video_player: + mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri']) + else: + flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper') + auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper') + player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player') + if player: + mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid']) + + if not mgid: + raise ExtractorError('Could not extract mgid') return mgid @@ -319,7 +330,7 @@ def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) mgid = self._extract_mgid(webpage) - videos_info = self._get_videos_info(mgid) + videos_info = self._get_videos_info(mgid, url=url) return videos_info @@ -348,7 +359,7 @@ def _extract_url(webpage): if mobj: return mobj.group('url') - def _get_feed_url(self, uri): + def _get_feed_url(self, uri, url=None): video_id = self._id_from_uri(uri) config = self._download_json( 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) diff --git a/yt_dlp/extractor/muenchentv.py b/yt_dlp/extractor/muenchentv.py index d256236d18..a53929e1b4 100644 --- a/yt_dlp/extractor/muenchentv.py +++ b/yt_dlp/extractor/muenchentv.py @@ -33,7 +33,7 @@ def _real_extract(self, url): display_id = 'live' webpage = self._download_webpage(url, display_id) - title = self._live_title(self._og_search_title(webpage)) + title = self._og_search_title(webpage) data_js = self._search_regex( r'(?s)\nplaylist:\s*(\[.*?}\]),', diff --git a/yt_dlp/extractor/musescore.py b/yt_dlp/extractor/musescore.py new file mode 100644 index 0000000000..09fadf8d90 --- /dev/null +++ b/yt_dlp/extractor/musescore.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MuseScoreIE(InfoExtractor): + 
_VALID_URL = r'https?://(?:www\.)?musescore\.com/(?:user/\d+|[^/]+)(?:/scores)?/(?P[^#&?]+)' + _TESTS = [{ + 'url': 'https://musescore.com/user/73797/scores/142975', + 'info_dict': { + 'id': '142975', + 'ext': 'mp3', + 'title': 'WA Mozart Marche Turque (Turkish March fingered)', + 'description': 'md5:7ede08230e4eaabd67a4a98bb54d07be', + 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', + 'uploader': 'PapyPiano', + 'creator': 'Wolfgang Amadeus Mozart', + } + }, { + 'url': 'https://musescore.com/user/36164500/scores/6837638', + 'info_dict': { + 'id': '6837638', + 'ext': 'mp3', + 'title': 'Sweet Child O\' Mine – Guns N\' Roses sweet child', + 'description': 'md5:4dca71191c14abc312a0a4192492eace', + 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', + 'uploader': 'roxbelviolin', + 'creator': 'Guns N´Roses Arr. Roxbel Violin', + } + }, { + 'url': 'https://musescore.com/classicman/fur-elise', + 'info_dict': { + 'id': '33816', + 'ext': 'mp3', + 'title': 'Für Elise – Beethoven', + 'description': 'md5:49515a3556d5ecaf9fa4b2514064ac34', + 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', + 'uploader': 'ClassicMan', + 'creator': 'Ludwig van Beethoven (1770–1827)', + } + }, { + 'url': 'https://musescore.com/minh_cuteee/scores/6555384', + 'only_matching': True, + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, None) + url = self._og_search_url(webpage) or url + id = self._match_id(url) + mp3_url = self._download_json(f'https://musescore.com/api/jmuse?id={id}&index=0&type=mp3&v2=1', id, + headers={'authorization': '63794e5461e4cfa046edfbdddfccc1ac16daffd2'})['info']['url'] + formats = [{ + 'url': mp3_url, + 'ext': 'mp3', + 'vcodec': 'none', + }] + + return { + 'id': id, + 'formats': formats, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_meta('musescore:author', 
webpage, 'uploader'), + 'creator': self._html_search_meta('musescore:composer', webpage, 'composer'), + } diff --git a/yt_dlp/extractor/mxplayer.py b/yt_dlp/extractor/mxplayer.py index 0f1c439aa8..3c2afd838d 100644 --- a/yt_dlp/extractor/mxplayer.py +++ b/yt_dlp/extractor/mxplayer.py @@ -3,43 +3,68 @@ from .common import InfoExtractor from ..compat import compat_str -from ..utils import ( - ExtractorError, - js_to_json, - qualities, - try_get, - url_or_none, - urljoin, -) +from ..utils import try_get class MxplayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/(?:movie|show/[-\w]+/[-\w]+)/(?P[-\w]+)-(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/(?Pmovie|show/[-\w]+/[-\w]+)/(?P[-\w]+)-(?P\w+)' _TESTS = [{ + 'url': 'https://www.mxplayer.in/show/watch-my-girlfriend-is-an-alien-hindi-dubbed/season-1/episode-1-online-9d2013d31d5835bb8400e3b3c5e7bb72', + 'info_dict': { + 'id': '9d2013d31d5835bb8400e3b3c5e7bb72', + 'ext': 'mp4', + 'title': 'Episode 1', + 'description': 'md5:62ed43eb9fec5efde5cf3bd1040b7670', + 'season_number': 1, + 'episode_number': 1, + 'duration': 2451, + 'season': 'Season 1', + 'series': 'My Girlfriend Is An Alien (Hindi Dubbed)', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/9d2013d31d5835bb8400e3b3c5e7bb72/en/16x9/320x180/9562f5f8df42cad09c9a9c4e69eb1567_1920x1080.webp', + 'episode': 'Episode 1' + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { 'url': 'https://www.mxplayer.in/movie/watch-knock-knock-hindi-dubbed-movie-online-b9fa28df3bfb8758874735bbd7d2655a?watch=true', 'info_dict': { 'id': 'b9fa28df3bfb8758874735bbd7d2655a', 'ext': 'mp4', 'title': 'Knock Knock (Hindi Dubbed)', - 'description': 'md5:b195ba93ff1987309cfa58e2839d2a5b' + 'description': 'md5:b195ba93ff1987309cfa58e2839d2a5b', + 'season_number': 0, + 'episode_number': 0, + 'duration': 5970, + 'season': 'Season 0', + 'series': None, + 'thumbnail': 
'https://qqcdnpictest.mxplay.com/pic/b9fa28df3bfb8758874735bbd7d2655a/en/16x9/320x180/test_pic1588676032011.webp', + 'episode': 'Episode 0' }, 'params': { + 'format': 'bv', 'skip_download': True, - 'format': 'bestvideo' - } + }, }, { 'url': 'https://www.mxplayer.in/show/watch-shaitaan/season-1/the-infamous-taxi-gang-of-meerut-online-45055d5bcff169ad48f2ad7552a83d6c', 'info_dict': { 'id': '45055d5bcff169ad48f2ad7552a83d6c', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'The infamous taxi gang of Meerut', 'description': 'md5:033a0a7e3fd147be4fb7e07a01a3dc28', + 'season_number': 1, + 'episode_number': 1, + 'duration': 2332, 'season': 'Season 1', - 'series': 'Shaitaan' + 'series': 'Shaitaan', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/45055d5bcff169ad48f2ad7552a83d6c/en/16x9/320x180/voot_8e7d5f8d8183340869279c732c1e3a43.webp', + 'episode': 'Episode 1' }, 'params': { + 'format': 'best', 'skip_download': True, - } + }, }, { 'url': 'https://www.mxplayer.in/show/watch-aashram/chapter-1/duh-swapna-online-d445579792b0135598ba1bc9088a84cb', 'info_dict': { @@ -47,93 +72,115 @@ class MxplayerIE(InfoExtractor): 'ext': 'mp4', 'title': 'Duh Swapna', 'description': 'md5:35ff39c4bdac403c53be1e16a04192d8', + 'season_number': 1, + 'episode_number': 3, + 'duration': 2568, 'season': 'Chapter 1', - 'series': 'Aashram' + 'series': 'Aashram', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/d445579792b0135598ba1bc9088a84cb/en/4x3/1600x1200/test_pic1624819307993.webp', + 'episode': 'Episode 3' }, - 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], 'params': { + 'format': 'bv', 'skip_download': True, - 'format': 'bestvideo' - } + }, + }, { + 'url': 'https://www.mxplayer.in/show/watch-dangerous/season-1/chapter-1-online-5a351b4f9fb69436f6bd6ae3a1a75292', + 'info_dict': { + 'id': '5a351b4f9fb69436f6bd6ae3a1a75292', + 'ext': 'mp4', + 'title': 'Chapter 1', + 'description': 'md5:233886b8598bc91648ac098abe1d288f', + 'season_number': 1, + 'episode_number': 1, + 
'duration': 1305, + 'season': 'Season 1', + 'series': 'Dangerous', + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/5a351b4f9fb69436f6bd6ae3a1a75292/en/4x3/1600x1200/test_pic1624706302350.webp', + 'episode': 'Episode 1' + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/movie/watch-the-attacks-of-2611-movie-online-0452f0d80226c398d63ce7e3ea40fa2d', + 'info_dict': { + 'id': '0452f0d80226c398d63ce7e3ea40fa2d', + 'ext': 'mp4', + 'title': 'The Attacks of 26/11', + 'description': 'md5:689bacd29e97b3f31eaf519eb14127e5', + 'season_number': 0, + 'episode_number': 0, + 'duration': 6085, + 'season': 'Season 0', + 'series': None, + 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/0452f0d80226c398d63ce7e3ea40fa2d/en/16x9/320x180/00c8955dab5e5d340dbde643f9b1f6fd_1920x1080.webp', + 'episode': 'Episode 0' + }, + 'params': { + 'format': 'best', + 'skip_download': True, + }, }] - def _get_stream_urls(self, video_dict): - stream_provider_dict = try_get( - video_dict, - lambda x: x['stream'][x['stream']['provider']]) - if not stream_provider_dict: - raise ExtractorError('No stream provider found', expected=True) - - for stream_name, stream in stream_provider_dict.items(): - if stream_name in ('hls', 'dash', 'hlsUrl', 'dashUrl'): - stream_type = stream_name.replace('Url', '') - if isinstance(stream, dict): - for quality, stream_url in stream.items(): - if stream_url: - yield stream_type, quality, stream_url - else: - yield stream_type, 'base', stream - def _real_extract(self, url): - display_id, video_id = self._match_valid_url(url).groups() - webpage = self._download_webpage(url, video_id) - - source = self._parse_json( - js_to_json(self._html_search_regex( - r'(?s)).*', - webpage, 'WindowState')), - video_id) - if not source: - raise ExtractorError('Cannot find source', expected=True) - - config_dict = source['config'] - video_dict = source['entities'][video_id] + type, display_id, video_id = 
self._match_valid_url(url).groups() + type = 'movie_film' if type == 'movie' else 'tvshow_episode' + API_URL = 'https://androidapi.mxplay.com/v1/detail/' + headers = { + 'X-Av-Code': '23', + 'X-Country': 'IN', + 'X-Platform': 'android', + 'X-App-Version': '1370001318', + 'X-Resolution': '3840x2160', + } + data_json = self._download_json(f'{API_URL}{type}/{video_id}', display_id, headers=headers)['profile'] + season, series = None, None + for dct in data_json.get('levelInfos', []): + if dct.get('type') == 'tvshow_season': + season = dct.get('name') + elif dct.get('type') == 'tvshow_show': + series = dct.get('name') thumbnails = [] - for i in video_dict.get('imageInfo') or []: + for thumb in data_json.get('poster', []): thumbnails.append({ - 'url': urljoin(config_dict['imageBaseUrl'], i['url']), - 'width': i['width'], - 'height': i['height'], + 'url': thumb.get('url'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), }) formats = [] - get_quality = qualities(['main', 'base', 'high']) - for stream_type, quality, stream_url in self._get_stream_urls(video_dict): - format_url = url_or_none(urljoin(config_dict['videoCdnBaseUrl'], stream_url)) - if not format_url: - continue - if stream_type == 'dash': - dash_formats = self._extract_mpd_formats( - format_url, video_id, mpd_id='dash-%s' % quality, headers={'Referer': url}) - for frmt in dash_formats: - frmt['quality'] = get_quality(quality) - formats.extend(dash_formats) - dash_formats_h265 = self._extract_mpd_formats( - format_url.replace('h264_high', 'h265_main'), video_id, mpd_id='dash-%s' % quality, headers={'Referer': url}, fatal=False) - for frmt in dash_formats_h265: - frmt['quality'] = get_quality(quality) - formats.extend(dash_formats_h265) - elif stream_type == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, fatal=False, - m3u8_id='hls-%s' % quality, quality=get_quality(quality), ext='mp4')) - + subtitles = {} + for dct in data_json.get('playInfo', []): + if 
dct.get('extension') == 'mpd': + frmt, subs = self._extract_mpd_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) + formats.extend(frmt) + subtitles = self._merge_subtitles(subtitles, subs) + elif dct.get('extension') == 'm3u8': + frmt, subs = self._extract_m3u8_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) + formats.extend(frmt) + subtitles = self._merge_subtitles(subtitles, subs) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, - 'title': video_dict['title'] or self._og_search_title(webpage), - 'formats': formats, - 'description': video_dict.get('description'), - 'season': try_get(video_dict, lambda x: x['container']['title']), - 'series': try_get(video_dict, lambda x: x['container']['container']['title']), + 'title': data_json.get('name') or display_id, + 'description': data_json.get('description'), + 'season_number': data_json.get('seasonNum'), + 'episode_number': data_json.get('episodeNum'), + 'duration': data_json.get('duration'), + 'season': season, + 'series': series, 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, } class MxplayerShowIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?mxplayer\.in/show/(?P[-\w]+)-(?P\w+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?mxplayer\.in/show/(?P[-\w]+)-(?P\w+)/?(?:$|[#?])' _TESTS = [{ 'url': 'https://www.mxplayer.in/show/watch-chakravartin-ashoka-samrat-series-online-a8f44e3cc0814b5601d17772cedf5417', 'playlist_mincount': 440, diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py new file mode 100644 index 0000000000..fdb7f32dbe --- /dev/null +++ b/yt_dlp/extractor/n1.py @@ -0,0 +1,142 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_timestamp, + extract_attributes, +) + + +class N1InfoAssetIE(InfoExtractor): + _VALID_URL = r'https?://best-vod\.umn\.cdn\.united\.cloud/stream\?asset=(?P[^&]+)' + _TESTS = [{ + 
'url': 'https://best-vod.umn.cdn.united.cloud/stream?asset=ljsottomazilirija3060921-n1info-si-worldwide&stream=hp1400&t=0&player=m3u8v&sp=n1info&u=n1info&p=n1Sh4redSecre7iNf0', + 'md5': '28b08b32aeaff2b8562736ccd5a66fe7', + 'info_dict': { + 'id': 'ljsottomazilirija3060921-n1info-si-worldwide', + 'ext': 'mp4', + 'title': 'ljsottomazilirija3060921-n1info-si-worldwide', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class N1InfoIIE(InfoExtractor): + IE_NAME = 'N1Info:article' + _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P[^/]+)' + _TESTS = [{ + # Youtube embedded + 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', + 'md5': '01ddb6646d0fd9c4c7d990aa77fe1c5a', + 'info_dict': { + 'id': 'L5Hd4hQVUpk', + 'ext': 'mp4', + 'upload_date': '20210913', + 'title': 'Ozmo i USO21, ep. 
13: Novak Đoković – Danil Medvedev | Ključevi Poraza, Budućnost | SPORT KLUB TENIS', + 'description': 'md5:467f330af1effedd2e290f10dc31bb8e', + 'uploader': 'Sport Klub', + 'uploader_id': 'sportklub', + } + }, { + 'url': 'https://rs.n1info.com/vesti/djilas-los-plan-za-metro-nece-resiti-nijedan-saobracajni-problem/', + 'info_dict': { + 'id': 'bgmetrosot2409zta20210924174316682-n1info-rs-worldwide', + 'ext': 'mp4', + 'title': 'Đilas: Predlog izgradnje metroa besmislen; SNS odbacuje navode', + 'upload_date': '20210924', + 'timestamp': 1632481347, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://n1info.si/novice/slovenija/zadnji-dnevi-na-kopaliscu-ilirija-ilirija-ni-umrla-ubili-so-jo/', + 'info_dict': { + 'id': 'ljsottomazilirija3060921-n1info-si-worldwide', + 'ext': 'mp4', + 'title': 'Zadnji dnevi na kopališču Ilirija: “Ilirija ni umrla, ubili so jo”', + 'timestamp': 1632567630, + 'upload_date': '20210925', + }, + 'params': { + 'skip_download': True, + }, + }, { + # Reddit embedded + 'url': 'https://ba.n1info.com/lifestyle/vucic-bolji-od-tita-ako-izgubi-ja-cu-da-crknem-jugoslavija-je-gotova/', + 'info_dict': { + 'id': '2wmfee9eycp71', + 'ext': 'mp4', + 'title': '"Ako Vučić izgubi izbore, ja ću da crknem, Jugoslavija je gotova"', + 'upload_date': '20210924', + 'timestamp': 1632448649.0, + 'uploader': 'YouLotWhatDontStop', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://nova.rs/vesti/politika/zaklina-tatalovic-ani-brnabic-pricate-lazi-video/', + 'info_dict': { + 'id': 'tnjganabrnabicizaklinatatalovic100danavladegp-novas-worldwide', + 'ext': 'mp4', + 'title': 'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)', + 'upload_date': '20211102', + 'timestamp': 1635861677, + }, + }, { + 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + + title = self._html_search_regex(r']+>(.+?)', webpage, 'title') + timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) + + videos = re.findall(r'(?m)(]+>)', webpage) + entries = [] + for video in videos: + video_data = extract_attributes(video) + entries.append({ + '_type': 'url_transparent', + 'url': video_data.get('data-url'), + 'id': video_data.get('id'), + 'title': title, + 'thumbnail': video_data.get('data-thumbnail'), + 'timestamp': timestamp, + 'ie_key': 'N1InfoAsset'}) + + embedded_videos = re.findall(r'(]+>)', webpage) + for embedded_video in embedded_videos: + video_data = extract_attributes(embedded_video) + url = video_data.get('src') or '' + if url.startswith('https://www.youtube.com'): + entries.append(self.url_result(url, ie='Youtube')) + elif url.startswith('https://www.redditmedia.com'): + entries.append(self.url_result(url, ie='RedditR')) + + return { + '_type': 'playlist', + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'entries': entries, + } diff --git a/yt_dlp/extractor/nate.py b/yt_dlp/extractor/nate.py new file mode 100644 index 0000000000..072faf6ea3 --- /dev/null +++ b/yt_dlp/extractor/nate.py @@ -0,0 +1,124 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, + traverse_obj, + unified_strdate, +) + + +class NateIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nate\.com/clip/(?P[0-9]+)' + + _TESTS = [{ + 'url': 'https://tv.nate.com/clip/1848976', + 'info_dict': { + 'id': '1848976', + 'ext': 'mp4', + 'title': '[결승 오프닝 타이틀] 2018 LCK 서머 스플릿 결승전 kt Rolster VS Griffin', + 'description': 'md5:e1b79a7dcf0d8d586443f11366f50e6f', + 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20180908', + 'age_limit': 15, + 'duration': 73, + 'uploader': '2018 LCK 서머 스플릿(롤챔스)', + 'channel': '2018 LCK 서머 스플릿(롤챔스)', + 'channel_id': '3606', + 
'uploader_id': '3606', + 'tags': 'count:59', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://tv.nate.com/clip/4300566', + 'info_dict': { + 'id': '4300566', + 'ext': 'mp4', + 'title': '[심쿵엔딩] 이준호x이세영, 서로를 기억하며 끌어안는 두 사람!💕, MBC 211204 방송', + 'description': 'md5:be1653502d9c13ce344ddf7828e089fa', + 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20211204', + 'age_limit': 15, + 'duration': 201, + 'uploader': '옷소매 붉은 끝동', + 'channel': '옷소매 붉은 끝동', + 'channel_id': '27987', + 'uploader_id': '27987', + 'tags': 'count:20', + }, + 'params': {'skip_download': True} + }] + + _QUALITY = { + '36': 2160, + '35': 1080, + '34': 720, + '33': 480, + '32': 360, + '31': 270, + } + + def _real_extract(self, url): + id = self._match_id(url) + video_data = self._download_json(f'https://tv.nate.com/api/v1/clip/{id}', id) + formats = [{ + 'format_id': f_url[-2:], + 'url': f_url, + 'height': self._QUALITY.get(f_url[-2:]), + 'quality': int_or_none(f_url[-2:]), + } for f_url in video_data.get('smcUriList') or []] + self._sort_formats(formats) + return { + 'id': id, + 'title': video_data.get('clipTitle'), + 'description': video_data.get('synopsis'), + 'thumbnail': video_data.get('contentImg'), + 'upload_date': unified_strdate(traverse_obj(video_data, 'broadDate', 'regDate')), + 'age_limit': video_data.get('targetAge'), + 'duration': video_data.get('playTime'), + 'formats': formats, + 'uploader': video_data.get('programTitle'), + 'channel': video_data.get('programTitle'), + 'channel_id': str_or_none(video_data.get('programSeq')), + 'uploader_id': str_or_none(video_data.get('programSeq')), + 'tags': video_data['hashTag'].split(',') if video_data.get('hashTag') else None, + } + + +class NateProgramIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nate\.com/program/clips/(?P[0-9]+)' + + _TESTS = [{ + 'url': 'https://tv.nate.com/program/clips/27987', + 'playlist_mincount': 191, + 'info_dict': { + 'id': '27987', + }, + }, { + 'url': 'https://tv.nate.com/program/clips/3606', + 
'playlist_mincount': 15, + 'info_dict': { + 'id': '3606', + }, + }] + + def _entries(self, id): + for page_num in itertools.count(1): + program_data = self._download_json(f'https://tv.nate.com/api/v1/program/{id}/clip/ranking?size=20&page={page_num}', + id, note=f'Downloading page {page_num}') + for clip in program_data.get('content') or []: + clip_id = clip.get('clipSeq') + if clip_id: + yield self.url_result( + 'https://tv.nate.com/clip/%s' % clip_id, + ie=NateIE.ie_key(), video_id=clip_id) + if program_data.get('last'): + break + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id), playlist_id=id) diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index acf53c1ff2..a6821ba86d 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -40,6 +40,7 @@ def extract_formats(streams, stream_type, query={}): formats.append({ 'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))), 'url': stream_url, + 'ext': 'mp4', 'width': int_or_none(encoding_option.get('width')), 'height': int_or_none(encoding_option.get('height')), 'vbr': int_or_none(bitrate.get('video')), @@ -174,7 +175,7 @@ class NaverLiveIE(InfoExtractor): 'url': 'https://tv.naver.com/l/52010', 'info_dict': { 'id': '52010', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"', 'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3', 'channel_id': 'NTV-ytnnews24-0', @@ -184,7 +185,7 @@ class NaverLiveIE(InfoExtractor): 'url': 'https://tv.naver.com/l/51549', 'info_dict': { 'id': '51549', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': '연합뉴스TV - 코로나19 뉴스특보', 'description': 'md5:c655e82091bc21e413f549c0eaccc481', 'channel_id': 'NTV-yonhapnewstv-0', @@ -233,7 +234,7 @@ def _extract_video_info(self, video_id, url): continue formats.extend(self._extract_m3u8_formats( - quality.get('url'), video_id, 'm3u8', + quality.get('url'), video_id, 'mp4', 
m3u8_id=quality.get('qualityId'), live=True )) self._sort_formats(formats) diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index f304f191af..cd573690b2 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -305,7 +305,7 @@ def _real_extract(self, url): self._sort_formats(formats) return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': live_source.get('description'), 'formats': formats, 'is_live': is_live, @@ -545,8 +545,6 @@ def _real_extract(self, url): title = event_config['eventTitle'] is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus')) - if is_live: - title = self._live_title(title) source_url = self._download_json( f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging', diff --git a/yt_dlp/extractor/ndr.py b/yt_dlp/extractor/ndr.py index 45aa106c80..1917254b8d 100644 --- a/yt_dlp/extractor/ndr.py +++ b/yt_dlp/extractor/ndr.py @@ -1,15 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, - merge_dicts, - parse_iso8601, + parse_duration, qualities, try_get, + unified_strdate, urljoin, ) @@ -28,110 +27,110 @@ class NDRIE(NDRBaseIE): IE_DESC = 'NDR.de - Norddeutscher Rundfunk' _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),(?P[\da-z]+)\.html' _TESTS = [{ - # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', - 'md5': '6515bc255dc5c5f8c85bbc38e035a659', 'info_dict': { 'id': 'hafengeburtstag988', - 'display_id': 'Party-Poette-und-Parade', 'ext': 'mp4', 'title': 'Party, Pötte und Parade', + 'thumbnail': 'https://www.ndr.de/fernsehen/hafengeburtstag990_v-contentxl.jpg', 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', - 'uploader': 'ndrtv', - 'timestamp': 1431108900, - 'upload_date': 
'20150510', + 'series': None, + 'channel': 'NDR Fernsehen', + 'upload_date': '20150508', 'duration': 3498, }, - 'params': { - 'skip_download': True, - }, }, { - # httpVideo, different content id - 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', - 'md5': '1043ff203eab307f0c51702ec49e9a71', + 'url': 'https://www.ndr.de/sport/fussball/Rostocks-Matchwinner-Froede-Ein-Hansa-Debuet-wie-im-Maerchen,hansa10312.html', + 'only_matching': True + }, { + 'url': 'https://www.ndr.de/nachrichten/niedersachsen/kommunalwahl_niedersachsen_2021/Grosse-Parteien-zufrieden-mit-Ergebnissen-der-Kommunalwahl,kommunalwahl1296.html', 'info_dict': { - 'id': 'osna272', - 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', + 'id': 'kommunalwahl1296', 'ext': 'mp4', - 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', - 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', - 'uploader': 'ndrtv', - 'timestamp': 1442059200, - 'upload_date': '20150912', - 'duration': 510, - }, - 'params': { - 'skip_download': True, + 'title': 'Die Spitzenrunde: Die Wahl aus Sicht der Landespolitik', + 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot1194912_v-contentxl.jpg', + 'description': 'md5:5c6e2ad744cef499135735a1036d7aa7', + 'series': 'Hallo Niedersachsen', + 'channel': 'NDR Fernsehen', + 'upload_date': '20210913', + 'duration': 438, }, }, { - # httpAudio, same content id - 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'info_dict': { - 'id': 'audio51535', - 'display_id': 'La-Valette-entgeht-der-Hinrichtung', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', - 'uploader': 'ndrinfo', - 'timestamp': 1290626100, - 'upload_date': '20140729', - 'duration': 884, - }, - 'params': { - 'skip_download': True, - }, - }, { - # with subtitles 'url': 
'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', 'info_dict': { - 'id': 'extra18674', - 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'id': 'sendung1091858', 'ext': 'mp4', 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', - 'uploader': 'ndrtv', - 'upload_date': '20201113', + 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot983938_v-contentxl.jpg', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', + 'series': 'extra 3', + 'channel': 'NDR Fernsehen', + 'upload_date': '20201111', 'duration': 1749, - 'subtitles': { - 'de': [{ - 'ext': 'ttml', - 'url': r're:^https://www\.ndr\.de.+', - }], - }, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + } }, { - 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', - 'only_matching': True, + 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'info_dict': { + 'id': 'audio51535', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'thumbnail': 'https://www.ndr.de/mediathek/mediathekbild140_v-podcast.jpg', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'upload_date': '20140729', + 'duration': 884.0, + }, + 'expected_warnings': ['unable to extract json url'], }] def _extract_embed(self, webpage, display_id, id): - embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', - default=None) or self._search_regex( - r'\bembedUrl["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'embed URL', fatal=False, group='url') - if embed_url is None: - return self.url_result('ndr:%s' % id, ie=NDREmbedBaseIE.ie_key()) - description = self._search_regex( - r']+itemprop="description">([^<]+)

    ', - webpage, 'description', default=None) or self._og_search_description(webpage) - timestamp = parse_iso8601( - self._search_regex( - r']+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', default=None)) - info = self._search_json_ld(webpage, display_id, default={}) - return merge_dicts({ - '_type': 'url_transparent', - 'url': embed_url, - 'display_id': display_id, - 'description': description, - 'timestamp': timestamp, - }, info) + formats = [] + base_url = 'https://www.ndr.de' + json_url = self._search_regex(r']+src=\"([^\"]+)_theme-ndrde[^\.]*\.html\"', webpage, + 'json url', fatal=False) + if json_url: + data_json = self._download_json(base_url + json_url.replace('ardplayer_image', 'ardjson_image') + '.json', + id, fatal=False) + info_json = data_json.get('_info', {}) + media_json = try_get(data_json, lambda x: x['_mediaArray'][0]['_mediaStreamArray']) + for media in media_json: + if media.get('_quality') == 'auto': + formats.extend(self._extract_m3u8_formats(media['_stream'], id)) + subtitles = {} + sub_url = data_json.get('_subtitleUrl') + if sub_url: + subtitles.setdefault('de', []).append({ + 'url': base_url + sub_url, + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': info_json.get('clipTitle'), + 'thumbnail': base_url + data_json.get('_previewImage'), + 'description': info_json.get('clipDescription'), + 'series': info_json.get('seriesTitle') or None, + 'channel': info_json.get('channelTitle'), + 'upload_date': unified_strdate(info_json.get('clipDate')), + 'duration': data_json.get('_duration'), + 'formats': formats, + 'subtitles': subtitles, + } + else: + json_url = base_url + self._search_regex(r'apiUrl\s?=\s?\'([^\']+)\'', webpage, 'json url').replace( + '_belongsToPodcast-', '') + data_json = self._download_json(json_url, id, fatal=False) + return { + 'id': id, + 'title': data_json.get('title'), + 'thumbnail': base_url + data_json.get('poster'), + 'description': data_json.get('summary'), + 
'upload_date': unified_strdate(data_json.get('publicationDate')), + 'duration': parse_duration(data_json.get('duration')), + 'formats': [{ + 'url': try_get(data_json, (lambda x: x['audio'][0]['url'], lambda x: x['files'][0]['url'])), + 'vcodec': 'none', + 'ext': 'mp3', + }], + } class NJoyIE(NDRBaseIE): @@ -246,8 +245,6 @@ def _real_extract(self, url): live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive'] title = config['title'] - if live: - title = self._live_title(title) uploader = ppjson.get('config', {}).get('branding') upload_date = ppjson.get('config', {}).get('publicationDate') duration = int_or_none(config.get('duration')) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 4426a8fdc9..d235805c35 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,238 +1,290 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import time - -from urllib.error import HTTPError -from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote -from ..utils import ( - ExtractorError, - parse_iso8601, - try_get, - urljoin, -) - - -class NebulaIE(InfoExtractor): - - _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P[-\w]+)' - _TESTS = [ - { - 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', - 'info_dict': { - 'id': '5c271b40b13fd613090034fd', - 'ext': 'mp4', - 'title': 'That Time Disney Remade Beauty and the Beast', - 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', - 'upload_date': '20180731', - 'timestamp': 1533009600, - 'channel': 'Lindsay Ellis', - 'uploader': 'Lindsay Ellis', - }, - 'params': { - 'usenetrc': True, - }, - 'skip': 'All Nebula content requires authentication', - }, - { - 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': '6d4edd14ce65720fa63aba5c583fb328', - 'info_dict': { - 'id': '5e7e78171aaf320001fbd6be', - 'ext': 'mp4', - 'title': 'Landing Craft - How The Allies Got Ashore', - 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', - 'upload_date': '20200327', - 'timestamp': 1585348140, - 'channel': 'The Logistics of D-Day', - 'uploader': 'The Logistics of D-Day', - }, - 'params': { - 'usenetrc': True, - }, - 'skip': 'All Nebula content requires authentication', - }, - { - 'url': 'https://nebula.app/videos/money-episode-1-the-draw', - 'md5': '8c7d272910eea320f6f8e6d3084eecf5', - 'info_dict': { - 'id': '5e779ebdd157bc0001d1c75a', - 'ext': 'mp4', - 'title': 'Episode 1: The Draw', - 'description': r'contains:There’s free money on offer… if the players can all work together.', - 'upload_date': '20200323', - 'timestamp': 1584980400, - 'channel': 'Tom Scott Presents: Money', - 'uploader': 'Tom Scott Presents: Money', - }, - 'params': { - 'usenetrc': True, - }, - 'skip': 'All Nebula content requires authentication', - }, - { - 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', - 'only_matching': True, - }, - ] - _NETRC_MACHINE = 'watchnebula' - - _nebula_token = None - - def _retrieve_nebula_auth(self): - """ - Log in to Nebula, and returns a Nebula API token - """ - - username, password = self._get_login_info() - if not (username and password): - self.raise_login_required() - - self.report_login() - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = 
self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' - }, - note='Authenticating to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - self._set_cookie( - 'nebula.app', 'nebula-auth', - compat_urllib_parse_quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) - - return response['key'] - - def _retrieve_zype_api_key(self, page_url, display_id): - """ - Retrieves the Zype API key - """ - - # Find the js that has the API key from the webpage and download it - webpage = self._download_webpage(page_url, video_id=display_id) - main_script_relpath = self._search_regex( - r']*src="(?P[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, - group='script_relpath', name='script relative path', fatal=True) - main_script_abspath = urljoin(page_url, main_script_relpath) - main_script = self._download_webpage(main_script_abspath, video_id=display_id, - note='Retrieving Zype API key') - - api_key = self._search_regex( - r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P[\w-]*)"', main_script, - group='api_key', name='API key', fatal=True) - - return api_key - - def _call_zype_api(self, path, params, video_id, api_key, note): - """ - A helper for making calls to the Zype API. - """ - query = {'api_key': api_key, 'per_page': 1} - query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) - - def _call_nebula_api(self, path, video_id, access_token, note): - """ - A helper for making calls to the Nebula API. 
- """ - return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ - 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }, note=note) - - def _fetch_zype_access_token(self, video_id): - try: - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - except ExtractorError as exc: - # if 401, attempt credential auth and retry - if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401: - self._nebula_token = self._retrieve_nebula_auth() - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - else: - raise - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token - - def _extract_channel_title(self, video_meta): - # TODO: Implement the API calls giving us the channel list, - # so that we can do the title lookup and then figure out the channel URL - categories = video_meta.get('categories', []) if video_meta else [] - # the channel name is the value of the first category - for category in categories: - if category.get('value'): - return category['value'][0] - - def _real_initialize(self): - # check cookie jar for valid token - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) - self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken') - - # try to authenticate using credentials if no valid token has been found - if not self._nebula_token: - self._nebula_token = self._retrieve_nebula_auth() - - def _real_extract(self, url): - display_id = self._match_id(url) - api_key = self._retrieve_zype_api_key(url, display_id) - - response = self._call_zype_api('/videos', {'friendly_title': display_id}, - display_id, api_key, note='Retrieving metadata from Zype') - if len(response.get('response') or []) != 1: - raise ExtractorError('Unable to find video on Zype API') - video_meta = response['response'][0] - - video_id = video_meta['_id'] - zype_access_token = self._fetch_zype_access_token(display_id) - - channel_title = self._extract_channel_title(video_meta) - - return { - 'id': video_id, - 'display_id': display_id, - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token), - 'title': 
video_meta.get('title'), - 'description': video_meta.get('description'), - 'timestamp': parse_iso8601(video_meta.get('published_at')), - 'thumbnails': [{ - 'id': tn.get('name'), # this appears to be null - 'url': tn['url'], - 'width': tn.get('width'), - 'height': tn.get('height'), - } for tn in video_meta.get('thumbnails', [])], - 'duration': video_meta.get('duration'), - 'channel': channel_title, - 'uploader': channel_title, # we chose uploader = channel name - # TODO: uploader_url, channel_id, channel_url - } +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +import json +import time +import urllib + +from ..utils import ( + ExtractorError, + parse_iso8601, + try_get, +) +from .common import InfoExtractor + + +class NebulaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'watchnebula' + + _nebula_api_token = None + _nebula_bearer_token = None + _zype_access_token = None + + def _perform_nebula_auth(self): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required() + + data = json.dumps({'email': username, 'password': password}).encode('utf8') + response = self._download_json( + 'https://api.watchnebula.com/api/v1/auth/login/', + data=data, fatal=False, video_id=None, + headers={ + 'content-type': 'application/json', + # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint + 'cookie': '' + }, + note='Logging in to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or not response.get('key'): + self.raise_login_required() + + # save nebula token as cookie + self._set_cookie( + 'nebula.app', 'nebula-auth', + urllib.parse.quote( + json.dumps({ + "apiToken": response["key"], + "isLoggingIn": False, + "isLoggingOut": False, + }, separators=(",", ":"))), + expire_time=int(time.time()) + 86400 * 365, + ) + + return response['key'] + + def _retrieve_nebula_api_token(self): + """ + Check cookie jar for valid token. 
Try to authenticate using credentials if no valid token + can be found in the cookie jar. + """ + nebula_cookies = self._get_cookies('https://nebula.app') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + self.to_screen('Authenticating to Nebula with token from cookie jar') + nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) + nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + if nebula_api_token: + return nebula_api_token + + return self._perform_nebula_auth() + + def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): + assert method in ('GET', 'POST',) + assert auth_type in ('api', 'bearer',) + + def inner_call(): + authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' + return self._download_json( + url, video_id, note=note, headers={'Authorization': authorization}, + data=b'' if method == 'POST' else None) + + try: + return inner_call() + except ExtractorError as exc: + # if 401 or 403, attempt credential re-auth and retry + if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): + self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') + self._login() + return inner_call() + else: + raise + + def _fetch_nebula_bearer_token(self): + """ + Get a Bearer token for the Nebula API. This will be required to fetch video meta data. + """ + response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', + method='POST', + note='Authorizing to Nebula') + return response['token'] + + def _fetch_zype_access_token(self): + """ + Get a Zype access token, which is required to access video streams -- in our case: to + generate video URLs. 
+ """ + user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') + + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) + if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + # TODO: Reimplement the same Zype token polling the Nebula frontend implements + # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 + raise ExtractorError( + 'Unable to extract Zype access token from Nebula API authentication endpoint. ' + 'Open an arbitrary video in a browser with this account to generate a token', + expected=True) + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _build_video_info(self, episode): + zype_id = episode['zype_id'] + zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + channel_slug = episode['channel_slug'] + return { + 'id': episode['zype_id'], + 'display_id': episode['slug'], + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': zype_video_url, + 'title': episode['title'], + 'description': episode['description'], + 'timestamp': parse_iso8601(episode['published_at']), + 'thumbnails': [{ + # 'id': tn.get('name'), # this appears to be null + 'url': tn['original'], + 'height': key, + } for key, tn in episode['assets']['thumbnail'].items()], + 'duration': episode['duration'], + 'channel': episode['channel_title'], + 'channel_id': channel_slug, + 'channel_url': f'https://nebula.app/{channel_slug}', + 'uploader': episode['channel_title'], + 'uploader_id': channel_slug, + 'uploader_url': f'https://nebula.app/{channel_slug}', + 'series': episode['channel_title'], + 'creator': episode['channel_title'], + } + + def _login(self): + self._nebula_api_token = self._retrieve_nebula_api_token() + self._nebula_bearer_token = self._fetch_nebula_bearer_token() + 
self._zype_access_token = self._fetch_zype_access_token() + + def _real_initialize(self): + self._login() + + +class NebulaIE(NebulaBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', + 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'info_dict': { + 'id': '5c271b40b13fd613090034fd', + 'ext': 'mp4', + 'title': 'That Time Disney Remade Beauty and the Beast', + 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', + 'upload_date': '20180731', + 'timestamp': 1533009600, + 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', + 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', + }, + 'params': { + 'usenetrc': True, + }, + }, + { + 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'md5': '6d4edd14ce65720fa63aba5c583fb328', + 'info_dict': { + 'id': '5e7e78171aaf320001fbd6be', + 'ext': 'mp4', + 'title': 'Landing Craft - How The Allies Got Ashore', + 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', + 'upload_date': '20200327', + 'timestamp': 1585348140, + 'channel': 'Real Engineering', + 'channel_id': 'realengineering', + 'uploader': 'Real Engineering', + 'uploader_id': 'realengineering', + }, + 'params': { + 'usenetrc': True, + }, + }, + { + 'url': 'https://nebula.app/videos/money-episode-1-the-draw', + 'md5': '8c7d272910eea320f6f8e6d3084eecf5', + 'info_dict': { + 'id': '5e779ebdd157bc0001d1c75a', + 'ext': 'mp4', + 'title': 'Episode 1: The Draw', + 'description': r'contains:There’s free money on offer… if the players can all work together.', + 'upload_date': '20200323', + 'timestamp': 1584980400, + 'channel': 'Tom Scott Presents: Money', + 'channel_id': 
'tom-scott-presents-money', + 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', + }, + 'params': { + 'usenetrc': True, + }, + }, + { + 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', + 'only_matching': True, + }, + ] + + def _fetch_video_metadata(self, slug): + return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + video_id=slug, + auth_type='bearer', + note='Fetching video meta data') + + def _real_extract(self, url): + slug = self._match_id(url) + video = self._fetch_video_metadata(slug) + return self._build_video_info(video) + + +class NebulaCollectionIE(NebulaBaseIE): + IE_NAME = 'nebula:collection' + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', + }, + 'playlist_count': 5, + 'params': { + 'usenetrc': True, + }, + }, { + 'url': 'https://nebula.app/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 100, + 'params': { + 'usenetrc': True, + }, + }, + ] + + def _generate_playlist_entries(self, collection_id, channel): + episodes = channel['episodes']['results'] + for page_num in itertools.count(2): + for episode in episodes: + yield self._build_video_info(episode) + next_url = channel['episodes']['next'] + if not next_url: + break + channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', + note=f'Retrieving channel page {page_num}') + episodes = channel['episodes']['results'] + + def _real_extract(self, url): + collection_id = self._match_id(url) + channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' + 
channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') + channel_details = channel['details'] + + return self.playlist_result( + entries=self._generate_playlist_entries(collection_id, channel), + playlist_id=collection_id, + playlist_title=channel_details['title'], + playlist_description=channel_details['description'] + ) diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 41549a2f1e..1e1274ef05 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -1,15 +1,20 @@ +# coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from ..utils import ( + clean_html, extract_attributes, + get_element_by_id, int_or_none, parse_count, parse_duration, - parse_filesize, unified_timestamp, + OnDemandPagedList, + try_get, ) @@ -26,7 +31,8 @@ class NewgroundsIE(InfoExtractor): 'timestamp': 1378878540, 'upload_date': '20130911', 'duration': 143, - 'description': 'md5:6d885138814015dfd656c2ddb00dacfc', + 'view_count': int, + 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f', }, }, { 'url': 'https://www.newgrounds.com/portal/view/1', @@ -38,7 +44,9 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Brian-Beaton', 'timestamp': 955064100, 'upload_date': '20000406', + 'view_count': int, 'description': 'Scrotum plays "catch."', + 'age_limit': 17, }, }, { # source format unavailable, additional mp4 formats @@ -50,7 +58,9 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'ZONE-SAMA', 'timestamp': 1487965140, 'upload_date': '20170224', - 'description': 'ZTV News Episode 8 (February 2017)', + 'view_count': int, + 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6', + 'age_limit': 17, }, 'params': { 'skip_download': True, @@ -65,7 +75,9 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Egoraptor', 'timestamp': 1140663240, 'upload_date': '20060223', - 'description': 'Metal Gear is awesome is so is this movie.', + 
'view_count': int, + 'description': 'md5:9246c181614e23754571995104da92e0', + 'age_limit': 13, } }, { 'url': 'https://www.newgrounds.com/portal/view/297383/format/flash', @@ -74,12 +86,19 @@ class NewgroundsIE(InfoExtractor): 'id': '297383', 'ext': 'swf', 'title': 'Metal Gear Awesome', - 'description': 'Metal Gear is awesome is so is this movie.', + 'description': 'Metal Gear Awesome', 'uploader': 'Egoraptor', 'upload_date': '20060223', 'timestamp': 1140663240, + 'age_limit': 13, } }] + _AGE_LIMIT = { + 'e': 0, + 't': 13, + 'm': 17, + 'a': 18, + } def _real_extract(self, url): media_id = self._match_id(url) @@ -88,10 +107,10 @@ def _real_extract(self, url): webpage = self._download_webpage(url, media_id) title = self._html_search_regex( - r'([^>]+)', webpage, 'title') + r'(.+?)', webpage, 'title') media_url_string = self._search_regex( - r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False) + r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) if media_url_string: media_url = self._parse_json(media_url_string, media_id) @@ -124,24 +143,37 @@ def _real_extract(self, url): r'(?:Author|Writer)\s*]+>([^<]+)'), webpage, 'uploader', fatal=False) + age_limit = self._html_search_regex( + r']+>', webpage, 'age_limit', default='e') + age_limit = self._AGE_LIMIT.get(age_limit) + timestamp = unified_timestamp(self._html_search_regex( (r'
    \s*Uploaded\s*
    \s*
    ([^<]+
    \s*
    [^<]+)', r'
    \s*Uploaded\s*
    \s*
    ([^<]+)'), webpage, 'timestamp', default=None)) - duration = parse_duration(self._search_regex( - r'(?s)
    \s*Song\s*
    \s*
    .+?
    \s*
    ([^<]+)', webpage, + + duration = parse_duration(self._html_search_regex( + r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, 'duration', default=None)) - view_count = parse_count(self._html_search_regex(r'(?s)
    \s*Views\s*
    \s*
    ([\d\.,]+)
    ', webpage, - 'view_count', fatal=False, default=None)) + description = clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage) - filesize_approx = parse_filesize(self._html_search_regex( - r'(?s)
    \s*Song\s*
    \s*
    (.+?)
    ', webpage, 'filesize', + view_count = parse_count(self._html_search_regex( + r'(?s)
    \s*(?:Views|Listens)\s*
    \s*
    ([\d\.,]+)
    ', webpage, + 'view count', default=None)) + + filesize = int_or_none(self._html_search_regex( + r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize', default=None)) - if len(formats) == 1: - formats[0]['filesize_approx'] = filesize_approx - if '
    Song' in webpage: + video_type_description = self._html_search_regex( + r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'filesize', + default=None) + + if len(formats) == 1: + formats[0]['filesize'] = filesize + + if video_type_description == 'Audio File': formats[0]['vcodec'] = 'none' self._check_formats(formats, media_id) self._sort_formats(formats) @@ -154,12 +186,14 @@ def _real_extract(self, url): 'duration': duration, 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'description': description, + 'age_limit': age_limit, 'view_count': view_count, } class NewgroundsPlaylistIE(InfoExtractor): + IE_NAME = 'Newgrounds:playlist' _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.newgrounds.com/collection/cats', @@ -202,7 +236,57 @@ def _real_extract(self, url): continue entries.append( self.url_result( - 'https://www.newgrounds.com/%s' % path, + f'https://www.newgrounds.com/{path}', ie=NewgroundsIE.ie_key(), video_id=media_id)) return self.playlist_result(entries, playlist_id, title) + + +class NewgroundsUserIE(InfoExtractor): + IE_NAME = 'Newgrounds:user' + _VALID_URL = r'https?://(?P[^\.]+)\.newgrounds\.com/(?:movies|audio)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://burn7.newgrounds.com/audio', + 'info_dict': { + 'id': 'burn7', + }, + 'playlist_mincount': 150, + }, { + 'url': 'https://burn7.newgrounds.com/movies', + 'info_dict': { + 'id': 'burn7', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://brian-beaton.newgrounds.com/movies', + 'info_dict': { + 'id': 'brian-beaton', + }, + 'playlist_mincount': 10, + }] + _PAGE_SIZE = 30 + + def _fetch_page(self, channel_id, url, page): + page += 1 + posts_info = self._download_json( + f'{url}/page/{page}', channel_id, + note=f'Downloading page {page}', headers={ + 'Accept': 'application/json, text/javascript, */*; q = 0.01', + 'X-Requested-With': 
'XMLHttpRequest', + }) + sequence = posts_info.get('sequence', []) + for year in sequence: + posts = try_get(posts_info, lambda x: x['years'][str(year)]['items']) + for post in posts: + path, media_id = self._search_regex( + r']+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>', + post, 'url', group=(1, 2)) + yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, channel_id, url), self._PAGE_SIZE) + + return self.playlist_result(entries, channel_id) diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index 860d636e23..8aceebd492 100644 --- a/yt_dlp/extractor/nexx.py +++ b/yt_dlp/extractor/nexx.py @@ -385,8 +385,7 @@ def find_video(result): elif cdn == 'free': formats = self._extract_free_formats(video, video_id) else: - # TODO: reverse more cdns - assert False + self.raise_no_formats(f'{cdn} formats are currently not supported', video_id) self._sort_formats(formats) @@ -427,7 +426,6 @@ class NexxEmbedIE(InfoExtractor): 'upload_date': '20140305', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/nfl.py b/yt_dlp/extractor/nfl.py index 871923e4c6..821276a31b 100644 --- a/yt_dlp/extractor/nfl.py +++ b/yt_dlp/extractor/nfl.py @@ -89,7 +89,7 @@ def _parse_video_config(self, video_config, display_id): 'ext': determine_ext(image_url, 'jpg'), }] info.update({ - 'title': self._live_title(title) if is_live else title, + 'title': title, 'is_live': is_live, 'description': clean_html(item.get('description')), 'thumbnails': thumbnails, diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 950a3d0d4a..4998fed831 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -73,6 +73,7 @@ def get_clean_field(key): m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang + 
self._sort_formats(info['formats']) else: info.update({ '_type': 'url_transparent', diff --git a/yt_dlp/extractor/nick.py b/yt_dlp/extractor/nick.py index 084538d711..ba7da76026 100644 --- a/yt_dlp/extractor/nick.py +++ b/yt_dlp/extractor/nick.py @@ -67,6 +67,7 @@ class NickIE(MTVServicesInfoExtractor): 'description': 'md5:9d65a66df38e02254852794b2809d1cf', 'title': 'Blue\'s Imagination Station', }, + 'skip': 'Not accessible?' }] def _get_feed_query(self, uri): @@ -75,10 +76,6 @@ def _get_feed_query(self, uri): 'mgid': uri, } - def _extract_mgid(self, webpage): - mgid = self._search_regex(r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None) - return mgid - def _real_extract(self, url): domain, video_type, display_id = self._match_valid_url(url).groups() if video_type.startswith("episodes"): diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 2fa81b5c2e..ee888e9d35 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json import datetime +import itertools +import json +import re -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..compat import ( compat_str, @@ -661,6 +662,101 @@ def pagefunc(pagenum): } +class NicovideoSearchBaseIE(InfoExtractor): + def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'): + query = query or {} + pages = [query['page']] if 'page' in query else itertools.count(1) + for page_num in pages: + query['page'] = str(page_num) + webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num}) + results = re.findall(r'(?<=data-video-id=)["\']?(?P.*?)(?=["\'])', webpage) + for item in results: + yield self.url_result(f'http://www.nicovideo.jp/watch/{item}', 'Niconico', item) + if not results: + break + + def 
_search_results(self, query): + return self._entries( + self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query) + + +class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search' + IE_NAME = 'nicovideo:search' + _SEARCH_KEY = 'nicosearch' + + +class NicovideoSearchURLIE(NicovideoSearchBaseIE): + IE_NAME = f'{NicovideoSearchIE.IE_NAME}_url' + IE_DESC = 'Nico video search URLs' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P[^?#&]+)?' + _TESTS = [{ + 'url': 'http://www.nicovideo.jp/search/sm9', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_count': 31, + }] + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) + + +class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search, newest first' + IE_NAME = f'{NicovideoSearchIE.IE_NAME}:date' + _SEARCH_KEY = 'nicosearchdate' + _TESTS = [{ + 'url': 'nicosearchdateall:a', + 'info_dict': { + 'id': 'a', + 'title': 'a' + }, + 'playlist_mincount': 1610, + }] + + _START_DATE = datetime.date(2007, 1, 1) + _RESULTS_PER_PAGE = 32 + _MAX_PAGES = 50 + + def _entries(self, url, item_id, start_date=None, end_date=None): + start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + + # If the last page has a full page of videos, we need to break down the query interval further + last_page_len = len(list(self._get_entries_for_date( + url, item_id, start_date, end_date, self._MAX_PAGES, + note=f'Checking number of videos from {start_date} to {end_date}'))) + if (last_page_len == self._RESULTS_PER_PAGE and start_date != end_date): + midpoint = start_date + ((end_date - start_date) // 2) + yield from 
self._entries(url, item_id, midpoint, end_date) + yield from self._entries(url, item_id, start_date, midpoint) + else: + self.to_screen(f'{item_id}: Downloading results from {start_date} to {end_date}') + yield from self._get_entries_for_date( + url, item_id, start_date, end_date, note=' Downloading page %(page)s') + + def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_num=None, note=None): + query = { + 'start': str(start_date), + 'end': str(end_date or start_date), + 'sort': 'f', + 'order': 'd', + } + if page_num: + query['page'] = str(page_num) + + yield from super()._entries(url, item_id, query=query, note=note) + + class NiconicoUserIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P\d+)/?(?:$|[#?])' _TEST = { @@ -678,7 +774,7 @@ class NiconicoUserIE(InfoExtractor): 'X-Frontend-Version': '0' } - def _entries(self, list_id, ): + def _entries(self, list_id): total_count = 1 count = page_num = 0 while count < total_count: diff --git a/yt_dlp/extractor/ninecninemedia.py b/yt_dlp/extractor/ninecninemedia.py index 4aaf21a120..781842721b 100644 --- a/yt_dlp/extractor/ninecninemedia.py +++ b/yt_dlp/extractor/ninecninemedia.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -99,3 +98,37 @@ def _real_extract(self, url): } return info + + +class CPTwentyFourIE(InfoExtractor): + IE_NAME = 'cp24' + _GEO_COUNTRIES = ['CA'] + _VALID_URL = r'https?://(?:www\.)?cp24\.com/news/(?P[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.cp24.com/news/video-shows-atm-being-ripped-out-of-business-by-pickup-truck-driver-in-mississauga-1.5676877', + 'info_dict': { + 'id': '2328005', + 'ext': 'mp4', + 'title': 'WATCH: Truck rips ATM from Mississauga business', + 'description': 'md5:cf7498480885f080a754389a2b2f7073', + 'timestamp': 1637618377, + 'episode_number': None, + 'season': 'Season 0', + 'season_number': 0, + 'season_id': 57974, + 
'series': 'CTV News Toronto', + 'duration': 26.86, + 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg', + 'upload_date': '20211122', + }, + 'params': {'skip_download': True, 'format': 'bv'} + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + id, destination = self._search_regex( + r'getAuthStates\("(?P[^"]+)",\s?"(?P[^"]+)"\);', + webpage, 'video id and destination', group=('id', 'destination')) + return self.url_result(f'9c9media:{destination}:{id}', ie=NineCNineMediaIE.ie_key(), video_id=id) diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py index 0ee450cc5c..6043674ba1 100644 --- a/yt_dlp/extractor/ninenow.py +++ b/yt_dlp/extractor/ninenow.py @@ -8,6 +8,10 @@ int_or_none, float_or_none, smuggle_url, + str_or_none, + try_get, + unified_strdate, + unified_timestamp, ) @@ -37,6 +41,24 @@ class NineNowIE(InfoExtractor): # DRM protected 'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1', 'only_matching': True, + }, { + # episode of series + 'url': 'https://www.9now.com.au/lego-masters/season-3/episode-3', + 'info_dict': { + 'id': '6249614030001', + 'title': 'Episode 3', + 'ext': 'mp4', + 'season_number': 3, + 'episode_number': 3, + 'description': 'In the first elimination of the competition, teams will have 10 hours to build a world inside a snow globe.', + 'uploader_id': '4460760524001', + 'timestamp': 1619002200, + 'upload_date': '20210421', + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + 'params':{ + 'skip_download': True, + } }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s' @@ -59,26 +81,31 @@ def _real_extract(self, url): cache = page_data.get(kind, {}).get('%sCache' % kind, {}) if not cache: continue - common_data = (cache.get(current_key) or list(cache.values())[0])[kind] + common_data = 
{ + 'episode': (cache.get(current_key) or list(cache.values())[0])[kind], + 'season': (cache.get(current_key) or list(cache.values())[0]).get('season', None) + } break else: raise ExtractorError('Unable to find video data') - video_data = common_data['video'] - - brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId'] - video_id = compat_str(video_data.get('id') or brightcove_id) - - if not self.get_param('allow_unplayable_formats') and video_data.get('drm'): - self.report_drm(video_id) - - title = common_data['name'] + if not self.get_param('allow_unplayable_formats') and try_get(common_data, lambda x: x['episode']['video']['drm'], bool): + self.report_drm(display_id) + brightcove_id = try_get( + common_data, lambda x: x['episode']['video']['brightcoveId'], compat_str) or 'ref:%s' % common_data['episode']['video']['referenceId'] + video_id = str_or_none(try_get(common_data, lambda x: x['episode']['video']['id'])) or brightcove_id + title = try_get(common_data, lambda x: x['episode']['name'], compat_str) + season_number = try_get(common_data, lambda x: x['season']['seasonNumber'], int) + episode_number = try_get(common_data, lambda x: x['episode']['episodeNumber'], int) + timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], compat_str)) + release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], compat_str)) + thumbnails_data = try_get(common_data, lambda x: x['episode']['image']['sizes'], dict) or {} thumbnails = [{ 'id': thumbnail_id, 'url': thumbnail_url, - 'width': int_or_none(thumbnail_id[1:]) - } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()] + 'width': int_or_none(thumbnail_id[1:]), + } for thumbnail_id, thumbnail_url in thumbnails_data.items()] return { '_type': 'url_transparent', @@ -87,8 +114,12 @@ def _real_extract(self, url): {'geo_countries': self._GEO_COUNTRIES}), 'id': video_id, 'title': title, - 'description': 
common_data.get('description'), - 'duration': float_or_none(video_data.get('duration'), 1000), + 'description': try_get(common_data, lambda x: x['episode']['description'], compat_str), + 'duration': float_or_none(try_get(common_data, lambda x: x['episode']['video']['duration'], float), 1000), 'thumbnails': thumbnails, 'ie_key': 'BrightcoveNew', + 'season_number': season_number, + 'episode_number': episode_number, + 'timestamp': timestamp, + 'release_date': release_date, } diff --git a/yt_dlp/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py index 3639d142ff..89380d039c 100644 --- a/yt_dlp/extractor/njpwworld.py +++ b/yt_dlp/extractor/njpwworld.py @@ -77,13 +77,8 @@ def _real_extract(self, url): for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - formats.append({ - 'url': player_url, - 'format_id': kind, - 'ext': 'mp4', - 'protocol': 'm3u8', - 'quality': 2 if kind == 'high' else 1, - }) + formats += self._extract_m3u8_formats( + player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high')) self._sort_formats(formats) diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py index fdf604d2af..00a64f88d1 100644 --- a/yt_dlp/extractor/nova.py +++ b/yt_dlp/extractor/nova.py @@ -10,6 +10,7 @@ int_or_none, js_to_json, qualities, + traverse_obj, unified_strdate, url_or_none, ) @@ -17,30 +18,45 @@ class NovaEmbedIE(InfoExtractor): _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'ee009bafcc794541570edd44b71cbea3', 'info_dict': { 'id': '8o0n0r', - 'ext': 'mp4', 'title': '2180. 
díl', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2578, }, - } + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['DRM protected', 'Requested format is not available'], + }, { + 'url': 'https://media.cms.nova.cz/embed/KybpWYvcgOa', + 'info_dict': { + 'id': 'KybpWYvcgOa', + 'ext': 'mp4', + 'title': 'Borhyová oslavila 60? Soutěžící z pořadu odboural moderátora Ondřeje Sokola', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 114, + }, + 'params': {'skip_download': 'm3u8'}, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + has_drm = False duration = None formats = [] player = self._parse_json( self._search_regex( - r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', - webpage, 'player', default='{}'), video_id, fatal=False) + (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P{.*?})\s*\)(?:\s*\))?\s*,', + r'Player\.init\s*\([^,]+,(?P\s*\w+\s*\?)?\s*(?P{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), + webpage, 'player', default='{}', group='json'), video_id, fatal=False) if player: for format_id, format_list in player['tracks'].items(): if not isinstance(format_list, list): @@ -48,6 +64,10 @@ def _real_extract(self, url): for format_dict in format_list: if not isinstance(format_dict, dict): continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue format_url = url_or_none(format_dict.get('src')) format_type = format_dict.get('type') ext = determine_ext(format_url) @@ -104,6 +124,8 @@ def _real_extract(self, url): f['format_id'] = f_id formats.append(f) + if not formats and has_drm: + self.report_drm(video_id) self._sort_formats(formats) title = self._og_search_title( diff --git a/yt_dlp/extractor/novaplay.py b/yt_dlp/extractor/novaplay.py new file mode 100644 index 
0000000000..724986a060 --- /dev/null +++ b/yt_dlp/extractor/novaplay.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none, parse_duration, parse_iso8601 + + +class NovaPlayIE(InfoExtractor): + _VALID_URL = r'https://play.nova\.bg/video/.*/(?P\d+)' + _TESTS = [ + { + 'url': 'https://play.nova.bg/video/bratya/season-3/bratq-2021-10-08/548677', + 'md5': 'b1127a84e61bed1632b7c2ca9cbb4153', + 'info_dict': { + 'id': '548677', + 'ext': 'mp4', + 'title': 'Братя', + 'alt_title': 'bratya/season-3/bratq-2021-10-08', + 'duration': 1603.0, + 'timestamp': 1633724150, + 'upload_date': '20211008', + 'thumbnail': 'https://nbg-img.fite.tv/img/548677_460x260.jpg', + 'description': 'Сезон 3 Епизод 25' + }, + }, + { + 'url': 'https://play.nova.bg/video/igri-na-volqta/season-3/igri-na-volqta-2021-09-20-1/548227', + 'md5': '5fd61b8ecbe582fc021019d570965d58', + 'info_dict': { + 'id': '548227', + 'ext': 'mp4', + 'title': 'Игри на волята: България (20.09.2021) - част 1', + 'alt_title': 'gri-na-volqta/season-3/igri-na-volqta-2021-09-20-1', + 'duration': 4060.0, + 'timestamp': 1632167564, + 'upload_date': '20210920', + 'thumbnail': 'https://nbg-img.fite.tv/img/548227_460x260.jpg', + 'description': 'Сезон 3 Епизод 13' + }, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_props = self._parse_json(self._search_regex( + r'({.+})', + webpage, 'video_props'), video_id)['props']['pageProps']['video'] + m3u8_url = self._download_json( + f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams', + video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_props['title'], + 'alt_title': video_props.get('slug'), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': 
self._og_search_description(webpage), + 'formats': formats, + 'duration': parse_duration(video_props['duration']), + 'timestamp': parse_iso8601(video_props['published_at']), + 'view_count': int_or_none(video_props['view_count']), + } diff --git a/yt_dlp/extractor/npo.py b/yt_dlp/extractor/npo.py index ed547d04b3..a8aaef6f16 100644 --- a/yt_dlp/extractor/npo.py +++ b/yt_dlp/extractor/npo.py @@ -467,7 +467,7 @@ def add_format_url(format_url): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), @@ -561,7 +561,7 @@ def _real_extract(self, url): return { 'id': video_id, 'url': stream['url'], - 'title': self._live_title(title), + 'title': title, 'acodec': codec, 'ext': codec, 'is_live': True, diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py index 9d1122f0c7..49f062d7ab 100644 --- a/yt_dlp/extractor/npr.py +++ b/yt_dlp/extractor/npr.py @@ -91,7 +91,8 @@ def _real_extract(self, url): elif format_id == 'smil': smil_formats = self._extract_smil_formats( format_url, media_id, transform_source=lambda s: s.replace( - 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/')) + 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/'), + fatal=False) self._check_formats(smil_formats, media_id) formats.extend(smil_formats) else: diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py index b556bc6aa4..49d58a685b 100644 --- a/yt_dlp/extractor/nrk.py +++ b/yt_dlp/extractor/nrk.py @@ -147,7 +147,7 @@ class NRKIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url).split('/')[-1] - path_templ = 'playback/%s/' + video_id + path_templ = 'playback/%s/program/' + video_id def call_playback_api(item, query=None): return self._call_api(path_templ % item, video_id, item, query=query) @@ -188,7 +188,7 @@ def call_playback_api(item, 
query=None): title = titles['title'] alt_title = titles.get('subtitle') - description = preplay.get('description') + description = try_get(preplay, lambda x: x['description'].replace('\r', '\n')) duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) thumbnails = [] diff --git a/yt_dlp/extractor/nrl.py b/yt_dlp/extractor/nrl.py index 22a2df8d3f..0bd5086ae2 100644 --- a/yt_dlp/extractor/nrl.py +++ b/yt_dlp/extractor/nrl.py @@ -16,7 +16,6 @@ class NRLTVIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - 'format': 'bestvideo', }, } diff --git a/yt_dlp/extractor/ntvcojp.py b/yt_dlp/extractor/ntvcojp.py index 0c8221b223..c9af91188b 100644 --- a/yt_dlp/extractor/ntvcojp.py +++ b/yt_dlp/extractor/ntvcojp.py @@ -3,8 +3,9 @@ from .common import InfoExtractor from ..utils import ( - js_to_json, + ExtractorError, smuggle_url, + traverse_obj, ) @@ -19,7 +20,7 @@ class NTVCoJpCUIE(InfoExtractor): 'ext': 'mp4', 'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸', 'upload_date': '20181213', - 'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9', + 'description': 'md5:1985b51a9abc285df0104d982a325f2a', 'uploader_id': '3855502814001', 'timestamp': 1544669941, }, @@ -28,22 +29,30 @@ class NTVCoJpCUIE(InfoExtractor): 'skip_download': True, }, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_config = self._parse_json(self._search_regex( - r'(?s)PLAYER_CONFIG\s*=\s*({.+?})', - webpage, 'player config'), display_id, js_to_json) - video_id = player_config['videoId'] - account_id = player_config.get('account') or '3855502814001' + player_config = self._search_nuxt_data(webpage, display_id) + video_id = traverse_obj(player_config, ('movie', 'video_id')) + if not video_id: + raise ExtractorError('Failed to extract video ID for Brightcove') + account_id = 
traverse_obj(player_config, ('player', 'account')) or '3855502814001' + title = traverse_obj(player_config, ('movie', 'name')) + if not title: + og_title = self._og_search_title(webpage, fatal=False) or traverse_obj(player_config, ('player', 'title')) + if og_title: + title = og_title.split('(', 1)[0].strip() + description = (traverse_obj(player_config, ('movie', 'description')) + or self._html_search_meta(['description', 'og:description'], webpage)) return { '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'title': self._search_regex(r']+class="title"[^>]*>([^<]+)', webpage, 'title').strip(), - 'description': self._html_search_meta(['description', 'og:description'], webpage), + 'title': title, + 'description': description, 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}), 'ie_key': 'BrightcoveNew', } diff --git a/yt_dlp/extractor/nuvid.py b/yt_dlp/extractor/nuvid.py index ab6bfcd7f4..7487824f98 100644 --- a/yt_dlp/extractor/nuvid.py +++ b/yt_dlp/extractor/nuvid.py @@ -1,71 +1,73 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, + int_or_none, + try_get, ) class NuvidIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P[0-9]+)' - _TEST = { - 'url': 'http://m.nuvid.com/video/1310741/', - 'md5': 'eab207b7ac4fccfb4e23c86201f11277', + _TESTS = [{ + 'url': 'https://www.nuvid.com/video/6513023/italian-babe', + 'md5': '772d2f8288f3d3c5c45f7a41761c7844', 'info_dict': { - 'id': '1310741', + 'id': '6513023', 'ext': 'mp4', - 'title': 'Horny babes show their awesome bodeis and', - 'duration': 129, + 'title': 'italian babe', + 'duration': 321.0, 'age_limit': 18, } - } + }, { + 'url': 'https://m.nuvid.com/video/6523263', + 'info_dict': { + 'id': '6523263', + 'ext': 'mp4', + 'age_limit': 18, + 'title': 'Slut brunette college student anal dorm', + } + }] def _real_extract(self, url): video_id 
= self._match_id(url) - page_url = 'http://m.nuvid.com/video/%s' % video_id - webpage = self._download_webpage( - page_url, video_id, 'Downloading video page') - # When dwnld_speed exists and has a value larger than the MP4 file's - # bitrate, Nuvid returns the MP4 URL - # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm - self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') - mp4_webpage = self._download_webpage( - page_url, video_id, 'Downloading video page for MP4 format') + qualities = { + 'lq': '360p', + 'hq': '720p', + } - html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?]*)?\s+src=["\'](.*?)["\']', - video_url = self._html_search_regex(html5_video_re, webpage, video_id) - mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) - formats = [{ - 'url': video_url, - }] - if mp4_video_url != video_url: - formats.append({ - 'url': mp4_video_url, + json_url = f'https://www.nuvid.com/player_config_json/?vid={video_id}&aid=0&domain_id=0&embed=0&check_speed=0' + video_data = self._download_json( + json_url, video_id, headers={ + 'Accept': 'application/json, text/javascript, */*; q = 0.01', + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', }) - title = self._html_search_regex( - [r'', - r'
    \s*]*>([^<]+)', - r']+class="title_thumb">([^<]+)'], webpage, 'title').strip() - thumbnails = [ - { - 'url': thumb_url, - } for thumb_url in re.findall(r'', webpage) - ] - thumbnail = thumbnails[0]['url'] if thumbnails else None - duration = parse_duration(self._html_search_regex( - [r'\s*(\d{2}:\d{2})', - r']+class="view_time">([^<]+)'], webpage, 'duration', fatal=False)) + formats = [{ + 'url': source, + 'format_id': qualities.get(quality), + 'height': int_or_none(qualities.get(quality)[:-1]), + } for quality, source in video_data.get('files').items() if source] + + self._check_formats(formats, video_id) + self._sort_formats(formats) + + title = video_data.get('title') + thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url']) + thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension']) + thumbnail_id = self._search_regex( + r'/media/videos/tmb/6523263/preview/(/d+)' + thumbnail_extension, video_data.get('poster', ''), 'thumbnail id', default=19) + thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}' + duration = parse_duration(video_data.get('duration') or video_data.get('duration_format')) return { 'id': video_id, + 'formats': formats, 'title': title, - 'thumbnails': thumbnails, 'thumbnail': thumbnail, 'duration': duration, 'age_limit': 18, - 'formats': formats, } diff --git a/yt_dlp/extractor/nzherald.py b/yt_dlp/extractor/nzherald.py new file mode 100644 index 0000000000..e5601b4953 --- /dev/null +++ b/yt_dlp/extractor/nzherald.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..compat import compat_str +from ..utils import ( + ExtractorError, + traverse_obj +) + + +class NZHeraldIE(InfoExtractor): + IE_NAME = 'nzherald' + _VALID_URL = r'https?://(?:www\.)?nzherald\.co\.nz/[\w\/-]+\/(?P[A-Z0-9]+)' + _TESTS = [ + { + 'url': 
'https://www.nzherald.co.nz/nz/weather-heavy-rain-gales-across-nz-most-days-this-week/PTG7QWY4E2225YHZ5NAIRBTYTQ/', + 'info_dict': { + 'id': '6271084466001', + 'ext': 'mp4', + 'title': 'MetService severe weather warning: September 6th - 7th', + 'timestamp': 1630891576, + 'upload_date': '20210906', + 'uploader_id': '1308227299001', + 'description': 'md5:db6ca335a22e2cdf37ab9d2bcda52902' + } + + }, { + # Webpage has brightcove embed player url + 'url': 'https://www.nzherald.co.nz/travel/pencarrow-coastal-trail/HDVTPJEPP46HJ2UEMK4EGD2DFI/', + 'info_dict': { + 'id': '6261791733001', + 'ext': 'mp4', + 'title': 'Pencarrow Coastal Trail', + 'timestamp': 1625102897, + 'upload_date': '20210701', + 'uploader_id': '1308227299001', + 'description': 'md5:d361aaa0c6498f7ac1bc4fc0a0aec1e4' + } + + }, { + # two video embeds of the same video + 'url': 'https://www.nzherald.co.nz/nz/truck-driver-captured-cutting-off-motorist-on-state-highway-1-in-canterbury/FIHNJB7PLLPHWQPK4S7ZBDUC4I/', + 'info_dict': { + 'id': '6251114530001', + 'ext': 'mp4', + 'title': 'Truck travelling north from Rakaia runs car off road', + 'timestamp': 1619730509, + 'upload_date': '20210429', + 'uploader_id': '1308227299001', + 'description': 'md5:4cae7dfb7613ac4c73b9e73a75c6b5d7' + } + }, { + 'url': 'https://www.nzherald.co.nz/kahu/kaupapa-companies-my-taiao-supporting-maori-in-study-and-business/PQBO2J25WCG77VGRX7W7BVYEAI/', + 'only_matching': True + }, { + 'url': 'https://nzherald.co.nz/the-country/video/focus-nzs-first-mass-covid-19-vaccination-event/N5I7IL3BRFLZSD33TLDLYJDGK4/', + 'only_matching': True + }, { + 'url': 'https://www.nzherald.co.nz/the-vision-is-clear/news/tvic-damian-roper-planting-trees-an-addiction/AN2AAEPNRK5VLISDWQAJZB6ATQ', + 'only_matching': True + } + ] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1308227299001/S1BXZn8t_default/index.html?videoId=%s' + + def _extract_bc_embed_url(self, webpage): + """The initial webpage may include the brightcove player embed url""" + 
bc_url = BrightcoveNewIE._extract_url(self, webpage) + return bc_url or self._search_regex( + r'(?:embedUrl)\"\s*:\s*\"(?P%s)' % BrightcoveNewIE._VALID_URL, + webpage, 'embed url', default=None, group='embed_url') + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + bc_url = self._extract_bc_embed_url(webpage) + + if not bc_url: + fusion_metadata = self._parse_json( + self._search_regex(r'Fusion\.globalContent\s*=\s*({.+?})\s*;', webpage, 'fusion metadata'), article_id) + + video_metadata = fusion_metadata.get('video') + bc_video_id = traverse_obj( + video_metadata or fusion_metadata, # fusion metadata is the video metadata for video-only pages + 'brightcoveId', ('content_elements', ..., 'referent', 'id'), + get_all=False, expected_type=compat_str) + + if not bc_video_id: + if isinstance(video_metadata, dict) and len(video_metadata) == 0: + raise ExtractorError('This article does not have a video.', expected=True) + else: + raise ExtractorError('Failed to extract brightcove video id') + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_video_id + + return self.url_result(bc_url, 'BrightcoveNew') diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 9cacd38158..42f210a9b7 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -12,6 +12,7 @@ ) from ..utils import ( ExtractorError, + float_or_none, unified_strdate, int_or_none, qualities, @@ -96,6 +97,14 @@ class OdnoklassnikiIE(InfoExtractor): 'skip_download': True, }, 'skip': 'Video has not been found', + }, { + 'note': 'Only available in mobile webpage', + 'url': 'https://m.ok.ru/video/2361249957145', + 'info_dict': { + 'id': '2361249957145', + 'title': 'Быковское крещение', + 'duration': 3038.181, + }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -131,13 +140,24 @@ def _extract_url(webpage): return mobj.group('url') def 
_real_extract(self, url): + try: + return self._extract_desktop(url) + except ExtractorError as e: + try: + return self._extract_mobile(url) + except ExtractorError: + # error message of desktop webpage is in English + raise e + + def _extract_desktop(self, url): start_time = int_or_none(compat_parse_qs( compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) video_id = self._match_id(url) webpage = self._download_webpage( - 'http://ok.ru/video/%s' % video_id, video_id) + 'http://ok.ru/video/%s' % video_id, video_id, + note='Downloading desktop webpage') error = self._search_regex( r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', @@ -215,7 +235,7 @@ def _real_extract(self, url): assert title if provider == 'LIVE_TV_APP': - info['title'] = self._live_title(title) + info['title'] = title quality = qualities(('4', '0', '1', '2', '3', '5')) @@ -265,3 +285,32 @@ def _real_extract(self, url): info['formats'] = formats return info + + def _extract_mobile(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://m.ok.ru/video/%s' % video_id, video_id, + note='Downloading mobile webpage') + + error = self._search_regex( + r'видео\s*(.+?)
    ', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + json_data = self._search_regex( + r'data-video="(.+?)"', webpage, 'json data') + json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + + return { + 'id': video_id, + 'title': json_data.get('videoName'), + 'duration': float_or_none(json_data.get('videoDuration'), scale=1000), + 'thumbnail': json_data.get('videoPosterSrc'), + 'formats': [{ + 'format_id': 'mobile', + 'url': json_data.get('videoSrc'), + 'ext': 'mp4', + }] + } diff --git a/yt_dlp/extractor/oktoberfesttv.py b/yt_dlp/extractor/oktoberfesttv.py index a914068f95..276567436b 100644 --- a/yt_dlp/extractor/oktoberfesttv.py +++ b/yt_dlp/extractor/oktoberfesttv.py @@ -25,8 +25,8 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._live_title(self._html_search_regex( - r'

    .*?(.*?)

    ', webpage, 'title')) + title = self._html_search_regex( + r'

    .*?(.*?)

    ', webpage, 'title') clip = self._search_regex( r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py new file mode 100644 index 0000000000..784f282c7b --- /dev/null +++ b/yt_dlp/extractor/olympics.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get +) + + +class OlympicsReplayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P[^/#&?]+)' + _TESTS = [{ + 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', + 'info_dict': { + 'id': 'f6a0753c-8e6f-4b7d-a435-027054a4f8e9', + 'ext': 'mp4', + 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', + 'upload_date': '20210801', + 'timestamp': 1627783200, + 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', + 'uploader': 'International Olympic Committee', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', + 'only_matching': True, + }] + + def _real_extract(self, url): + id = self._match_id(url) + + webpage = self._download_webpage(url, id) + title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) + uuid = self._html_search_meta('episode_uid', webpage) + m3u8_url = self._html_search_meta('video_url', webpage) + json_ld = self._search_json_ld(webpage, uuid) + thumbnails_list = json_ld.get('image') + if not thumbnails_list: + thumbnails_list = self._html_search_regex( + r'["\']image["\']:\s*["\']([^"\']+)["\']', webpage, 'images', default='') + thumbnails_list = thumbnails_list.replace('[', '').replace(']', '').split(',') + thumbnails_list = [thumbnail.strip() for thumbnail in thumbnails_list] + thumbnails = [] + for thumbnail in thumbnails_list: + width_a, height_a, width = 
self._search_regex( + r'/images/image/private/t_(?P\d+)-(?P\d+)_(?P\d+)/primary/[\W\w\d]+', + thumbnail, 'thumb', group=(1, 2, 3), default=(None, None, None)) + width_a, height_a, width = int_or_none(width_a), int_or_none(height_a), int_or_none(width) + thumbnails.append({ + 'url': thumbnail, + 'width': width, + 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)) + }) + m3u8_url = self._download_json( + f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': uuid, + 'title': title, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + **json_ld + } diff --git a/yt_dlp/extractor/on24.py b/yt_dlp/extractor/on24.py new file mode 100644 index 0000000000..d4d824430f --- /dev/null +++ b/yt_dlp/extractor/on24.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + strip_or_none, + try_get, + urljoin, +) + + +class On24IE(InfoExtractor): + IE_NAME = 'on24' + IE_DESC = 'ON24' + + _VALID_URL = r'''(?x) + https?://event\.on24\.com/(?: + wcc/r/(?P\d{7})/(?P[0-9A-F]{32})| + eventRegistration/(?:console/EventConsoleApollo|EventLobbyServlet\?target=lobby30) + \.jsp\?(?:[^/#?]*&)?eventid=(?P\d{7})[^/#?]*&key=(?P[0-9A-F]{32}) + )''' + + _TESTS = [{ + 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?uimode=nextgeneration&eventid=2197467&sessionid=1&key=5DF57BE53237F36A43B478DD36277A84&contenttype=A&eventuserid=305999&playerwidth=1000&playerheight=650&caller=previewLobby&text_language_id=en&format=fhaudio&newConsole=false', + 'info_dict': { + 'id': '2197467', + 'ext': 'wav', + 'title': 'Pearson Test of English General/Pearson English International Certificate Teacher Training Guide', + 'upload_date': '20200219', + 'timestamp': 
1582149600.0, + 'view_count': int, + } + }, { + 'url': 'https://event.on24.com/wcc/r/2639291/82829018E813065A122363877975752E?mode=login&email=johnsmith@gmail.com', + 'only_matching': True, + }, { + 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?&eventid=2639291&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=82829018E813065A122363877975752E&newConsole=true&nxChe=true&newTabCon=true&text_language_id=en&playerwidth=748&playerheight=526&eventuserid=338788762&contenttype=A&mediametricsessionid=384764716&mediametricid=3558192&usercd=369267058&mode=launch', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + event_id = mobj.group('id_1') or mobj.group('id_2') + event_key = mobj.group('key_1') or mobj.group('key_2') + + event_data = self._download_json( + 'https://event.on24.com/apic/utilApp/EventConsoleCachedServlet', + event_id, query={ + 'eventId': event_id, + 'displayProfile': 'player', + 'key': event_key, + 'contentType': 'A' + }) + event_id = str(try_get(event_data, lambda x: x['presentationLogInfo']['eventid'])) or event_id + language = event_data.get('localelanguagecode') + + formats = [] + for media in event_data.get('mediaUrlInfo', []): + media_url = urljoin('https://event.on24.com/media/news/corporatevideo/events/', str(media.get('url'))) + if not media_url: + continue + media_type = media.get('code') + if media_type == 'fhvideo1': + formats.append({ + 'format_id': 'video', + 'url': media_url, + 'language': language, + 'ext': 'mp4', + 'vcodec': 'avc1.640020', + 'acodec': 'mp4a.40.2', + }) + elif media_type == 'audio': + formats.append({ + 'format_id': 'audio', + 'url': media_url, + 'language': language, + 'ext': 'wav', + 'vcodec': 'none', + 'acodec': 'wav' + }) + self._sort_formats(formats) + + return { + 'id': event_id, + 'title': strip_or_none(event_data.get('description')), + 'timestamp': int_or_none(try_get(event_data, 
lambda x: x['session']['startdate']), 1000), + 'webpage_url': f'https://event.on24.com/wcc/r/{event_id}/{event_key}', + 'view_count': event_data.get('registrantcount'), + 'formats': formats, + } diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index cc3c587bc4..e933ea2cc8 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -71,8 +73,8 @@ def _real_extract(self, url): jw_config = self._parse_json( self._search_regex( - r'(?s)odkPlayer\.init.*?(?P{[^;]+}).*?;', - webpage, 'jw config', group='options'), + r'playlist\s*=\s*\[(?P.+)];?$', + webpage, 'jw config', flags=re.MULTILINE, group='options'), video_id, transform_source=js_to_json) info = self._parse_jwplayer_data( jw_config, video_id, require_title=False, m3u8_id='hls', diff --git a/yt_dlp/extractor/onefootball.py b/yt_dlp/extractor/onefootball.py new file mode 100644 index 0000000000..826faadd2e --- /dev/null +++ b/yt_dlp/extractor/onefootball.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OneFootballIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?onefootball\.com/[a-z]{2}/video/[^/&?#]+-(?P\d+)' + + _TESTS = [{ + 'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334', + 'info_dict': { + 'id': '34012334', + 'ext': 'mp4', + 'title': 'Highlights: FC Zürich 3-3 FC Basel', + 'description': 'md5:33d9855cb790702c4fe42a513700aba8', + 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334', + 'timestamp': 1635874604, + 'upload_date': '20211102' + }, + 'params': {'skip_download': True} + }, { + 'url': 
'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020', + 'info_dict': { + 'id': '34041020', + 'ext': 'mp4', + 'title': 'Klopp fumes at VAR decisions in West Ham defeat', + 'description': 'md5:9c50371095a01ad3f63311c73d8f51a5', + 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020', + 'timestamp': 1636314103, + 'upload_date': '20211107' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._search_json_ld(webpage, id) + m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('description'), + 'thumbnail': data_json.get('thumbnail'), + 'timestamp': data_json.get('timestamp'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/opencast.py b/yt_dlp/extractor/opencast.py new file mode 100644 index 0000000000..cf8d917176 --- /dev/null +++ b/yt_dlp/extractor/opencast.py @@ -0,0 +1,177 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_iso8601, + traverse_obj, + variadic, +) + + +class OpencastBaseIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + opencast\.informatik\.kit\.edu| + electures\.uni-muenster\.de| + oc-presentation\.ltcc\.tuwien\.ac\.at| + medien\.ph-noe\.ac\.at| + oc-video\.ruhr-uni-bochum\.de| + oc-video1\.ruhr-uni-bochum\.de| + opencast\.informatik\.uni-goettingen\.de| + heicast\.uni-heidelberg\.de| + opencast\.hawk\.de:8080| + opencast\.hs-osnabrueck\.de| + video[0-9]+\.virtuos\.uni-osnabrueck\.de| + 
opencast\.uni-koeln\.de| + media\.opencast\.hochschule-rhein-waal\.de| + matterhorn\.dce\.harvard\.edu| + hs-harz\.opencast\.uni-halle\.de| + videocampus\.urz\.uni-leipzig\.de| + media\.uct\.ac\.za| + vid\.igb\.illinois\.edu| + cursosabertos\.c3sl\.ufpr\.br| + mcmedia\.missioncollege\.org| + clases\.odon\.edu\.uy + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + + def _call_api(self, host, video_id, **kwargs): + return self._download_json(self._API_BASE % (host, video_id), video_id, **kwargs) + + def _parse_mediapackage(self, video): + video_id = video.get('id') + if video_id is None: + raise ExtractorError('Video id was not found') + + formats = [] + for track in variadic(traverse_obj(video, ('media', 'track')) or []): + href = track.get('url') + if href is None: + continue + ext = determine_ext(href, None) + + transport = track.get('transport') + + if transport == 'DASH' or ext == 'mpd': + formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False)) + elif transport == 'HLS' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats_and_subtitles( + href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False)) + elif transport == 'HDS' or ext == 'f4m': + formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False)) + elif transport == 'SMOOTH': + formats.extend(self._extract_ism_formats(href, video_id, ism_id='smooth', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats(href, video_id, fatal=False)) + else: + track_obj = { + 'url': href, + 'ext': ext, + 'format_note': track.get('transport'), + 'resolution': traverse_obj(track, ('video', 'resolution')), + 'fps': int_or_none(traverse_obj(track, ('video', 'framerate'))), + 'vbr': int_or_none(traverse_obj(track, ('video', 'bitrate')), scale=1000), + 'vcodec': traverse_obj(track, ('video', 'encoder', 'type')) if track.get('video') else 'none', + 'abr': 
int_or_none(traverse_obj(track, ('audio', 'bitrate')), scale=1000), + 'asr': int_or_none(traverse_obj(track, ('audio', 'samplingrate'))), + 'acodec': traverse_obj(track, ('audio', 'encoder', 'type')) if track.get('audio') else 'none', + } + + if transport == 'RTMP': + m_obj = re.search(r'(?:rtmp://[^/]+/(?P[^/]+))/(?P.+):(?P.+)', href) + if not m_obj: + continue + track_obj.update({ + 'app': m_obj.group('app'), + 'ext': m_obj.group('ext'), + 'play_path': m_obj.group('ext') + ':' + m_obj.group('playpath'), + 'rtmp_live': True, + 'preference': -2, + }) + formats.append(track_obj) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': video.get('title'), + 'series': video.get('seriestitle'), + 'season_id': video.get('series'), + 'creator': traverse_obj(video, ('creators', 'creator')), + 'timestamp': parse_iso8601(video.get('start')), + 'thumbnail': traverse_obj(video, ('attachments', 'attachment', ..., 'url'), get_all=False), + } + + +class OpencastIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P%s)/paella/ui/watch.html\?.*? + id=(?P%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?id=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/paella/ui/watch.html?id=ed063cd5-72c8-46b5-a60a-569243edcea8', + 'md5': '554c8e99a90f7be7e874619fcf2a3bc9', + 'info_dict': { + 'id': 'ed063cd5-72c8-46b5-a60a-569243edcea8', + 'ext': 'mp4', + 'title': '11 - Kryptographie - 24.11.2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1606208400, + 'upload_date': '20201124', + }, + } + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + return self._parse_mediapackage( + self._call_api(host, video_id)['search-results']['result']['mediapackage']) + + +class OpencastPlaylistIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P%s)/engage/ui/index.html\?.*? 
+ epFrom=(?P%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?sid=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/engage/ui/index.html?epFrom=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'info_dict': { + 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'title': 'Kryptographie - WiSe 15/16', + }, + 'playlist_mincount': 28, + }, + { + 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a', + 'info_dict': { + 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a', + 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective', + }, + 'playlist_mincount': 6, + }, + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + + entries = [ + self._parse_mediapackage(episode['mediapackage']) + for episode in variadic(self._call_api(host, video_id)['search-results']['result']) + if episode.get('mediapackage') + ] + + return self.playlist_result(entries, video_id, traverse_obj(entries, (0, 'series'))) diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index dfdd0e526e..6ec54509b6 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -17,7 +17,7 @@ get_exe_version, is_outdated_version, std_headers, - process_communicate_or_kill, + Popen, ) @@ -223,11 +223,10 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w else: self.extractor.to_screen('%s: %s' % (video_id, note2)) - p = subprocess.Popen([ - self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = process_communicate_or_kill(p) + p = Popen( + [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate_or_kill() if p.returncode != 0: raise ExtractorError( 'Executing JS failed\n:' + 
encodeArgument(err)) diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index 428ec97e4f..e2b7038805 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -11,6 +11,7 @@ float_or_none, HEADRequest, int_or_none, + join_nonempty, orderedSet, remove_end, str_or_none, @@ -82,12 +83,7 @@ def _real_extract(self, url): src = url_or_none(fd.get('src')) if not src: continue - format_id_list = [] - for key in ('delivery', 'quality', 'quality_string'): - value = fd.get(key) - if value: - format_id_list.append(value) - format_id = '-'.join(format_id_list) + format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd) ext = determine_ext(src) if ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( diff --git a/yt_dlp/extractor/palcomp3.py b/yt_dlp/extractor/palcomp3.py index 269e67a57e..d0a62fb17e 100644 --- a/yt_dlp/extractor/palcomp3.py +++ b/yt_dlp/extractor/palcomp3.py @@ -108,7 +108,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE): } name''' - @ classmethod + @classmethod def suitable(cls, url): return False if PalcoMP3IE._match_valid_url(url) else super(PalcoMP3ArtistIE, cls).suitable(url) diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py index 338b84d5b8..17138985ae 100644 --- a/yt_dlp/extractor/paramountplus.py +++ b/yt_dlp/extractor/paramountplus.py @@ -60,7 +60,6 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this }, { @@ -76,7 +75,6 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], }, { diff --git a/yt_dlp/extractor/parliamentliveuk.py b/yt_dlp/extractor/parliamentliveuk.py index bdd5ff5654..974d65482f 100644 --- a/yt_dlp/extractor/parliamentliveuk.py +++ b/yt_dlp/extractor/parliamentliveuk.py @@ -1,6 +1,14 @@ +# coding: utf-8 from __future__ import 
unicode_literals +import json +import uuid + from .common import InfoExtractor +from ..utils import ( + unified_timestamp, + try_get, +) class ParliamentLiveUKIE(InfoExtractor): @@ -11,12 +19,11 @@ class ParliamentLiveUKIE(InfoExtractor): _TESTS = [{ 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'info_dict': { - 'id': '1_af9nv9ym', + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'ext': 'mp4', 'title': 'Home Affairs Committee', - 'uploader_id': 'FFMPEG-01', - 'timestamp': 1422696664, - 'upload_date': '20150131', + 'timestamp': 1395153872, + 'upload_date': '20140318', }, }, { 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', @@ -25,19 +32,49 @@ class ParliamentLiveUKIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) - widget_config = self._parse_json(self._search_regex( - r'(?s)kWidgetConfig\s*=\s*({.+});', - webpage, 'kaltura widget config'), video_id) - kaltura_url = 'kaltura:%s:%s' % ( - widget_config['wid'][1:], widget_config['entry_id']) - event_title = self._download_json( - 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] + video_info = self._download_json(f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id) + _DEVICE_ID = str(uuid.uuid4()) + auth = 'Bearer ' + self._download_json( + 'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/auth/anonymous', + video_id, headers={ + 'Origin': 'https://videoplayback.parliamentlive.tv', + 'Accept': 'application/json, text/plain, */*', + 'Content-Type': 'application/json;charset=utf-8' + }, data=json.dumps({ + 'deviceId': _DEVICE_ID, + 'device': { + 'deviceId': _DEVICE_ID, + 'width': 653, + 'height': 368, + 'type': 'WEB', + 'name': ' Mozilla Firefox 91' + } + }).encode('utf-8'))['sessionToken'] + + video_urls = 
self._download_json( + f'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/entitlement/{video_id}/play', + video_id, headers={'Authorization': auth, 'Accept': 'application/json, text/plain, */*'})['formats'] + + formats = [] + for format in video_urls: + if not format.get('mediaLocator'): + continue + if format.get('format') == 'DASH': + formats.extend(self._extract_mpd_formats( + format['mediaLocator'], video_id, mpd_id='dash', fatal=False)) + elif format.get('format') == 'SMOOTHSTREAMING': + formats.extend(self._extract_ism_formats( + format['mediaLocator'], video_id, ism_id='ism', fatal=False)) + elif format.get('format') == 'HLS': + formats.extend(self._extract_m3u8_formats( + format['mediaLocator'], video_id, m3u8_id='hls', fatal=False)) + + self._sort_formats(formats) + return { - '_type': 'url_transparent', - 'title': event_title, - 'description': '', - 'url': kaltura_url, - 'ie_key': 'Kaltura', + 'id': video_id, + 'formats': formats, + 'title': video_info['event']['title'], + 'timestamp': unified_timestamp(try_get(video_info, lambda x: x['event']['publishedStartTime'])), + 'thumbnail': video_info.get('thumbnailUrl'), } diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 7bd892fa56..d3ee071e0f 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools + from .common import InfoExtractor from .vimeo import VimeoIE @@ -14,7 +16,7 @@ parse_iso8601, str_or_none, try_get, - url_or_none + url_or_none, ) @@ -159,7 +161,7 @@ def _real_extract(self, url): if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': embed_html = try_get(attributes, lambda x: x['embed']['html']) v_url = url_or_none(compat_urllib_parse_unquote( - self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False))) + 
self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) if v_url: info.update({ '_type': 'url_transparent', @@ -185,3 +187,56 @@ def _real_extract(self, url): }) return info + + +class PatreonUserIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?P[-\w]+)' + + _TESTS = [{ + 'url': 'https://www.patreon.com/dissonancepod/', + 'info_dict': { + 'title': 'dissonancepod', + }, + 'playlist_mincount': 68, + 'expected_warnings': 'Post not viewable by current user! Skipping!', + }, { + 'url': 'https://www.patreon.com/dissonancepod/posts', + 'only_matching': True + }, ] + + @classmethod + def suitable(cls, url): + return False if PatreonIE.suitable(url) else super(PatreonUserIE, cls).suitable(url) + + def _entries(self, campaign_id, user_id): + cursor = None + params = { + 'fields[campaign]': 'show_audio_post_download_links,name,url', + 'fields[post]': 'current_user_can_view,embed,image,is_paid,post_file,published_at,patreon_url,url,post_type,thumbnail_url,title', + 'filter[campaign_id]': campaign_id, + 'filter[is_draft]': 'false', + 'sort': '-published_at', + 'json-api-version': 1.0, + 'json-api-use-default-includes': 'false', + } + + for page in itertools.count(1): + + params.update({'page[cursor]': cursor} if cursor else {}) + posts_json = self._download_json('https://www.patreon.com/api/posts', user_id, note='Downloading posts page %d' % page, query=params, headers={'Cookie': '.'}) + + cursor = try_get(posts_json, lambda x: x['meta']['pagination']['cursors']['next']) + + for post in posts_json.get('data') or []: + yield self.url_result(url_or_none(try_get(post, lambda x: x['attributes']['patreon_url'])), 'Patreon') + + if cursor is None: + break + + def _real_extract(self, url): + + user_id = self._match_id(url) + webpage = self._download_webpage(url, user_id, headers={'Cookie': '.'}) + campaign_id = self._search_regex(r'https://www.patreon.com/api/campaigns/(\d+)/?', 
webpage, 'Campaign ID') + return self.playlist_result(self._entries(campaign_id, user_id), playlist_title=user_id) diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index d68855d62d..ffaa6bf929 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -193,7 +193,7 @@ class PBSIE(InfoExtractor): # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player - (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P[^/]+)/ + (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P[^/]+) ) ''' % '|'.join(list(zip(*_STATIONS))[0]) @@ -600,6 +600,7 @@ def extract_redirect_urls(info): formats = [] http_url = None + hls_subs = {} for num, redirect in enumerate(redirects): redirect_id = redirect.get('eeid') @@ -622,8 +623,9 @@ def extract_redirect_urls(info): continue if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) + hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles( + format_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_formats) else: formats.append({ 'url': format_url, @@ -666,25 +668,12 @@ def extract_redirect_urls(info): age_limit = US_RATINGS.get(rating_str) subtitles = {} - closed_captions_url = info.get('closed_captions_url') - if closed_captions_url: - subtitles['en'] = [{ - 'ext': 'ttml', - 'url': closed_captions_url, - }] - mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url) - if mobj: - ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1) - ttml_caption_id = int(ttml_caption_id) - subtitles['en'].extend([{ - 'url': closed_captions_url.replace( - ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)), - 'ext': 'srt', - }, { - 'url': closed_captions_url.replace( - ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)), - 'ext': 'vtt', - }]) + captions = info.get('cc') or {} + for caption_url in 
captions.values(): + subtitles.setdefault('en', []).append({ + 'url': caption_url + }) + subtitles = self._merge_subtitles(subtitles, hls_subs) # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc) # Try turning it to 'program - title' naming scheme if possible diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py index fb9fbb2e55..1e22f24e3f 100644 --- a/yt_dlp/extractor/peertube.py +++ b/yt_dlp/extractor/peertube.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor @@ -13,12 +14,644 @@ unified_timestamp, url_or_none, urljoin, + OnDemandPagedList, ) class PeerTubeIE(InfoExtractor): _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances + 40two\.tube| + a\.metube\.ch| + advtv\.ml| + algorithmic\.tv| + alimulama\.com| + arcana\.fun| + archive\.vidicon\.org| + artefac-paris\.tv| + auf1\.eu| + battlepenguin\.video| + beertube\.epgn\.ch| + befree\.nohost\.me| + bideoak\.argia\.eus| + birkeundnymphe\.de| + bitcointv\.com| + cattube\.org| + clap\.nerv-project\.eu| + climatejustice\.video| + comf\.tube| + conspiracydistillery\.com| + darkvapor\.nohost\.me| + daschauher\.aksel\.rocks| + digitalcourage\.video| + dreiecksnebel\.alex-detsch\.de| + eduvid\.org| + evangelisch\.video| + exo\.tube| + fair\.tube| + fediverse\.tv| + film\.k-prod\.fr| + flim\.txmn\.tk| + fotogramas\.politicaconciencia\.org| + ftsi\.ru| + gary\.vger\.cloud| + graeber\.video| + greatview\.video| + grypstube\.uni-greifswald\.de| + highvoltage\.tv| + hpstube\.fr| + htp\.live| + hyperreal\.tube| + juggling\.digital| + kino\.kompot\.si| + kino\.schuerz\.at| + kinowolnosc\.pl| + kirche\.peertube-host\.de| + kodcast\.com| + kolektiva\.media| + kraut\.zone| + kumi\.tube| + lastbreach\.tv| + lepetitmayennais\.fr\.nf| + lexx\.impa\.me| + libertynode\.tv| + libra\.syntazia\.org| + libremedia\.video| + live\.libratoi\.org| + live\.nanao\.moe| + 
live\.toobnix\.org| + livegram\.net| + lolitube\.freedomchan\.moe| + lucarne\.balsamine\.be| + maindreieck-tv\.de| + mani\.tube| + manicphase\.me| + media\.gzevd\.de| + media\.inno3\.cricket| + media\.kaitaia\.life| + media\.krashboyz\.org| + media\.over-world\.org| + media\.skewed\.de| + media\.undeadnetwork\.de| + medias\.pingbase\.net| + melsungen\.peertube-host\.de| + mirametube\.fr| + mojotube\.net| + monplaisirtube\.ddns\.net| + mountaintown\.video| + my\.bunny\.cafe| + myfreetube\.de| + mytube\.kn-cloud\.de| + mytube\.madzel\.de| + myworkoutarenapeertube\.cf| + nanawel-peertube\.dyndns\.org| + nastub\.cz| + offenes\.tv| + orgdup\.media| + ovaltube\.codinglab\.ch| + p2ptv\.ru| + p\.eertu\.be| + p\.lu| + peer\.azurs\.fr| + peertube1\.zeteo\.me| + peertube\.020\.pl| + peertube\.0x5e\.eu| + peertube\.alpharius\.io| + peertube\.am-networks\.fr| + peertube\.anduin\.net| + peertube\.anzui\.dev| + peertube\.arbleizez\.bzh| + peertube\.art3mis\.de| + peertube\.atilla\.org| + peertube\.atsuchan\.page| + peertube\.aukfood\.net| + peertube\.aventer\.biz| + peertube\.b38\.rural-it\.org| + peertube\.beeldengeluid\.nl| + peertube\.be| + peertube\.bgzashtita\.es| + peertube\.bitsandlinux\.com| + peertube\.biz| + peertube\.boba\.best| + peertube\.br0\.fr| + peertube\.bridaahost\.ynh\.fr| + peertube\.bubbletea\.dev| + peertube\.bubuit\.net| + peertube\.cabaal\.net| + peertube\.cats-home\.net| + peertube\.chemnitz\.freifunk\.net| + peertube\.chevro\.fr| + peertube\.chrisspiegl\.com| + peertube\.chtisurel\.net| + peertube\.cipherbliss\.com| + peertube\.cloud\.sans\.pub| + peertube\.cpge-brizeux\.fr| + peertube\.ctseuro\.com| + peertube\.cuatrolibertades\.org| + peertube\.cybercirujas\.club| + peertube\.cythin\.com| + peertube\.davigge\.com| + peertube\.dc\.pini\.fr| + peertube\.debian\.social| + peertube\.demonix\.fr| + peertube\.designersethiques\.org| + peertube\.desmu\.fr| + peertube\.devloprog\.org| + peertube\.devol\.it| + peertube\.dtmf\.ca| + peertube\.ecologie\.bzh| + 
peertube\.eu\.org| + peertube\.european-pirates\.eu| + peertube\.euskarabildua\.eus| + peertube\.fenarinarsa\.com| + peertube\.fomin\.site| + peertube\.forsud\.be| + peertube\.francoispelletier\.org| + peertube\.freenet\.ru| + peertube\.freetalklive\.com| + peertube\.functional\.cafe| + peertube\.gardeludwig\.fr| + peertube\.gargantia\.fr| + peertube\.gcfamily\.fr| + peertube\.genma\.fr| + peertube\.get-racing\.de| + peertube\.gidikroon\.eu| + peertube\.gruezishop\.ch| + peertube\.habets\.house| + peertube\.hackerfraternity\.org| + peertube\.ichigo\.everydayimshuflin\.com| + peertube\.ignifi\.me| + peertube\.inapurna\.org| + peertube\.informaction\.info| + peertube\.interhop\.org| + peertube\.iselfhost\.com| + peertube\.it| + peertube\.jensdiemer\.de| + peertube\.joffreyverd\.fr| + peertube\.kalua\.im| + peertube\.kathryl\.fr| + peertube\.keazilla\.net| + peertube\.klaewyss\.fr| + peertube\.kodcast\.com| + peertube\.kx\.studio| + peertube\.lagvoid\.com| + peertube\.lavallee\.tech| + peertube\.le5emeaxe\.fr| + peertube\.lestutosdeprocessus\.fr| + peertube\.librenet\.co\.za| + peertube\.logilab\.fr| + peertube\.louisematic\.site| + peertube\.luckow\.org| + peertube\.luga\.at| + peertube\.lyceeconnecte\.fr| + peertube\.manalejandro\.com| + peertube\.marud\.fr| + peertube\.mattone\.net| + peertube\.maxweiss\.io| + peertube\.monlycee\.net| + peertube\.mxinfo\.fr| + peertube\.myrasp\.eu| + peertube\.nebelcloud\.de| + peertube\.netzbegruenung\.de| + peertube\.newsocial\.tech| + peertube\.nicolastissot\.fr| + peertube\.nz| + peertube\.offerman\.com| + peertube\.opencloud\.lu| + peertube\.orthus\.link| + peertube\.patapouf\.xyz| + peertube\.pi2\.dev| + peertube\.plataformess\.org| + peertube\.pl| + peertube\.portaesgnos\.org| + peertube\.r2\.enst\.fr| + peertube\.r5c3\.fr| + peertube\.radres\.xyz| + peertube\.red| + peertube\.robonomics\.network| + peertube\.rtnkv\.cloud| + peertube\.runfox\.tk| + peertube\.satoshishop\.de| + peertube\.scic-tetris\.org| + 
peertube\.securitymadein\.lu| + peertube\.semweb\.pro| + peertube\.social\.my-wan\.de| + peertube\.soykaf\.org| + peertube\.stefofficiel\.me| + peertube\.stream| + peertube\.su| + peertube\.swrs\.net| + peertube\.takeko\.cyou| + peertube\.tangentfox\.com| + peertube\.taxinachtegel\.de| + peertube\.thenewoil\.xyz| + peertube\.ti-fr\.com| + peertube\.tiennot\.net| + peertube\.troback\.com| + peertube\.tspu\.edu\.ru| + peertube\.tux\.ovh| + peertube\.tv| + peertube\.tweb\.tv| + peertube\.ucy\.de| + peertube\.underworld\.fr| + peertube\.us\.to| + peertube\.ventresmous\.fr| + peertube\.vlaki\.cz| + peertube\.w\.utnw\.de| + peertube\.westring\.digital| + peertube\.xwiki\.com| + peertube\.zoz-serv\.org| + peervideo\.ru| + periscope\.numenaute\.org| + perron-tube\.de| + petitlutinartube\.fr| + phijkchu\.com| + pierre\.tube| + piraten\.space| + play\.rosano\.ca| + player\.ojamajo\.moe| + plextube\.nl| + pocketnetpeertube1\.nohost\.me| + pocketnetpeertube3\.nohost\.me| + pocketnetpeertube4\.nohost\.me| + pocketnetpeertube5\.nohost\.me| + pocketnetpeertube6\.nohost\.me| + pt\.24-7\.ro| + pt\.apathy\.top| + pt\.diaspodon\.fr| + pt\.fedi\.tech| + pt\.maciej\.website| + ptb\.lunarviews\.net| + ptmir1\.inter21\.net| + ptmir2\.inter21\.net| + ptmir3\.inter21\.net| + ptmir4\.inter21\.net| + ptmir5\.inter21\.net| + ptube\.horsentiers\.fr| + ptube\.xmanifesto\.club| + queermotion\.org| + re-wizja\.re-medium\.com| + regarder\.sans\.pub| + ruraletv\.ovh| + s1\.gegenstimme\.tv| + s2\.veezee\.tube| + sdmtube\.fr| + sender-fm\.veezee\.tube| + serv1\.wiki-tube\.de| + serv3\.wiki-tube\.de| + sickstream\.net| + sleepy\.tube| + sovran\.video| + spectra\.video| + stream\.elven\.pw| + stream\.k-prod\.fr| + stream\.shahab\.nohost\.me| + streamsource\.video| + studios\.racer159\.com| + testtube\.florimond\.eu| + tgi\.hosted\.spacebear\.ee| + thaitube\.in\.th| + the\.jokertv\.eu| + theater\.ethernia\.net| + thecool\.tube| + tilvids\.com| + toob\.bub\.org| + tpaw\.video| + truetube\.media| + 
tuba\.lhub\.pl| + tube-aix-marseille\.beta\.education\.fr| + tube-amiens\.beta\.education\.fr| + tube-besancon\.beta\.education\.fr| + tube-bordeaux\.beta\.education\.fr| + tube-clermont-ferrand\.beta\.education\.fr| + tube-corse\.beta\.education\.fr| + tube-creteil\.beta\.education\.fr| + tube-dijon\.beta\.education\.fr| + tube-education\.beta\.education\.fr| + tube-grenoble\.beta\.education\.fr| + tube-lille\.beta\.education\.fr| + tube-limoges\.beta\.education\.fr| + tube-montpellier\.beta\.education\.fr| + tube-nancy\.beta\.education\.fr| + tube-nantes\.beta\.education\.fr| + tube-nice\.beta\.education\.fr| + tube-normandie\.beta\.education\.fr| + tube-orleans-tours\.beta\.education\.fr| + tube-outremer\.beta\.education\.fr| + tube-paris\.beta\.education\.fr| + tube-poitiers\.beta\.education\.fr| + tube-reims\.beta\.education\.fr| + tube-rennes\.beta\.education\.fr| + tube-strasbourg\.beta\.education\.fr| + tube-toulouse\.beta\.education\.fr| + tube-versailles\.beta\.education\.fr| + tube1\.it\.tuwien\.ac\.at| + tube\.abolivier\.bzh| + tube\.ac-amiens\.fr| + tube\.aerztefueraufklaerung\.de| + tube\.alexx\.ml| + tube\.amic37\.fr| + tube\.anufrij\.de| + tube\.apolut\.net| + tube\.arkhalabs\.io| + tube\.arthack\.nz| + tube\.as211696\.net| + tube\.avensio\.de| + tube\.azbyka\.ru| + tube\.azkware\.net| + tube\.bachaner\.fr| + tube\.bmesh\.org| + tube\.borked\.host| + tube\.bstly\.de| + tube\.chaoszone\.tv| + tube\.chatelet\.ovh| + tube\.cloud-libre\.eu| + tube\.cms\.garden| + tube\.cowfee\.moe| + tube\.cryptography\.dog| + tube\.darknight-coffee\.org| + tube\.dev\.lhub\.pl| + tube\.distrilab\.fr| + tube\.dsocialize\.net| + tube\.ebin\.club| + tube\.fdn\.fr| + tube\.florimond\.eu| + tube\.foxarmy\.ml| + tube\.foxden\.party| + tube\.frischesicht\.de| + tube\.futuretic\.fr| + tube\.gnous\.eu| + tube\.grap\.coop| + tube\.graz\.social| + tube\.grin\.hu| + tube\.hackerscop\.org| + tube\.hordearii\.fr| + tube\.jeena\.net| + tube\.kai-stuht\.com| + tube\.kockatoo\.org| + 
tube\.kotur\.org| + tube\.lacaveatonton\.ovh| + tube\.linkse\.media| + tube\.lokad\.com| + tube\.lucie-philou\.com| + tube\.melonbread\.xyz| + tube\.mfraters\.net| + tube\.motuhake\.xyz| + tube\.mrbesen\.de| + tube\.nah\.re| + tube\.nchoco\.net| + tube\.novg\.net| + tube\.nox-rhea\.org| + tube\.nuagelibre\.fr| + tube\.nx12\.net| + tube\.octaplex\.net| + tube\.odat\.xyz| + tube\.oisux\.org| + tube\.opportunis\.me| + tube\.org\.il| + tube\.ortion\.xyz| + tube\.others\.social| + tube\.picasoft\.net| + tube\.plomlompom\.com| + tube\.pmj\.rocks| + tube\.portes-imaginaire\.org| + tube\.pyngu\.com| + tube\.rebellion\.global| + tube\.rhythms-of-resistance\.org| + tube\.rita\.moe| + tube\.rsi\.cnr\.it| + tube\.s1gm4\.eu| + tube\.saumon\.io| + tube\.schleuss\.online| + tube\.schule\.social| + tube\.seditio\.fr| + tube\.shanti\.cafe| + tube\.shela\.nu| + tube\.skrep\.in| + tube\.sp-codes\.de| + tube\.sp4ke\.com| + tube\.superseriousbusiness\.org| + tube\.systest\.eu| + tube\.tappret\.fr| + tube\.tardis\.world| + tube\.toontoet\.nl| + tube\.tpshd\.de| + tube\.troopers\.agency| + tube\.tylerdavis\.xyz| + tube\.undernet\.uy| + tube\.vigilian-consulting\.nl| + tube\.vraphim\.com| + tube\.wehost\.lgbt| + tube\.wien\.rocks| + tube\.wolfe\.casa| + tube\.xd0\.de| + tube\.xy-space\.de| + tube\.yapbreak\.fr| + tubedu\.org| + tubes\.jodh\.us| + tuktube\.com| + turkum\.me| + tututu\.tube| + tuvideo\.encanarias\.info| + tv1\.cocu\.cc| + tv1\.gomntu\.space| + tv2\.cocu\.cc| + tv\.adn\.life| + tv\.atmx\.ca| + tv\.bitma\.st| + tv\.generallyrubbish\.net\.au| + tv\.lumbung\.space| + tv\.mattchristiansenmedia\.com| + tv\.netwhood\.online| + tv\.neue\.city| + tv\.piejacker\.net| + tv\.pirateradio\.social| + tv\.undersco\.re| + tvox\.ru| + twctube\.twc-zone\.eu| + unfilter\.tube| + v\.basspistol\.org| + v\.kisombrella\.top| + v\.lastorder\.xyz| + v\.lor\.sh| + v\.phreedom\.club| + v\.sil\.sh| + v\.szy\.io| + v\.xxxapex\.com| + veezee\.tube| + vid\.dascoyote\.xyz| + vid\.garwood\.io| + 
vid\.ncrypt\.at| + vid\.pravdastalina\.info| + vid\.qorg11\.net| + vid\.rajeshtaylor\.com| + vid\.samtripoli\.com| + vid\.werefox\.dev| + vid\.wildeboer\.net| + video-cave-v2\.de| + video\.076\.ne\.jp| + video\.1146\.nohost\.me| + video\.altertek\.org| + video\.anartist\.org| + video\.apps\.thedoodleproject\.net| + video\.artist\.cx| + video\.asgardius\.company| + video\.balsillie\.net| + video\.bards\.online| + video\.binarydad\.com| + video\.blast-info\.fr| + video\.catgirl\.biz| + video\.cigliola\.com| + video\.cm-en-transition\.fr| + video\.cnt\.social| + video\.coales\.co| + video\.codingfield\.com| + video\.comptoir\.net| + video\.comune\.trento\.it| + video\.cpn\.so| + video\.csc49\.fr| + video\.cybre\.town| + video\.demokratischer-sommer\.de| + video\.discord-insoumis\.fr| + video\.dolphincastle\.com| + video\.dresden\.network| + video\.ecole-89\.com| + video\.elgrillolibertario\.org| + video\.emergeheart\.info| + video\.eradicatinglove\.xyz| + video\.ethantheenigma\.me| + video\.exodus-privacy\.eu\.org| + video\.fbxl\.net| + video\.fhtagn\.org| + video\.greenmycity\.eu| + video\.guerredeclasse\.fr| + video\.gyt\.is| + video\.hackers\.town| + video\.hardlimit\.com| + video\.hooli\.co| + video\.igem\.org| + video\.internet-czas-dzialac\.pl| + video\.islameye\.com| + video\.kicik\.fr| + video\.kuba-orlik\.name| + video\.kyushojitsu\.ca| + video\.lavolte\.net| + video\.lespoesiesdheloise\.fr| + video\.liberta\.vip| + video\.liege\.bike| + video\.linc\.systems| + video\.linux\.it| + video\.linuxtrent\.it| + video\.lokal\.social| + video\.lono\.space| + video\.lunasqu\.ee| + video\.lundi\.am| + video\.marcorennmaus\.de| + video\.mass-trespass\.uk| + video\.mugoreve\.fr| + video\.mundodesconocido\.com| + video\.mycrowd\.ca| + video\.nogafam\.es| + video\.odayacres\.farm| + video\.ozgurkon\.org| + video\.p1ng0ut\.social| + video\.p3x\.de| + video\.pcf\.fr| + video\.pony\.gallery| + video\.potate\.space| + video\.pourpenser\.pro| + video\.progressiv\.dev| + 
video\.resolutions\.it| + video\.rw501\.de| + video\.screamer\.wiki| + video\.sdm-tools\.net| + video\.sftblw\.moe| + video\.shitposter\.club| + video\.skyn3t\.in| + video\.soi\.ch| + video\.stuartbrand\.co\.uk| + video\.thinkof\.name| + video\.toot\.pt| + video\.triplea\.fr| + video\.turbo\.chat| + video\.vaku\.org\.ua| + video\.veloma\.org| + video\.violoncello\.ch| + video\.wilkie\.how| + video\.wsf2021\.info| + videorelay\.co| + videos-passages\.huma-num\.fr| + videos\.3d-wolf\.com| + videos\.ahp-numerique\.fr| + videos\.alexandrebadalo\.pt| + videos\.archigny\.net| + videos\.benjaminbrady\.ie| + videos\.buceoluegoexisto\.com| + videos\.capas\.se| + videos\.casually\.cat| + videos\.cloudron\.io| + videos\.coletivos\.org| + videos\.danksquad\.org| + videos\.denshi\.live| + videos\.fromouter\.space| + videos\.fsci\.in| + videos\.globenet\.org| + videos\.hauspie\.fr| + videos\.hush\.is| + videos\.john-livingston\.fr| + videos\.jordanwarne\.xyz| + videos\.lavoixdessansvoix\.org| + videos\.leslionsfloorball\.fr| + videos\.lucero\.top| + videos\.martyn\.berlin| + videos\.mastodont\.cat| + videos\.monstro1\.com| + videos\.npo\.city| + videos\.optoutpod\.com| + videos\.petch\.rocks| + videos\.pzelawski\.xyz| + videos\.rampin\.org| + videos\.scanlines\.xyz| + videos\.shmalls\.pw| + videos\.sibear\.fr| + videos\.stadtfabrikanten\.org| + videos\.tankernn\.eu| + videos\.testimonia\.org| + videos\.thisishowidontdisappear\.com| + videos\.traumaheilung\.net| + videos\.trom\.tf| + videos\.wakkerewereld\.nu| + videos\.weblib\.re| + videos\.yesil\.club| + vids\.roshless\.me| + vids\.tekdmn\.me| + vidz\.dou\.bet| + vod\.lumikko\.dev| + vs\.uniter\.network| + vulgarisation-informatique\.fr| + watch\.breadtube\.tv| + watch\.deranalyst\.ch| + watch\.ignorance\.eu| + watch\.krazy\.party| + watch\.libertaria\.space| + watch\.rt4mn\.org| + watch\.softinio\.com| + watch\.tubelab\.video| + web-fellow\.de| + webtv\.vandoeuvre\.net| + wechill\.space| + wikileaks\.video| + wiwi\.video| + 
worldofvids\.com| + wwtube\.net| + www4\.mir\.inter21\.net| + www\.birkeundnymphe\.de| + www\.captain-german\.com| + www\.wiki-tube\.de| + xxivproduction\.video| + xxx\.noho\.st| + + # from youtube-dl peertube\.rainbowswingers\.net| tube\.stanisic\.nl| peer\.suiri\.us| @@ -439,9 +1072,9 @@ class PeerTubeIE(InfoExtractor): 'uploader': 'Framasoft', 'uploader_id': '3', 'uploader_url': 'https://framatube.org/accounts/framasoft', - 'channel': 'Les vidéos de Framasoft', - 'channel_id': '2', - 'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8', + 'channel': 'A propos de PeerTube', + 'channel_id': '2215', + 'channel_url': 'https://framatube.org/video-channels/joinpeertube', 'language': 'en', 'license': 'Attribution - Share Alike', 'duration': 113, @@ -497,20 +1130,20 @@ class PeerTubeIE(InfoExtractor): 'uploader': 'Drew DeVault', } }, { - 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'url': 'https://peertube.debian.social/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'only_matching': True, }, { # nsfw - 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'url': 'https://vod.ksite.de/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', 'only_matching': True, }, { - 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'url': 'https://vod.ksite.de/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', 'only_matching': True, }, { - 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'url': 'https://peertube.tv/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', 'only_matching': True, }, { - 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', + 'url': 'peertube:framatube.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205', 'only_matching': True, }] @@ -660,3 +1293,110 @@ def channel_data(field, type_): 'subtitles': subtitles, 'webpage_url': webpage_url, } + + 
class PeerTubePlaylistIE(InfoExtractor):
    """Extract PeerTube accounts, video channels and playlists as playlists.

    The URL prefix (``a``, ``c`` or ``w/p``) is mapped through ``_TYPES`` to
    the collection queried below ``/api/v1/``. Each listed video is returned
    as a URL result that is delegated to ``PeerTubeIE``.
    """
    IE_NAME = 'PeerTube:Playlist'
    # URL path prefix -> API collection name
    _TYPES = {
        'a': 'accounts',
        'c': 'video-channels',
        'w/p': 'video-playlists',
    }
    # _real_extract reads the groups 'type', 'host' and 'id', so all three
    # must be named here.
    _VALID_URL = r'''(?x)
                    https?://(?P<host>%s)/(?P<type>(?:%s))/
                    (?P<id>[^/]+)
                    ''' % (PeerTubeIE._INSTANCES_RE, '|'.join(_TYPES.keys()))
    _TESTS = [{
        'url': 'https://peertube.tux.ovh/w/p/3af94cba-95e8-4b74-b37a-807ab6d82526',
        'info_dict': {
            'id': '3af94cba-95e8-4b74-b37a-807ab6d82526',
            'description': 'playlist',
            'timestamp': 1611171863,
            'title': 'playlist',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'https://peertube.tux.ovh/w/p/wkyqcQBnsvFxtUB2pkYc1e',
        'info_dict': {
            'id': 'wkyqcQBnsvFxtUB2pkYc1e',
            'description': 'Cette liste de vidéos contient uniquement les jeux qui peuvent être terminés en une seule vidéo.',
            'title': 'Let\'s Play',
            'timestamp': 1604147331,
        },
        'playlist_mincount': 6,
    }, {
        'url': 'https://peertube.debian.social/w/p/hFdJoTuyhNJVa1cDWd1d12',
        'info_dict': {
            'id': 'hFdJoTuyhNJVa1cDWd1d12',
            'description': 'Diversas palestras do Richard Stallman no Brasil.',
            'title': 'Richard Stallman no Brasil',
            'timestamp': 1599676222,
        },
        'playlist_mincount': 9,
    }, {
        'url': 'https://peertube2.cpy.re/a/chocobozzz/videos',
        'info_dict': {
            'id': 'chocobozzz',
            'timestamp': 1553874564,
            'title': 'chocobozzz',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://framatube.org/c/bf54d359-cfad-4935-9d45-9d6be93f63e8/videos',
        'info_dict': {
            'id': 'bf54d359-cfad-4935-9d45-9d6be93f63e8',
            'timestamp': 1519917377,
            'title': 'Les vidéos de Framasoft',
        },
        'playlist_mincount': 345,
    }, {
        'url': 'https://peertube2.cpy.re/c/blender_open_movies@video.blender.org/videos',
        'info_dict': {
            'id': 'blender_open_movies@video.blender.org',
            'timestamp': 1542287810,
            'title': 'Official Blender Open Movies',
        },
        'playlist_mincount': 11,
    }]
    # host, collection (a value of _TYPES), item name/id, trailing path + query
    _API_BASE = 'https://%s/api/v1/%s/%s%s'
    _PAGE_SIZE = 30

    def call_api(self, host, name, path, base, **kwargs):
        """Download JSON from ``https://{host}/api/v1/{base}/{name}{path}``.

        ``name`` is also passed to ``_download_json`` as the id argument;
        ``kwargs`` are forwarded to ``_download_json`` unchanged.
        """
        api_url = self._API_BASE % (host, base, name, path)
        return self._download_json(api_url, name, **kwargs)

    def fetch_page(self, host, id, type, page):
        """Yield one URL result per video on the requested page.

        ``page`` is the index supplied by ``OnDemandPagedList``; the request
        offset is ``_PAGE_SIZE * page`` and the progress note shows
        ``page + 1``.
        """
        page_number = page + 1
        offset = self._PAGE_SIZE * page
        listing = self.call_api(
            host, id,
            f'/videos?sort=-createdAt&start={offset}&count={self._PAGE_SIZE}&nsfw=both',
            type, note=f'Downloading page {page_number}')
        for video in listing.get('data', []):
            # Some listings nest the video object under a 'video' key.
            shortUUID = video.get('shortUUID') or try_get(video, lambda x: x['video']['shortUUID'])
            video_title = video.get('name') or try_get(video, lambda x: x['video']['name'])
            yield self.url_result(
                f'https://{host}/w/{shortUUID}', PeerTubeIE.ie_key(),
                video_id=shortUUID, video_title=video_title)

    def _extract_playlist(self, host, type, id):
        """Build the playlist result for one account, channel or playlist.

        The metadata request is non-fatal: when it fails the playlist is
        still returned, only without title, description, timestamp, channel
        and thumbnail.
        """
        # With fatal=False a failed download does not return a dict; fall
        # back to an empty one so the .get() lookups below cannot raise.
        info = self.call_api(
            host, id, '', type,
            note='Downloading playlist information', fatal=False) or {}

        display_name = info.get('displayName')
        thumbnail_path = info.get('thumbnailPath')

        entries = OnDemandPagedList(
            functools.partial(self.fetch_page, host, id, type), self._PAGE_SIZE)

        return self.playlist_result(
            entries, id, display_name, info.get('description'),
            timestamp=unified_timestamp(info.get('createdAt')),
            channel=try_get(info, lambda x: x['ownerAccount']['name']) or display_name,
            channel_id=try_get(info, lambda x: x['ownerAccount']['id']) or info.get('id'),
            thumbnail=f'https://{host}{thumbnail_path}' if thumbnail_path else None)

    def _real_extract(self, url):
        """Resolve the URL prefix to its API collection and extract it."""
        type, host, id = self._match_valid_url(url).group('type', 'host', 'id')
        return self._extract_playlist(host, self._TYPES[type], id)
# ---- yt_dlp/extractor/peertv.py ----
from .common import InfoExtractor
from ..utils import js_to_json


class PeerTVIE(InfoExtractor):
    """Extract a single video from a peer.tv page (``/de``, ``/it`` or ``/en``)."""
    IE_NAME = 'peer.tv'
    # _match_id() needs the numeric part as the named group 'id'.
    _VALID_URL = r'https?://(?:www\.)?peer\.tv/(?:de|it|en)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.peer.tv/de/841',
        'info_dict': {
            'id': '841',
            'ext': 'mp4',
            'title': 'Die Brunnenburg',
            'description': 'md5:4395f6142b090338340ab88a3aae24ed',
        },
    }, {
        'url': 'https://www.peer.tv/it/404',
        'info_dict': {
            'id': '404',
            'ext': 'mp4',
            'title': 'Cascate di ghiaccio in Val Gardena',
            'description': 'md5:e8e5907f236171842674e8090e3577b8',
        },
    }]

    def _real_extract(self, url):
        """Follow page -> player script -> player config and return the HLS formats."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        video_key = self._html_search_regex(r'player\.peer\.tv/js/([a-zA-Z0-9]+)', webpage, 'video key')

        player_js = self._download_webpage(
            f'https://player.peer.tv/js/{video_key}/', video_id,
            headers={'Referer': 'https://www.peer.tv/'}, note='Downloading session id')

        session_id = self._search_regex(
            r'["\']session_id["\']:\s*["\']([a-zA-Z0-9]+)["\']', player_js, 'session id')

        # The jsr value is the base64 of 'https://www.peer.tv/de/841' and is
        # sent unchanged for every video.
        # NOTE(review): confirm whether the player needs the actual page URL here.
        player_webpage = self._download_webpage(
            f'https://player.peer.tv/jsc/{video_key}/{session_id}?jsr=aHR0cHM6Ly93d3cucGVlci50di9kZS84NDE=&cs=UTF-8&mq=2&ua=0&webm=p&mp4=p&hls=1',
            video_id, note='Downloading player webpage')

        # The capture keeps the surrounding quotes, so the value is decoded
        # as a JSON string (after js_to_json normalisation).
        quoted_m3u8_url = self._search_regex(
            r'["\']playlist_url["\']:\s*(["\'][^"\']+["\'])', player_webpage, 'm3u8 url')
        m3u8_url = self._parse_json(quoted_m3u8_url, video_id, transform_source=js_to_json)

        formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
        self._sort_formats(formats)

        # NOTE(review): the element name inside this pattern did not survive
        # in the flattened patch (only '(.+?)' did); '<h1>' is assumed -
        # confirm against the page markup.
        title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title')

        return {
            'id': video_id,
            # Replace non-breaking spaces with regular ones.
            'title': title.replace('\xa0', ' '),
            'formats': formats,
            'description': self._html_search_meta(('og:description', 'description'), webpage),
            'thumbnail': self._html_search_meta(('og:image', 'image'), webpage)
        }


# ---- yt_dlp/extractor/peloton.py ----
import json
import re

from .common import InfoExtractor
from ..compat import (
    compat_HTTPError,
    compat_urllib_parse,
)
from ..utils import (
    ExtractorError,
    float_or_none,
    str_or_none,
    traverse_obj,
    url_or_none,
)


class PelotonIE(InfoExtractor):
    """Extract an on-demand or live Peloton class from the members site."""
    IE_NAME = 'peloton'
    _NETRC_MACHINE = 'peloton'
    # _match_id() needs the hexadecimal class id as the named group 'id'.
    _VALID_URL = r'https?://members\.onepeloton\.com/classes/player/(?P<id>[a-f0-9]+)'
    _TESTS = [{
        'url': 'https://members.onepeloton.com/classes/player/0e9653eb53544eeb881298c8d7a87b86',
        'info_dict': {
            'id': '0e9653eb53544eeb881298c8d7a87b86',
            'title': '20 min Chest & Back Strength',
            'ext': 'mp4',
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'md5:fcd5be9b9eda0194b470e13219050a66',
            'creator': 'Chase Tucker',
            'release_timestamp': 1556141400,
            'timestamp': 1556141400,
            'upload_date': '20190424',
            'duration': 1389,
            'categories': ['Strength'],
            'tags': ['Workout Mat', 'Light Weights', 'Medium Weights'],
            'is_live': False,
            'chapters': 'count:1',
            'subtitles': {'en': [{
                'url': r're:^https?://.+',
                'ext': 'vtt'
            }]},
        }, 'params': {
            'skip_download': 'm3u8',
        },
        '_skip': 'Account needed'
    }, {
        'url': 'https://members.onepeloton.com/classes/player/26603d53d6bb4de1b340514864a6a6a8',
        'info_dict': {
            'id': '26603d53d6bb4de1b340514864a6a6a8',
            'title': '30 min Earth Day Run',
            'ext': 'm4a',
            'thumbnail': r're:https://.+\.jpg',
            'description': 'md5:adc065a073934d7ee0475d217afe0c3d',
            'creator': 'Selena Samuela',
            'release_timestamp': 1587567600,
            'timestamp': 1587567600,
            'upload_date': '20200422',
            'duration': 1802,
            'categories': ['Running'],
            'is_live': False,
            'chapters': 'count:3'
        }, 'params': {
            'skip_download': 'm3u8',
        },
        '_skip': 'Account needed'
    }]

    # stream URL, URL-quoted token
    _MANIFEST_URL_TEMPLATE = '%s?hdnea=%s'

    def _raise_api_error(self, error, status, expected_message, video_id):
        """Raise the API's own error message if ``error`` is an HTTP ``status`` response.

        Returns normally when ``error`` was not caused by an HTTP error with
        that status code, so the caller can re-raise the original exception.
        The new error is flagged as expected only when the message equals
        ``expected_message``.
        """
        cause = error.cause
        if not (isinstance(cause, compat_HTTPError) and cause.code == status):
            return
        json_string = self._webpage_read_content(cause, None, video_id)
        res = self._parse_json(json_string, video_id)
        raise ExtractorError(res['message'], expected=res['message'] == expected_message)

    def _start_session(self, video_id):
        """Request the ``started_client_session`` endpoint; the response is discarded."""
        self._download_webpage('https://api.onepeloton.com/api/started_client_session', video_id, note='Starting session')

    def _login(self, video_id):
        """Log in with the configured credentials.

        Requires login information; an HTTP 401 response is re-raised with
        the message returned by the API.
        """
        username, password = self._get_login_info()
        if not (username and password):
            self.raise_login_required()
        try:
            self._download_json(
                'https://api.onepeloton.com/auth/login', video_id, note='Logging in',
                data=json.dumps({
                    'username_or_email': username,
                    'password': password,
                    'with_pubsub': False
                }).encode(),
                headers={'Content-Type': 'application/json', 'User-Agent': 'web'})
        except ExtractorError as e:
            self._raise_api_error(e, 401, 'Login failed', video_id)
            raise

    def _get_token(self, video_id):
        """Return the ``token`` field of the subscription stream response.

        An HTTP 403 response is re-raised with the message returned by the API.
        """
        try:
            subscription = self._download_json(
                'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token',
                data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'})
        except ExtractorError as e:
            self._raise_api_error(e, 403, 'Stream limit reached', video_id)
            raise
        return subscription['token']

    @staticmethod
    def _extract_chapters(metadata):
        """Convert ``metadata['segments']['segment_list']`` into chapter dicts.

        A missing segment list yields no chapters; ``end_time`` is ``None``
        when a segment lacks its offset or length.
        """
        chapters = []
        for segment in traverse_obj(metadata, ('segments', 'segment_list')) or []:
            start = segment.get('start_time_offset')
            length = segment.get('length')
            chapters.append({
                'start_time': start,
                'end_time': start + length if None not in (start, length) else None,
                'title': segment.get('name')
            })
        return chapters

    def _real_extract(self, url):
        """Open a session (logging in on HTTP 401) and extract the ride."""
        video_id = self._match_id(url)
        try:
            self._start_session(video_id)
        except ExtractorError as e:
            if not (isinstance(e.cause, compat_HTTPError) and e.cause.code == 401):
                raise
            self._login(video_id)
            self._start_session(video_id)

        metadata = self._download_json('https://api.onepeloton.com/api/ride/%s/details?stream_source=multichannel' % video_id, video_id)
        ride_data = metadata.get('ride')
        if not ride_data:
            raise ExtractorError('Missing stream metadata')
        token = self._get_token(video_id)
        quoted_token = compat_urllib_parse.quote(token)

        is_live = False
        if ride_data.get('content_format') == 'audio':
            # Audio-only rides expose a single direct stream.
            formats = [{
                'url': self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), quoted_token),
                'ext': 'm4a',
                'format_id': 'audio',
                'vcodec': 'none',
            }]
            subtitles = {}
        else:
            if ride_data.get('vod_stream_url'):
                # The caption codes are reduced to their language part
                # ('en-US' -> 'en'); a missing caption list means no
                # subtitles are requested.
                accepted_subtitles = ','.join(
                    re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption)
                    for caption in ride_data.get('captions') or [])
                # The token is quoted twice because it travels inside the
                # 'url' query parameter of the proxy request.
                manifest_url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % (
                    accepted_subtitles,
                    ride_data['vod_stream_url'],
                    compat_urllib_parse.quote(quoted_token))
            elif ride_data.get('live_stream_url'):
                manifest_url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), quoted_token)
                is_live = True
            else:
                raise ExtractorError('Missing video URL')
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(manifest_url, video_id, 'mp4')

        # Instructor cues are attached as an inline JSON "subtitle" track.
        instructor_cues = metadata.get('instructor_cues')
        if instructor_cues:
            subtitles['cues'] = [{
                'data': json.dumps(instructor_cues),
                'ext': 'json'
            }]

        category = ride_data.get('fitness_discipline_display_name')
        chapters = self._extract_chapters(metadata)

        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': ride_data.get('title'),
            'formats': formats,
            'thumbnail': url_or_none(ride_data.get('image_url')),
            'description': str_or_none(ride_data.get('description')),
            'creator': traverse_obj(ride_data, ('instructor', 'name')),
            'release_timestamp': ride_data.get('original_air_time'),
            'timestamp': ride_data.get('original_air_time'),
            'subtitles': subtitles,
            'duration': float_or_none(ride_data.get('length')),
            'categories': [category] if category else None,
            'tags': traverse_obj(ride_data, ('equipment_tags', ..., 'name')),
            'is_live': is_live,
            'chapters': chapters
        }


class PelotonLiveIE(InfoExtractor):
    """Resolve a Peloton live-player URL to the class URL handled by ``PelotonIE``."""
    IE_NAME = 'peloton:live'
    IE_DESC = 'Peloton Live'
    # _match_id() needs the hexadecimal workout id as the named group 'id'.
    _VALID_URL = r'https?://members\.onepeloton\.com/player/live/(?P<id>[a-f0-9]+)'
    _TEST = {
        'url': 'https://members.onepeloton.com/player/live/eedee2d19f804a9788f53aa8bd38eb1b',
        'info_dict': {
            'id': '32edc92d28044be5bf6c7b6f1f8d1cbc',
            'title': '30 min HIIT Ride: Live from Home',
            'ext': 'mp4',
            'thumbnail': r're:^https?://.+\.png',
            'description': 'md5:f0d7d8ed3f901b7ee3f62c1671c15817',
            'creator': 'Alex Toussaint',
            'release_timestamp': 1587736620,
            'timestamp': 1587736620,
            'upload_date': '20200424',
            'duration': 2014,
            'categories': ['Cycling'],
            'is_live': False,
            'chapters': 'count:3'
        },
        'params': {
            'skip_download': 'm3u8',
        },
        '_skip': 'Account needed'
    }

    def _real_extract(self, url):
        """Look up the workout and hand its ride over to the class extractor.

        Raises ``ExtractorError`` when the workout has no ride id, or when it
        is a live, non-encore workout whose status is still ``PRE_START``.
        """
        workout_id = self._match_id(url)
        peloton = self._download_json(f'https://api.onepeloton.com/api/peloton/{workout_id}', workout_id)

        if not peloton.get('ride_id'):
            raise ExtractorError('Missing video ID')

        not_started = (
            peloton.get('is_live')
            and not peloton.get('is_encore')
            and peloton.get('status') == 'PRE_START')
        if not_started:
            raise ExtractorError('Ride has not started', expected=True)
        return self.url_result('https://members.onepeloton.com/classes/player/%s' % peloton['ride_id'])
# coding: utf-8
# ---- yt_dlp/extractor/pixivsketch.py ----
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    traverse_obj,
    unified_timestamp,
)


class PixivSketchBaseIE(InfoExtractor):
    """Shared API access for the pixiv Sketch extractors."""

    def _call_api(self, video_id, path, referer, note='Downloading JSON metadata'):
        """Fetch ``https://sketch.pixiv.net/api/{path}`` and return its ``data`` object.

        ``referer`` is sent both as ``Referer`` and as ``X-Requested-With``.
        Raises ``ExtractorError`` with the joined messages when the response
        lists errors; returns an empty dict when ``data`` is missing or empty.
        """
        request_headers = {
            'Referer': referer,
            'X-Requested-With': referer,
        }
        response = self._download_json(
            f'https://sketch.pixiv.net/api/{path}', video_id, note=note, headers=request_headers)
        errors = traverse_obj(response, ('errors', ..., 'message'))
        if errors:
            raise ExtractorError(' '.join(f'{e}.' for e in errors))
        return response.get('data') or {}


class PixivSketchIE(PixivSketchBaseIE):
    """Extract a running pixiv Sketch live stream."""
    IE_NAME = 'pixiv:sketch'
    # _real_extract reads the groups 'id' and 'uploader_id'.
    _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<uploader_id>[a-zA-Z0-9_-]+)/lives/(?P<id>\d+)/?'
    _TESTS = [{
        'url': 'https://sketch.pixiv.net/@nuhutya/lives/3654620468641830507',
        'info_dict': {
            'id': '7370666691623196569',
            'title': 'まにあえクリスマス!',
            'uploader': 'ぬふちゃ',
            'uploader_id': 'nuhutya',
            'channel_id': '9844815',
            'age_limit': 0,
            'timestamp': 1640351536,
        },
        'skip': True,
    }, {
        # these two (age_limit > 0) requires you to login on website, but it's actually not required for download
        'url': 'https://sketch.pixiv.net/@namahyou/lives/4393103321546851377',
        'info_dict': {
            'id': '4907995960957946943',
            'title': 'クリスマスなんて知らん🖕',
            'uploader': 'すゃもり',
            'uploader_id': 'suya2mori2',
            'channel_id': '31169300',
            'age_limit': 15,
            'timestamp': 1640347640,
        },
        'skip': True,
    }, {
        'url': 'https://sketch.pixiv.net/@8aki/lives/3553803162487249670',
        'info_dict': {
            'id': '1593420639479156945',
            'title': 'おまけ本作業(リョナ有)',
            'uploader': 'おぶい / Obui',
            'uploader_id': 'oving',
            'channel_id': '17606',
            'age_limit': 18,
            'timestamp': 1640330263,
        },
        'skip': True,
    }]

    def _real_extract(self, url):
        """Return the HLS formats and metadata of the live, or fail if it is offline."""
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        data = self._call_api(video_id, f'lives/{video_id}.json', url)

        if not traverse_obj(data, 'is_broadcasting'):
            raise ExtractorError(f'This live is offline. Use https://sketch.pixiv.net/@{uploader_id} for ongoing live.', expected=True)

        m3u8_url = traverse_obj(data, ('owner', 'hls_movie', 'url'))
        formats = self._extract_m3u8_formats(
            m3u8_url, video_id, ext='mp4',
            entry_protocol='m3u8_native', m3u8_id='hls')
        self._sort_formats(formats)

        # User details are looked up under 'user' first, then 'owner' -> 'user'.
        pixiv_user_id = traverse_obj(data, ('user', 'pixiv_user_id'), ('owner', 'user', 'pixiv_user_id'))

        if data.get('is_r18'):
            age_limit = 18
        elif data.get('is_r15'):
            age_limit = 15
        else:
            age_limit = 0

        return {
            'id': video_id,
            'title': data.get('name'),
            'formats': formats,
            'uploader': traverse_obj(data, ('user', 'name'), ('owner', 'user', 'name')),
            'uploader_id': traverse_obj(data, ('user', 'unique_name'), ('owner', 'user', 'unique_name')),
            # Do not turn a missing id into the literal string 'None'.
            'channel_id': str(pixiv_user_id) if pixiv_user_id is not None else None,
            'age_limit': age_limit,
            'timestamp': unified_timestamp(data.get('created_at')),
            'is_live': True
        }


class PixivSketchUserIE(PixivSketchBaseIE):
    """Redirect a pixiv Sketch user page to that user's ongoing live."""
    IE_NAME = 'pixiv:sketch:user'
    # _match_id() needs the user handle as the named group 'id'.
    _VALID_URL = r'https?://sketch\.pixiv\.net/@(?P<id>[a-zA-Z0-9_-]+)/?'
    _TESTS = [{
        'url': 'https://sketch.pixiv.net/@nuhutya',
        'only_matching': True,
    }, {
        'url': 'https://sketch.pixiv.net/@namahyou',
        'only_matching': True,
    }, {
        'url': 'https://sketch.pixiv.net/@8aki',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match user URLs, leaving the ones ``PixivSketchIE`` accepts to it."""
        if PixivSketchIE.suitable(url):
            return False
        return super(PixivSketchUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a URL result for the user's current live.

        When the user is not broadcasting, a second request to
        ``users/current.json`` is made; an HTTP 401 there is reported as a
        login requirement, anything else as the user being offline.
        """
        user_id = self._match_id(url)
        data = self._call_api(user_id, f'lives/users/@{user_id}.json', url)

        if not traverse_obj(data, 'is_broadcasting'):
            try:
                self._call_api(user_id, 'users/current.json', url, 'Investigating reason for request failure')
            except ExtractorError as ex:
                # The cause may be absent or may not carry an HTTP status.
                if getattr(ex.cause, 'code', None) == 401:
                    # The direct link must contain '/lives/' to be handled by
                    # PixivSketchIE instead of this extractor.
                    self.raise_login_required(f'Please log in, or use direct link like https://sketch.pixiv.net/@{user_id}/lives/1234567890', method='cookies')
            raise ExtractorError('This user is offline', expected=True)

        return self.url_result(f'https://sketch.pixiv.net/@{user_id}/lives/{data["id"]}')
# coding: utf-8
# ---- yt_dlp/extractor/planetmarathi.py ----
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    try_get,
    unified_strdate,
)


class PlanetMarathiIE(InfoExtractor):
    """Extract every asset of a Planet Marathi title as one playlist."""
    # _match_id() needs the title slug as the named group 'id'.
    _VALID_URL = r'https?://(?:www\.)?planetmarathi\.com/titles/(?P<id>[^/#&?$]+)'
    _TESTS = [{
        'url': 'https://www.planetmarathi.com/titles/ek-unad-divas',
        'playlist_mincount': 2,
        'info_dict': {
            'id': 'ek-unad-divas',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ASSETS-MOVIE-ASSET-01_ek-unad-divas',
                'ext': 'mp4',
                'title': 'ek unad divas',
                'alt_title': 'चित्रपट',
                'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881',
                'season_number': None,
                'episode_number': 1,
                'duration': 5539,
                'upload_date': '20210829',
            },
        }]  # Trailer skipped
    }, {
        'url': 'https://www.planetmarathi.com/titles/baap-beep-baap-season-1',
        'playlist_mincount': 10,
        'info_dict': {
            'id': 'baap-beep-baap-season-1',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ASSETS-CHARACTER-PROFILE-SEASON-01-ASSET-01_baap-beep-baap-season-1',
                'ext': 'mp4',
                'title': 'Manohar Kanhere',
                'alt_title': 'मनोहर कान्हेरे',
                'description': 'md5:285ed45d5c0ab5522cac9a043354ebc6',
                'season_number': 1,
                'episode_number': 1,
                'duration': 29,
                'upload_date': '20210829',
            },
        }]  # Trailers, Episodes, other Character profiles skipped
    }]

    def _extract_asset(self, asset, title_id):
        """Build the info dict of one entry of the title's ``assets`` list."""
        asset_title = asset['mediaAssetName']['en']
        if asset_title == 'Movie':
            # The generic asset name is replaced by the de-hyphenated slug.
            asset_title = title_id.replace('-', ' ')
        asset_id = f'{asset["sk"]}_{title_id}'.replace('#', '-')
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id)
        self._sort_formats(formats)
        return {
            'id': asset_id,
            'title': asset_title,
            'alt_title': try_get(asset, lambda x: x['mediaAssetName']['mr']),
            'description': try_get(asset, lambda x: x['mediaAssetDescription']['en']),
            'season_number': asset.get('mediaAssetSeason'),
            'episode_number': asset.get('mediaAssetIndexForAssetType'),
            'duration': asset.get('mediaAssetDurationInSeconds'),
            'upload_date': unified_strdate(asset.get('created')),
            'formats': formats,
            'subtitles': subtitles,
        }

    def _real_extract(self, url):
        """Download the title's asset list and return it as a playlist."""
        title_id = self._match_id(url)
        assets = self._download_json(
            f'https://www.planetmarathi.com/api/v1/titles/{title_id}/assets', title_id)['assets']
        entries = [self._extract_asset(asset, title_id) for asset in assets]
        return self.playlist_result(entries, playlist_id=title_id)
+ /?(?:$|[#?])''' + _INFO_URL = 'https://service-vod.clusters.pluto.tv/v3/vod/slugs/' _INFO_QUERY_PARAMS = { 'appName': 'web', @@ -75,6 +84,9 @@ class PlutoTVIE(InfoExtractor): }, { 'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1', 'only_matching': True, + }, { + 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1', + 'only_matching': True, } ] @@ -146,17 +158,13 @@ def _get_video_info(self, video_json, slug, series_name=None): return info def _real_extract(self, url): - path = compat_urlparse.urlparse(url).path - path_components = path.split('/') - video_type = path_components[2] - info_slug = path_components[3] - video_json = self._download_json(self._INFO_URL + info_slug, info_slug, - query=self._INFO_QUERY_PARAMS) + mobj = self._match_valid_url(url).groupdict() + info_slug = mobj['series_or_movie_slug'] + video_json = self._download_json(self._INFO_URL + info_slug, info_slug, query=self._INFO_QUERY_PARAMS) - if video_type == 'series': + if mobj['video_type'] == 'series': series_name = video_json.get('name', info_slug) - season_number = int_or_none(try_get(path_components, lambda x: x[5])) - episode_slug = try_get(path_components, lambda x: x[7]) + season_number, episode_slug = mobj.get('season_number'), mobj.get('episode_slug') videos = [] for season in video_json['seasons']: diff --git a/yt_dlp/extractor/polsatgo.py b/yt_dlp/extractor/polsatgo.py new file mode 100644 index 0000000000..1e3f46c07c --- /dev/null +++ b/yt_dlp/extractor/polsatgo.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from uuid import uuid4 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + url_or_none, + ExtractorError, +) + + +class PolsatGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polsat(?:box)?go\.pl/.+/(?P[0-9a-fA-F]+)(?:[/#?]|$)' + _TESTS = [{ + 'url': 
'https://polsatgo.pl/wideo/seriale/swiat-wedlug-kiepskich/5024045/sezon-1/5028300/swiat-wedlug-kiepskich-odcinek-88/4121', + 'info_dict': { + 'id': '4121', + 'ext': 'mp4', + 'title': 'Świat według Kiepskich - Odcinek 88', + 'age_limit': 12, + }, + }] + + def _extract_formats(self, sources, video_id): + for source in sources or []: + if not source.get('id'): + continue + url = url_or_none(self._call_api( + 'drm', video_id, 'getPseudoLicense', + {'mediaId': video_id, 'sourceId': source['id']}).get('url')) + if not url: + continue + yield { + 'url': url, + 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1])) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('navigation', video_id, 'prePlayData', {'mediaId': video_id})['mediaItem'] + + formats = list(self._extract_formats( + try_get(media, lambda x: x['playback']['mediaSources']), video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': media['displayInfo']['title'], + 'formats': formats, + 'age_limit': int_or_none(media['displayInfo']['ageGroup']) + } + + def _call_api(self, endpoint, media_id, method, params): + rand_uuid = str(uuid4()) + res = self._download_json( + f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id, + note=f'Downloading {method} JSON metadata', + data=json.dumps({ + 'method': method, + 'id': '2137', + 'jsonrpc': '2.0', + 'params': { + **params, + 'userAgentData': { + 'deviceType': 'mobile', + 'application': 'native', + 'os': 'android', + 'build': 10003, + 'widevine': False, + 'portal': 'pg', + 'player': 'cpplayer', + }, + 'deviceId': { + 'type': 'other', + 'value': rand_uuid, + }, + 'clientId': rand_uuid, + 'cpid': 1, + }, + }).encode('utf-8'), + headers={'Content-type': 'application/json'}) + if not res.get('result'): + if res['error']['code'] == 13404: + raise ExtractorError('This video is either unavailable in your region or is DRM protected', expected=True) + raise ExtractorError(f'Solorz said: 
{res["error"]["message"]} - {res["error"]["data"]["userMessage"]}') + return res['result'] diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 978d6f813b..b2b3eb29cf 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import itertools +import json +import math import re from .common import InfoExtractor @@ -12,15 +14,46 @@ ) from ..utils import ( extract_attributes, + ExtractorError, + InAdvancePagedList, int_or_none, + js_to_json, + parse_iso8601, strip_or_none, unified_timestamp, + unescapeHTML, + url_or_none, ) -class PolskieRadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P[0-9]+)' - _TESTS = [{ +class PolskieRadioBaseExtractor(InfoExtractor): + def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage): + media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file']) + if media_url in media_urls: + continue + media_urls.add(media_url) + entry = base_data.copy() + entry.update({ + 'id': compat_str(media['id']), + 'url': media_url, + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + }) + entry_title = compat_urllib_parse_unquote(media['desc']) + if entry_title: + entry['title'] = entry_title + yield entry + + +class PolskieRadioIE(PolskieRadioBaseExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P[0-9]+)' + _TESTS = [{ # Old-style single broadcast. 
'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', 'info_dict': { 'id': '1587943', @@ -39,14 +72,33 @@ class PolskieRadioIE(InfoExtractor): 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], - }, { - 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + }, { # New-style single broadcast. + 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', 'info_dict': { - 'id': '1635803', - 'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał', - 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + 'id': '2534482', + 'title': 'Żagaryści. Poezja jak spoiwo', + 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695', + }, + 'playlist': [{ + 'md5': 'd07559829f61d5a93a75755987ded760', + 'info_dict': { + 'id': '2516679', + 'ext': 'mp3', + 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c', + 'timestamp': 1592654400, + 'upload_date': '20200620', + 'duration': 1430, + 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + # PR4 audition - other frontend + 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301', + 'info_dict': { + 'id': '2610977', + 'ext': 'mp3', + 'title': 'Pogłos 29 października godz. 
23:01', }, - 'playlist_mincount': 12, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 'only_matching': True, @@ -57,6 +109,9 @@ class PolskieRadioIE(InfoExtractor): # with mp4 video 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', 'only_matching': True, + }, { + 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', + 'only_matching': True, }] def _real_extract(self, url): @@ -66,38 +121,37 @@ def _real_extract(self, url): content = self._search_regex( r'(?s)]+class="\s*this-article\s*"[^>]*>(.+?)]+class="tags"[^>]*>', - webpage, 'content') + webpage, 'content', default=None) timestamp = unified_timestamp(self._html_search_regex( r'(?s)]+id="datetime2"[^>]*>(.+?)', - webpage, 'timestamp', fatal=False)) + webpage, 'timestamp', default=None)) - thumbnail_url = self._og_search_thumbnail(webpage) - - entries = [] - - media_urls = set() - - for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): - media = self._parse_json(data_media, playlist_id, fatal=False) - if not media.get('file') or not media.get('desc'): - continue - media_url = self._proto_relative_url(media['file'], 'http:') - if media_url in media_urls: - continue - media_urls.add(media_url) - entries.append({ - 'id': compat_str(media['id']), - 'url': media_url, - 'title': compat_urllib_parse_unquote(media['desc']), - 'duration': int_or_none(media.get('length')), - 'vcodec': 'none' if media.get('provider') == 'audio' else None, - 'timestamp': timestamp, - 'thumbnail': thumbnail_url - }) + thumbnail_url = self._og_search_thumbnail(webpage, default=None) title = self._og_search_title(webpage).strip() - description = strip_or_none(self._og_search_description(webpage)) + + description = strip_or_none(self._og_search_description(webpage, default=None)) + description = description.replace('\xa0', ' 
') if description is not None else None + + if not content: + return { + 'id': playlist_id, + 'url': self._proto_relative_url( + self._search_regex( + r"source:\s*'(//static\.prsa\.pl/[^']+)'", + webpage, 'audition record url')), + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url, + } + + entries = self._extract_webpage_player_entries(content, playlist_id, { + 'title': title, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url, + }) return self.playlist_result(entries, playlist_id, title, description) @@ -178,3 +232,201 @@ def _real_extract(self, url): return self.playlist_result( self._entries(url, webpage, category_id), category_id, title) + + +class PolskieRadioPlayerIE(InfoExtractor): + IE_NAME = 'polskieradio:player' + _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P[^/]+)' + + _BASE_URL = 'https://player.polskieradio.pl' + _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js' + _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje' + + _TESTS = [{ + 'url': 'https://player.polskieradio.pl/anteny/trojka', + 'info_dict': { + 'id': '3', + 'ext': 'm4a', + 'title': 'Trójka', + }, + 'params': { + 'format': 'bestaudio', + 'skip_download': 'endless stream', + }, + }] + + def _get_channel_list(self, channel_url='no_channel'): + player_code = self._download_webpage( + self._PLAYER_URL, channel_url, + note='Downloading js player') + channel_list = js_to_json(self._search_regex( + r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list')) + return self._parse_json(channel_list, channel_url) + + def _real_extract(self, url): + channel_url = self._match_id(url) + channel_list = self._get_channel_list(channel_url) + + channel = next((c for c in channel_list if c.get('url') == channel_url), None) + + if not channel: + raise ExtractorError('Channel not found') + + station_list = self._download_json(self._STATIONS_API_URL, channel_url, + note='Downloading stream url list', + headers={ + 
'Accept': 'application/json', + 'Referer': url, + 'Origin': self._BASE_URL, + }) + station = next((s for s in station_list + if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None) + if not station: + raise ExtractorError('Station not found even though we extracted channel') + + formats = [] + for stream_url in station['Streams']: + stream_url = self._proto_relative_url(stream_url) + if stream_url.endswith('/playlist.m3u8'): + formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True)) + elif stream_url.endswith('/manifest.f4m'): + formats.extend(self._extract_mpd_formats(stream_url, channel_url)) + elif stream_url.endswith('/Manifest'): + formats.extend(self._extract_ism_formats(stream_url, channel_url)) + else: + formats.append({ + 'url': stream_url, + }) + + self._sort_formats(formats) + + return { + 'id': compat_str(channel['id']), + 'formats': formats, + 'title': channel.get('name') or channel.get('streamName'), + 'display_id': channel_url, + 'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png', + 'is_live': True, + } + + +class PolskieRadioPodcastBaseExtractor(InfoExtractor): + _API_BASE = 'https://apipodcasts.polskieradio.pl/api' + + def _parse_episode(self, data): + return { + 'id': data['guid'], + 'formats': [{ + 'url': data['url'], + 'filesize': int_or_none(data.get('fileSize')), + }], + 'title': data['title'], + 'description': data.get('description'), + 'duration': int_or_none(data.get('length')), + 'timestamp': parse_iso8601(data.get('publishDate')), + 'thumbnail': url_or_none(data.get('image')), + 'series': data.get('podcastTitle'), + 'episode': data['title'], + } + + +class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): + IE_NAME = 'polskieradio:podcast:list' + _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://podcasty.polskieradio.pl/podcast/8/', + 'info_dict': { + 'id': '8', + 'title': 'Śniadanie w Trójce', + 'description': 
'md5:57abcc27bc4c6a6b25baa3061975b9ef', + 'uploader': 'Beata Michniewicz', + }, + 'playlist_mincount': 714, + }] + _PAGE_SIZE = 10 + + def _call_api(self, podcast_id, page): + return self._download_json( + f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}', + podcast_id, f'Downloading page {page}') + + def _real_extract(self, url): + podcast_id = self._match_id(url) + data = self._call_api(podcast_id, 1) + + def get_page(page_num): + page_data = self._call_api(podcast_id, page_num + 1) if page_num else data + yield from (self._parse_episode(ep) for ep in page_data['items']) + + return { + '_type': 'playlist', + 'entries': InAdvancePagedList( + get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE), + 'id': str(data['id']), + 'title': data['title'], + 'description': data.get('description'), + 'uploader': data.get('announcer'), + } + + +class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): + IE_NAME = 'polskieradio:podcast' + _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})' + _TESTS = [{ + 'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32', + 'info_dict': { + 'id': '6eafe403-cb8f-4756-b896-4455c3713c32', + 'ext': 'mp3', + 'title': 'Theresa May rezygnuje. 
Co dalej z brexitem?', + 'description': 'md5:e41c409a29d022b70ef0faa61dbded60', + }, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + data = self._download_json( + f'{self._API_BASE}/audio', + podcast_id, 'Downloading podcast metadata', + data=json.dumps({ + 'guids': [podcast_id], + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + return self._parse_episode(data[0]) + + +class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor): + _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P[0-9]+)' + IE_NAME = 'polskieradio:kierowcow' + + _TESTS = [{ + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2', + }, + 'playlist_count': 3, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId'] + article = self._download_json( + f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}', + media_id) + data = article['pageProps']['data'] + title = data['title'] + entries = self._extract_webpage_player_entries(data['content'], media_id, { + 'title': title, + }) + + return { + '_type': 'playlist', + 'id': media_id, + 'entries': entries, + 'title': title, + 'description': data.get('lead'), + } diff --git a/yt_dlp/extractor/pornflip.py b/yt_dlp/extractor/pornflip.py index d0aefa2dd5..accf45269b 100644 --- a/yt_dlp/extractor/pornflip.py +++ b/yt_dlp/extractor/pornflip.py @@ -29,7 +29,6 @@ class PornFlipIE(InfoExtractor): 'age_limit': 18, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 6d894affd9..4357c79df5 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -258,8 +258,7 @@ def 
_extract_urls(webpage): webpage) def _extract_count(self, pattern, webpage, name): - return str_to_int(self._search_regex( - pattern, webpage, '%s count' % name, fatal=False)) + return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) def _real_extract(self, url): mobj = self._match_valid_url(url) diff --git a/yt_dlp/extractor/projectveritas.py b/yt_dlp/extractor/projectveritas.py new file mode 100644 index 0000000000..9e9867ba5d --- /dev/null +++ b/yt_dlp/extractor/projectveritas.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_strdate, +) + + +class ProjectVeritasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?projectveritas\.com/(?Pnews|video)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.projectveritas.com/news/exclusive-inside-the-new-york-and-new-jersey-hospitals-battling-coronavirus/', + 'info_dict': { + 'id': '51910aab-365a-5cf1-88f2-8eb1ca5fd3c6', + 'ext': 'mp4', + 'title': 'Exclusive: Inside The New York and New Jersey Hospitals Battling Coronavirus', + 'upload_date': '20200327', + 'thumbnail': 'md5:6076477fe50b03eb8708be9415e18e1c', + } + }, { + 'url': 'https://www.projectveritas.com/video/ilhan-omar-connected-ballot-harvester-in-cash-for-ballots-scheme-car-is-full/', + 'info_dict': { + 'id': 'c5aab304-a56b-54b1-9f0b-03b77bc5f2f6', + 'ext': 'mp4', + 'title': 'Ilhan Omar connected Ballot Harvester in cash-for-ballots scheme: "Car is full" of absentee ballots', + 'upload_date': '20200927', + 'thumbnail': 'md5:194b8edf0e2ba64f25500ff4378369a4', + } + }] + + def _real_extract(self, url): + id, type = self._match_valid_url(url).group('id', 'type') + api_url = f'https://www.projectveritas.com/page-data/{type}/{id}/page-data.json' + data_json = self._download_json(api_url, id)['result']['data'] + main_data = traverse_obj(data_json, 'video', 'post') + video_id = main_data['id'] + thumbnail 
= traverse_obj(main_data, ('image', 'ogImage', 'src')) + mux_asset = traverse_obj(main_data, + 'muxAsset', ('body', 'json', 'content', ..., 'data', 'target', 'fields', 'muxAsset'), + get_all=False, expected_type=dict) + if not mux_asset: + raise ExtractorError('No video on the provided url.', expected=True) + playback_id = traverse_obj(mux_asset, 'playbackId', ('en-US', 'playbackId')) + formats = self._extract_m3u8_formats(f'https://stream.mux.com/{playback_id}.m3u8', video_id) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': main_data['title'], + 'upload_date': unified_strdate(main_data.get('date')), + 'thumbnail': thumbnail.replace('//', ''), + 'formats': formats, + } diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py new file mode 100644 index 0000000000..1e60de1539 --- /dev/null +++ b/yt_dlp/extractor/radiko.py @@ -0,0 +1,234 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import base64 +import calendar +import datetime + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + update_url_query, + clean_html, + unified_timestamp, +) +from ..compat import compat_urllib_parse + + +class RadikoBaseIE(InfoExtractor): + _FULL_KEY = None + + def _auth_client(self): + auth_cache = self._downloader.cache.load('radiko', 'auth_data') + if auth_cache: + return auth_cache + + _, auth1_handle = self._download_webpage_handle( + 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', + headers={ + 'x-radiko-app': 'pc_html5', + 'x-radiko-app-version': '0.0.1', + 'x-radiko-device': 'pc', + 'x-radiko-user': 'dummy_user', + }) + auth1_header = auth1_handle.info() + + auth_token = auth1_header['X-Radiko-AuthToken'] + kl = int(auth1_header['X-Radiko-KeyLength']) + ko = int(auth1_header['X-Radiko-KeyOffset']) + raw_partial_key = self._extract_full_key()[ko:ko + kl] + partial_key = base64.b64encode(raw_partial_key).decode() + + area_id = self._download_webpage( + 
'https://radiko.jp/v2/api/auth2', None, 'Authenticating', + headers={ + 'x-radiko-device': 'pc', + 'x-radiko-user': 'dummy_user', + 'x-radiko-authtoken': auth_token, + 'x-radiko-partialkey': partial_key, + }).split(',')[0] + + auth_data = (auth_token, area_id) + self._downloader.cache.store('radiko', 'auth_data', auth_data) + return auth_data + + def _extract_full_key(self): + if self._FULL_KEY: + return self._FULL_KEY + + jscode = self._download_webpage( + 'https://radiko.jp/apps/js/playerCommon.js', None, + note='Downloading player js code') + full_key = self._search_regex( + (r"RadikoJSPlayer\([^,]*,\s*(['\"])pc_html5\1,\s*(['\"])(?P[0-9a-f]+)\2,\s*{"), + jscode, 'full key', fatal=False, group='fullkey') + + if full_key: + full_key = full_key.encode() + else: # use full key ever known + full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa' + + self._FULL_KEY = full_key + return full_key + + def _find_program(self, video_id, station, cursor): + station_program = self._download_xml( + 'https://radiko.jp/v3/program/station/weekly/%s.xml' % station, video_id, + note='Downloading radio program for %s station' % station) + + prog = None + for p in station_program.findall('.//prog'): + ft_str, to_str = p.attrib['ft'], p.attrib['to'] + ft = unified_timestamp(ft_str, False) + to = unified_timestamp(to_str, False) + if ft <= cursor and cursor < to: + prog = p + break + if not prog: + raise ExtractorError('Cannot identify radio program to download!') + assert ft, to + return prog, station_program, ft, ft_str, to_str + + def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, area_id, query): + m3u8_playlist_data = self._download_xml( + 'https://radiko.jp/v3/station/stream/pc_html5/%s.xml' % station, video_id, + note='Downloading m3u8 information') + m3u8_urls = m3u8_playlist_data.findall('.//url') + + formats = [] + found = set() + for url_tag in m3u8_urls: + pcu = url_tag.find('playlist_create_url') + url_attrib = url_tag.attrib + playlist_url = 
update_url_query(pcu.text, { + 'station_id': station, + **query, + 'l': '15', + 'lsid': '77d0678df93a1034659c14d6fc89f018', + 'type': 'b', + }) + if playlist_url in found: + continue + else: + found.add(playlist_url) + + time_to_skip = None if is_onair else cursor - ft + + subformats = self._extract_m3u8_formats( + playlist_url, video_id, ext='m4a', + live=True, fatal=False, m3u8_id=None, + headers={ + 'X-Radiko-AreaId': area_id, + 'X-Radiko-AuthToken': auth_token, + }) + for sf in subformats: + domain = sf['format_id'] = compat_urllib_parse.urlparse(sf['url']).netloc + if re.match(r'^[cf]-radiko\.smartstream\.ne\.jp$', domain): + # Prioritize live radio vs playback based on extractor + sf['preference'] = 100 if is_onair else -100 + if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: + sf['_ffmpeg_args'] = ['-ss', time_to_skip] + formats.extend(subformats) + + self._sort_formats(formats) + return formats + + +class RadikoIE(RadikoBaseIE): + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P[A-Z0-9-]+)/(?P\d+)' + + _TESTS = [{ + # QRR (文化放送) station provides + 'url': 'https://radiko.jp/#!/ts/QRR/20210425101300', + 'only_matching': True, + }, { + # FMT (TOKYO FM) station does not provide + 'url': 'https://radiko.jp/#!/ts/FMT/20210810150000', + 'only_matching': True, + }, { + 'url': 'https://radiko.jp/#!/ts/JOAK-FM/20210509090000', + 'only_matching': True, + }] + + def _real_extract(self, url): + station, video_id = self._match_valid_url(url).groups() + vid_int = unified_timestamp(video_id, False) + + auth_token, area_id = self._auth_client() + + prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) + + title = prog.find('title').text + description = clean_html(prog.find('info').text) + station_name = station_program.find('.//name').text + + formats = self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 
'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id, + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': station_name, + 'uploader_id': station, + 'timestamp': vid_int, + 'formats': formats, + 'is_live': True, + } + + +class RadikoRadioIE(RadikoBaseIE): + _VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/live/(?P[A-Z0-9-]+)' + + _TESTS = [{ + # QRR (文化放送) station provides + 'url': 'https://radiko.jp/#!/live/QRR', + 'only_matching': True, + }, { + # FMT (TOKYO FM) station does not provide + 'url': 'https://radiko.jp/#!/live/FMT', + 'only_matching': True, + }, { + 'url': 'https://radiko.jp/#!/live/JOAK-FM', + 'only_matching': True, + }] + + def _real_extract(self, url): + station = self._match_id(url) + self.report_warning('Downloader will not stop at the end of the program! Press Ctrl+C to stop') + + auth_token, area_id = self._auth_client() + # get current time in JST (GMT+9:00 w/o DST) + vid_now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9))) + vid_now = calendar.timegm(vid_now.timetuple()) + + prog, station_program, ft, _, _ = self._find_program(station, station, vid_now) + + title = prog.find('title').text + description = clean_html(prog.find('info').text) + station_name = station_program.find('.//name').text + + formats = self._extract_formats( + video_id=station, station=station, is_onair=True, + ft=ft, cursor=vid_now, auth_token=auth_token, area_id=area_id, + query={}) + + return { + 'id': station, + 'title': title, + 'description': description, + 'uploader': station_name, + 'uploader_id': station, + 'timestamp': ft, + 'formats': formats, + 'is_live': True, + } diff --git a/yt_dlp/extractor/radiode.py b/yt_dlp/extractor/radiode.py index 2c06c8b1e4..0382873637 100644 --- a/yt_dlp/extractor/radiode.py +++ b/yt_dlp/extractor/radiode.py @@ -29,7 +29,7 @@ def _real_extract(self, url): webpage, 'broadcast') broadcast = self._parse_json(jscode, 
radio_id) - title = self._live_title(broadcast['name']) + title = broadcast['name'] description = broadcast.get('description') or broadcast.get('shortDescription') thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100') diff --git a/yt_dlp/extractor/radiokapital.py b/yt_dlp/extractor/radiokapital.py new file mode 100644 index 0000000000..2e93e034f7 --- /dev/null +++ b/yt_dlp/extractor/radiokapital.py @@ -0,0 +1,99 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + clean_html, + traverse_obj, + unescapeHTML, +) + +import itertools +from urllib.parse import urlencode + + +class RadioKapitalBaseIE(InfoExtractor): + def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs={}): + return self._download_json( + f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs)}', + video_id, note=note) + + def _parse_episode(self, data): + release = '%s%s%s' % (data['published'][6:11], data['published'][3:6], data['published'][:3]) + return { + '_type': 'url_transparent', + 'url': data['mixcloud_url'], + 'ie_key': 'Mixcloud', + 'title': unescapeHTML(data['title']), + 'description': clean_html(data.get('content')), + 'tags': traverse_obj(data, ('tags', ..., 'name')), + 'release_date': release, + 'series': traverse_obj(data, ('show', 'title')), + } + + +class RadioKapitalIE(RadioKapitalBaseIE): + IE_NAME = 'radiokapital' + _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/[a-z\d-]+/(?P[a-z\d-]+)' + + _TESTS = [{ + 'url': 'https://radiokapital.pl/shows/tutaj-sa-smoki/5-its-okay-to-be-immaterial', + 'info_dict': { + 'id': 'radiokapital_radio-kapitał-tutaj-są-smoki-5-its-okay-to-be-immaterial-2021-05-20', + 'ext': 'm4a', + 'title': '#5: It’s okay to\xa0be\xa0immaterial', + 'description': 'md5:2499da5fbfb0e88333b7d37ec8e9e4c4', + 'uploader': 'Radio Kapitał', + 'uploader_id': 'radiokapital', + 'timestamp': 1621640164, + 'upload_date': '20210521', + }, + }] + + 
def _real_extract(self, url): + video_id = self._match_id(url) + + episode = self._call_api('episodes/%s' % video_id, video_id) + return self._parse_episode(episode) + + +class RadioKapitalShowIE(RadioKapitalBaseIE): + IE_NAME = 'radiokapital:show' + _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/(?P[a-z\d-]+)/?(?:$|[?#])' + + _TESTS = [{ + 'url': 'https://radiokapital.pl/shows/wesz', + 'info_dict': { + 'id': '100', + 'title': 'WĘSZ', + 'description': 'md5:3a557a1e0f31af612b0dcc85b1e0ca5c', + }, + 'playlist_mincount': 17, + }] + + def _get_episode_list(self, series_id, page_no): + return self._call_api( + 'episodes', series_id, + f'Downloading episode list page #{page_no}', qs={ + 'show': series_id, + 'page': page_no, + }) + + def _entries(self, series_id): + for page_no in itertools.count(1): + episode_list = self._get_episode_list(series_id, page_no) + yield from (self._parse_episode(ep) for ep in episode_list['items']) + if episode_list['next'] is None: + break + + def _real_extract(self, url): + series_id = self._match_id(url) + + show = self._call_api(f'shows/{series_id}', series_id, 'Downloading show metadata') + entries = self._entries(series_id) + return { + '_type': 'playlist', + 'entries': entries, + 'id': str(show['id']), + 'title': show.get('title'), + 'description': clean_html(show.get('content')), + } diff --git a/yt_dlp/extractor/radiozet.py b/yt_dlp/extractor/radiozet.py new file mode 100644 index 0000000000..2e1ff36c2f --- /dev/null +++ b/yt_dlp/extractor/radiozet.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + strip_or_none, +) + + +class RadioZetPodcastIE(InfoExtractor): + _VALID_URL = r'https?://player\.radiozet\.pl\/Podcasty/.*?/(?P.+)' + _TEST = { + 'url': 'https://player.radiozet.pl/Podcasty/Nie-Ma-Za-Co/O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'md5': 'e03665c316b4fbc5f6a8f232948bbba3', + 'info_dict': { + 'id': '42154', + 'display_id': 
'O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'title': 'O przedmiotach szkolnych, które przydają się w życiu', + 'description': 'md5:fa72bed49da334b09e5b2f79851f185c', + 'release_timestamp': 1592985480, + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 83, + 'series': 'Nie Ma Za Co', + 'creator': 'Katarzyna Pakosińska', + } + } + + def _call_api(self, podcast_id, display_id): + return self._download_json( + f'https://player.radiozet.pl/api/podcasts/getPodcast/(node)/{podcast_id}/(station)/radiozet', + display_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + podcast_id = self._html_search_regex(r'feature|episode)/(?P[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/feature/dc5acfbc-761b-4bec-9564-df999905116a', + 'md5': '6219d5d31d52de87d21c9cf5b7cb27ff', + 'info_dict': { + 'id': 'dc5acfbc-761b-4bec-9564-df999905116a', + 'ext': 'mp4', + 'title': 'Deathpact - Digital Mirage 2 [Full Set]', + 'language': 'en', + 'thumbnail': 'https://static.12core.net/cb65ae077a079c68380e38f387fbc438.png', + 'description': '', + 'release_timestamp': 1600185600.0, + 'channel': 'Proximity', + 'channel_id': '9ce6dd01-70a4-4d59-afb6-d01f807cd009', + 'channel_url': 'https://rad.live/content/channel/9ce6dd01-70a4-4d59-afb6-d01f807cd009', + } + }, { + 'url': 'https://rad.live/content/episode/bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf', + 'md5': '40b2175f347592125d93e9a344080125', + 'info_dict': { + 'id': 'bbcf66ec-0d02-4ca0-8dc0-4213eb2429bf', + 'ext': 'mp4', + 'title': 'E01: Bad Jokes 1', + 'language': 'en', + 'thumbnail': 'https://lsp.littlstar.com/channels/WHISTLE/BAD_JOKES/SEASON_1/BAD_JOKES_101/poster.jpg', + 'description': 'Bad Jokes - Champions, Adam Pally, Super Troopers, Team Edge and 2Hype', + 'release_timestamp': None, + 'channel': None, + 'channel_id': None, + 'channel_url': None, + 'episode': 'E01: Bad Jokes 1', + 'episode_number': 1, + 'episode_id': '336', + }, + 
}] + + def _real_extract(self, url): + content_type, video_id = self._match_valid_url(url).groups() + + webpage = self._download_webpage(url, video_id) + + content_info = json.loads(self._search_regex( + r']*type=([\'"])application/json\1[^>]*>(?P{.+?})', + webpage, 'video info', group='json'))['props']['pageProps']['initialContentData'] + video_info = content_info[content_type] + + if not video_info: + raise ExtractorError('Unable to extract video info, make sure the URL is valid') + + formats = self._extract_m3u8_formats(video_info['assets']['videos'][0]['url'], video_id) + self._sort_formats(formats) + + data = video_info.get('structured_data', {}) + + release_date = unified_timestamp(traverse_obj(data, ('releasedEvent', 'startDate'))) + channel = next(iter(content_info.get('channels', [])), {}) + channel_id = channel.get('lrn', '').split(':')[-1] or None + + result = { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'language': traverse_obj(data, ('potentialAction', 'target', 'inLanguage')), + 'thumbnail': traverse_obj(data, ('image', 'contentUrl')), + 'description': data.get('description'), + 'release_timestamp': release_date, + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': f'https://rad.live/content/channel/{channel_id}' if channel_id else None, + + } + if content_type == 'episode': + result.update({ + # TODO: Get season number when downloading single episode + 'episode': video_info.get('title'), + 'episode_number': video_info.get('number'), + 'episode_id': video_info.get('id'), + }) + + return result + + +class RadLiveSeasonIE(RadLiveIE): + IE_NAME = 'radlive:season' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/season/(?P[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/season/08a290f7-c9ef-4e22-9105-c255995a2e75', + 'md5': '40b2175f347592125d93e9a344080125', + 'info_dict': { + 'id': '08a290f7-c9ef-4e22-9105-c255995a2e75', + 'title': 'Bad Jokes - Season 1', + }, + 
'playlist_mincount': 5, + }] + + @classmethod + def suitable(cls, url): + return False if RadLiveIE.suitable(url) else super(RadLiveSeasonIE, cls).suitable(url) + + def _real_extract(self, url): + season_id = self._match_id(url) + webpage = self._download_webpage(url, season_id) + + content_info = json.loads(self._search_regex( + r']*type=([\'"])application/json\1[^>]*>(?P{.+?})', + webpage, 'video info', group='json'))['props']['pageProps']['initialContentData'] + video_info = content_info['season'] + + entries = [{ + '_type': 'url_transparent', + 'id': episode['structured_data']['url'].split('/')[-1], + 'url': episode['structured_data']['url'], + 'series': try_get(content_info, lambda x: x['series']['title']), + 'season': video_info['title'], + 'season_number': video_info.get('number'), + 'season_id': video_info.get('id'), + 'ie_key': RadLiveIE.ie_key(), + } for episode in video_info['episodes']] + + return self.playlist_result(entries, season_id, video_info.get('title')) + + +class RadLiveChannelIE(RadLiveIE): + IE_NAME = 'radlive:channel' + _VALID_URL = r'https?://(?:www\.)?rad\.live/content/channel/(?P[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://rad.live/content/channel/5c4d8df4-6fa0-413c-81e3-873479b49274', + 'md5': '625156a08b7f2b0b849f234e664457ac', + 'info_dict': { + 'id': '5c4d8df4-6fa0-413c-81e3-873479b49274', + 'title': 'Whistle Sports', + }, + 'playlist_mincount': 7, + }] + + _QUERY = ''' +query WebChannelListing ($lrn: ID!) 
{ + channel (id:$lrn) { + name + features { + structured_data + } + } +}''' + + @classmethod + def suitable(cls, url): + return False if RadLiveIE.suitable(url) else super(RadLiveChannelIE, cls).suitable(url) + + def _real_extract(self, url): + channel_id = self._match_id(url) + + graphql = self._download_json( + 'https://content.mhq.12core.net/graphql', channel_id, + headers={'Content-Type': 'application/json'}, + data=json.dumps({ + 'query': self._QUERY, + 'variables': {'lrn': f'lrn:12core:media:content:channel:{channel_id}'} + }).encode('utf-8')) + + data = traverse_obj(graphql, ('data', 'channel')) + if not data: + raise ExtractorError('Unable to extract video info, make sure the URL is valid') + + entries = [{ + '_type': 'url_transparent', + 'url': feature['structured_data']['url'], + 'ie_key': RadLiveIE.ie_key(), + } for feature in data['features']] + + return self.playlist_result(entries, channel_id, data.get('name')) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index 27cd018012..39e57decd5 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -14,12 +14,16 @@ find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, + get_element_by_class, HEADRequest, int_or_none, + join_nonempty, parse_duration, + parse_list, remove_start, strip_or_none, try_get, + unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -135,6 +139,9 @@ def test_url(url): return False if resp.url == url else resp.url return None + # filter out audio-only formats + fmts = [f for f in fmts if not f.get('vcodec') == 'none'] + def get_format_info(tbr): import math br = int_or_none(tbr) @@ -226,7 +233,7 @@ class RaiPlayIE(RaiBaseIE): 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': 
r're:^https?://.*\.jpg$', 'uploader': 'Rai Gulp', @@ -234,7 +241,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'Report', 'season': '2013/14', 'subtitles': { - 'it': 'count:2', + 'it': 'count:4', }, }, 'params': { @@ -242,18 +249,18 @@ class RaiPlayIE(RaiBaseIE): }, }, { # 1080p direct mp4 url - 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html', - 'md5': '2e501e8651d72f05ffe8f5d286ad560b', + 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', + 'md5': 'aeda7243115380b2dd5e881fd42d949a', 'info_dict': { - 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642', + 'id': 'b1255a4a-8e72-4a2f-b9f3-fc1308e00736', 'ext': 'mp4', - 'title': 'Leonardo - S1E1', - 'alt_title': 'St 1 Ep 1 - Episodio 1', - 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31', + 'title': 'Blanca - S1E1 - Senza occhi', + 'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi', + 'description': 'md5:75f95d5c030ec8bac263b1212322e28c', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Rai 1', - 'duration': 3229, - 'series': 'Leonardo', + 'duration': 6493, + 'series': 'Blanca', 'season': 'Season 1', }, }, { @@ -306,12 +313,13 @@ def _real_extract(self, url): program_info = media.get('program_info') or {} season = media.get('season') + alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') + info = { 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'display_id': video_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'alt_title': strip_or_none(media.get('subtitle')), + 'title': title, + 'alt_title': strip_or_none(alt_title), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), 'creator': strip_or_none(media.get('editor') or None), @@ -585,3 +593,84 @@ def _real_extract(self, url): info.update(relinker_info) return info + + +class RaiPlayRadioBaseIE(InfoExtractor): + _BASE = 
'https://www.raiplayradio.it' + + def get_playlist_iter(self, url, uid): + webpage = self._download_webpage(url, uid) + for attrs in parse_list(webpage): + title = attrs['data-title'].strip() + audio_url = urljoin(url, attrs['data-mediapolis']) + entry = { + 'url': audio_url, + 'id': attrs['data-uniquename'].lstrip('ContentItem-'), + 'title': title, + 'ext': 'mp3', + 'language': 'it', + } + if 'data-image' in attrs: + entry['thumbnail'] = urljoin(url, attrs['data-image']) + yield entry + + +class RaiPlayRadioIE(RaiPlayRadioBaseIE): + _VALID_URL = r'%s/audio/.+?-(?P%s)\.html' % ( + RaiPlayRadioBaseIE._BASE, RaiBaseIE._UUID_RE) + _TEST = { + 'url': 'https://www.raiplayradio.it/audio/2019/07/RADIO3---LEZIONI-DI-MUSICA-36b099ff-4123-4443-9bf9-38e43ef5e025.html', + 'info_dict': { + 'id': '36b099ff-4123-4443-9bf9-38e43ef5e025', + 'ext': 'mp3', + 'title': 'Dal "Chiaro di luna" al "Clair de lune", prima parte con Giovanni Bietti', + 'thumbnail': r're:^https?://.*\.jpg$', + 'language': 'it', + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + list_url = url.replace('.html', '-list.html') + return next(entry for entry in self.get_playlist_iter(list_url, audio_id) if entry['id'] == audio_id) + + +class RaiPlayRadioPlaylistIE(RaiPlayRadioBaseIE): + _VALID_URL = r'%s/playlist/.+?-(?P%s)\.html' % ( + RaiPlayRadioBaseIE._BASE, RaiBaseIE._UUID_RE) + _TEST = { + 'url': 'https://www.raiplayradio.it/playlist/2017/12/Alice-nel-paese-delle-meraviglie-72371d3c-d998-49f3-8860-d168cfdf4966.html', + 'info_dict': { + 'id': '72371d3c-d998-49f3-8860-d168cfdf4966', + 'title': "Alice nel paese delle meraviglie", + 'description': "di Lewis Carrol letto da Aldo Busi", + }, + 'playlist_count': 11, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_webpage = self._download_webpage(url, playlist_id) + playlist_title = unescapeHTML(self._html_search_regex( + r'data-playlist-title="(.+?)"', playlist_webpage, 'title')) + playlist_creator = 
self._html_search_meta( + 'nomeProgramma', playlist_webpage) + playlist_description = get_element_by_class( + 'textDescriptionProgramma', playlist_webpage) + + player_href = self._html_search_regex( + r'data-player-href="(.+?)"', playlist_webpage, 'href') + list_url = urljoin(url, player_href) + + entries = list(self.get_playlist_iter(list_url, playlist_id)) + for index, entry in enumerate(entries, start=1): + entry.update({ + 'track': entry['title'], + 'track_number': index, + 'artist': playlist_creator, + 'album': playlist_title + }) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description, + creator=playlist_creator) diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py index 31d9779dd4..ac42e58d9c 100644 --- a/yt_dlp/extractor/rcti.py +++ b/yt_dlp/extractor/rcti.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import json import random import time @@ -12,6 +11,7 @@ dict_get, ExtractorError, strip_or_none, + traverse_obj, try_get ) @@ -26,7 +26,7 @@ def _call_api(self, url, video_id, note=None): json = self._download_json( url, video_id, note=note, headers={'Authorization': self._AUTH_KEY}) if json.get('status', {}).get('code', 0) != 0: - raise ExtractorError('%s said: %s' % (self.IE_NAME, json["status"]["message_client"]), cause=json) + raise ExtractorError(f'{self.IE_NAME} said: {json["status"]["message_client"]}', cause=json) return json.get('data'), json.get('meta') @@ -85,9 +85,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): 'series': 'iNews Malam', 'channel': 'INews', }, - 'params': { - 'format': 'bestvideo', - }, }, { # Missed event/replay 'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib', 'md5': '649c5f27250faed1452ca8b91e06922d', @@ -132,7 +129,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }] _CONVIVA_JSON_TEMPLATE = { @@ -227,18 +223,30 @@ def _real_extract(self, url): 
class RCTIPlusSeriesIE(RCTIPlusBaseIE): - _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P\d+)/(?P[^/?#&]+)' + _VALID_URL = r'https://www\.rctiplus\.com/programs/(?P\d+)/(?P[^/?#&]+)(?:/(?Pepisodes|extras|clips))?' _TESTS = [{ - 'url': 'https://www.rctiplus.com/programs/540/upin-ipin', - 'playlist_mincount': 417, + 'url': 'https://www.rctiplus.com/programs/829/putri-untuk-pangeran', + 'playlist_mincount': 1019, 'info_dict': { - 'id': '540', - 'title': 'Upin & Ipin', - 'description': 'md5:22cc912381f389664416844e1ec4f86b', + 'id': '829', + 'title': 'Putri Untuk Pangeran', + 'description': 'md5:aca7b54d05bd95a67d4f4613cc1d622d', + 'age_limit': 2, + 'cast': ['Verrel Bramasta', 'Ranty Maria', 'Riza Syah', 'Ivan Fadilla', 'Nicole Parham', 'Dll', 'Aviv Elham'], + 'display_id': 'putri-untuk-pangeran', + 'tag': 'count:18', }, - }, { - 'url': 'https://www.rctiplus.com/programs/540/upin-ipin/episodes?utm_source=Rplusdweb&utm_medium=share_copy&utm_campaign=programsupin-ipin', - 'only_matching': True, + }, { # No episodes + 'url': 'https://www.rctiplus.com/programs/615/inews-pagi', + 'playlist_mincount': 388, + 'info_dict': { + 'id': '615', + 'title': 'iNews Pagi', + 'description': 'md5:f18ee3d4643cfb41c358e5a9b693ee04', + 'age_limit': 2, + 'tag': 'count:11', + 'display_id': 'inews-pagi', + } }] _AGE_RATINGS = { # Based off https://id.wikipedia.org/wiki/Sistem_rating_konten_televisi with additional ratings 'S-SU': 2, @@ -273,47 +281,63 @@ def _entries(self, url, display_id=None, note='Downloading entries JSON', metada display_id, '%s page %s' % (note, page_num))[0] or [] for video_json in episode_list: - link = video_json['share_link'] - url_res = self.url_result(link, 'RCTIPlus', video_json.get('product_id'), video_json.get('title')) - url_res.update(metadata) - yield url_res + yield { + '_type': 'url', + 'url': video_json['share_link'], + 'ie_key': RCTIPlusIE.ie_key(), + 'id': video_json.get('product_id'), + 'title': video_json.get('title'), + 'display_id': 
video_json.get('title_code').replace('_', '-'), + 'description': video_json.get('summary'), + 'timestamp': video_json.get('release_date'), + 'duration': video_json.get('duration'), + 'season_number': video_json.get('season'), + 'episode_number': video_json.get('episode'), + **metadata + } + + def _series_entries(self, series_id, display_id=None, video_type=None, metadata={}): + if not video_type or video_type in 'episodes': + try: + seasons_list = self._call_api( + f'https://api.rctiplus.com/api/v1/program/{series_id}/season', + display_id, 'Downloading seasons list JSON')[0] + except ExtractorError as e: + if 'not found' not in str(e): + raise + seasons_list = [] + for season in seasons_list: + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/episode?season={season["season"]}', + display_id, f'Downloading season {season["season"]} episode entries', metadata) + if not video_type or video_type in 'extras': + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/extra?content_id=0', + display_id, 'Downloading extra entries', metadata) + if not video_type or video_type in 'clips': + yield from self._entries( + f'https://api.rctiplus.com/api/v2/program/{series_id}/clip?content_id=0', + display_id, 'Downloading clip entries', metadata) def _real_extract(self, url): - series_id, display_id = self._match_valid_url(url).groups() + series_id, display_id, video_type = self._match_valid_url(url).group('id', 'display_id', 'type') + if video_type: + self.report_warning( + f'Only {video_type} will be downloaded. 
' + f'To download everything from the series, remove "/{video_type}" from the URL') series_meta, meta_paths = self._call_api( - 'https://api.rctiplus.com/api/v1/program/%s/detail' % series_id, display_id, 'Downloading series metadata') + f'https://api.rctiplus.com/api/v1/program/{series_id}/detail', display_id, 'Downloading series metadata') metadata = { - 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]) + 'age_limit': try_get(series_meta, lambda x: self._AGE_RATINGS[x['age_restriction'][0]['code']]), + 'cast': traverse_obj(series_meta, (('starring', 'creator', 'writer'), ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), + 'tag': traverse_obj(series_meta, ('tag', ..., 'name'), + expected_type=lambda x: strip_or_none(x) or None), } - - cast = [] - for star in series_meta.get('starring', []): - cast.append(strip_or_none(star.get('name'))) - for star in series_meta.get('creator', []): - cast.append(strip_or_none(star.get('name'))) - for star in series_meta.get('writer', []): - cast.append(strip_or_none(star.get('name'))) - metadata['cast'] = cast - - tags = [] - for tag in series_meta.get('tag', []): - tags.append(strip_or_none(tag.get('name'))) - metadata['tag'] = tags - - entries = [] - seasons_list = self._call_api( - 'https://api.rctiplus.com/api/v1/program/%s/season' % series_id, display_id, 'Downloading seasons list JSON')[0] - for season in seasons_list: - entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/episode?season=%s' % (series_id, season['season']), - display_id, 'Downloading season %s episode entries' % season['season'], metadata)) - - entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/clip?content_id=0' % series_id, - display_id, 'Downloading clip entries', metadata)) - entries.append(self._entries('https://api.rctiplus.com/api/v2/program/%s/extra?content_id=0' % series_id, - display_id, 'Downloading extra entries', metadata)) - - return 
self.playlist_result(itertools.chain(*entries), series_id, series_meta.get('title'), series_meta.get('summary'), **metadata) + return self.playlist_result( + self._series_entries(series_id, display_id, video_type, metadata), series_id, + series_meta.get('title'), series_meta.get('summary'), display_id=display_id, **metadata) class RCTIPlusTVIE(RCTIPlusBaseIE): @@ -329,7 +353,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', } }, { # Returned video will always change @@ -350,5 +373,6 @@ def _real_extract(self, url): tv_id = match.get('tvname') or match.get('eventname') webpage = self._download_webpage(url, tv_id) video_type, video_id = self._search_regex( - r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P[^/]+)/(?P\d+)/url', webpage, 'video link', group=('type', 'id')) + r'url\s*:\s*["\']https://api\.rctiplus\.com/api/v./(?P[^/]+)/(?P\d+)/url', + webpage, 'video link', group=('type', 'id')) return self.url_result(f'https://www.rctiplus.com/{video_type}/{video_id}/{tv_id}', 'RCTIPlus') diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 2a1b950bd7..a042a59cc4 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -1,5 +1,4 @@ -from __future__ import unicode_literals - +import random from .common import InfoExtractor from ..utils import ( @@ -9,47 +8,12 @@ try_get, unescapeHTML, url_or_none, + traverse_obj ) class RedditIE(InfoExtractor): - _VALID_URL = r'https?://v\.redd\.it/(?P[^/?#&]+)' - _TEST = { - # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ - 'url': 'https://v.redd.it/zv89llsvexdz', - 'md5': '0a070c53eba7ec4534d95a5a1259e253', - 'info_dict': { - 'id': 'zv89llsvexdz', - 'ext': 'mp4', - 'title': 'zv89llsvexdz', - }, - 'params': { - 'format': 'bestvideo', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - formats = self._extract_m3u8_formats( - 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, 
video_id, - 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - - formats.extend(self._extract_mpd_formats( - 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, - mpd_id='dash', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - -class RedditRIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P[^/?#&]+))' + _VALID_URL = r'https?://(?P[^/]+\.)?reddit(?:media)?\.com/r/(?P[^/]+/comments/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -68,7 +32,6 @@ class RedditRIE(InfoExtractor): 'age_limit': 0, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -94,17 +57,27 @@ class RedditRIE(InfoExtractor): # reddit video @ nm reddit 'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/', 'only_matching': True, + }, { + 'url': 'https://www.redditmedia.com/r/serbia/comments/pu9wbx/ako_vu%C4%8Di%C4%87_izgubi_izbore_ja_%C4%87u_da_crknem/', + 'only_matching': True, }] + @staticmethod + def _gen_session_id(): + id_length = 16 + rand_max = 1 << (id_length * 4) + return '%0.*x' % (id_length, random.randrange(rand_max)) + def _real_extract(self, url): - mobj = self._match_valid_url(url) - url, video_id = mobj.group('url', 'id') - - video_id = self._match_id(url) - - data = self._download_json( - url + '/.json', video_id)[0]['data']['children'][0]['data'] + subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id') + self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id()) + self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D') + data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False) + if not data: + # Fall back to old.reddit.com in case the requested subdomain fails + data 
= self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id) + data = data[0]['data']['children'][0]['data'] video_url = data['url'] # Avoid recursing into the same reddit URL @@ -142,19 +115,53 @@ def add_thumbnail(src): for resolution in resolutions: add_thumbnail(resolution) - return { - '_type': 'url_transparent', - 'url': video_url, + info = { 'title': data.get('title'), 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), - 'duration': int_or_none(try_get( - data, - (lambda x: x['media']['reddit_video']['duration'], - lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), 'age_limit': age_limit, } + + # Check if media is hosted on reddit: + reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) + if reddit_video: + playlist_urls = [ + try_get(reddit_video, lambda x: unescapeHTML(x[y])) + for y in ('dash_url', 'hls_url') + ] + + # Update video_id + display_id = video_id + video_id = self._search_regex( + r'https?://v\.redd\.it/(?P[^/?#&]+)', reddit_video['fallback_url'], + 'video_id', default=display_id) + + dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd' + hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8' + + formats = self._extract_m3u8_formats( + hls_playlist_url, display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + dash_playlist_url, display_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + **info, + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'duration': int_or_none(reddit_video.get('duration')), + } + + # Not hosted on reddit, must continue extraction + return { + **info, + 'display_id': 
video_id, + '_type': 'url_transparent', + 'url': video_url, + } diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py new file mode 100644 index 0000000000..55196b768f --- /dev/null +++ b/yt_dlp/extractor/redgifs.py @@ -0,0 +1,232 @@ +# coding: utf-8 +import functools + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + try_get, + OnDemandPagedList, +) + + +class RedGifsBaseInfoExtractor(InfoExtractor): + _FORMATS = { + 'gif': 250, + 'sd': 480, + 'hd': None, + } + + def _parse_gif_data(self, gif_data): + video_id = gif_data.get('id') + quality = qualities(tuple(self._FORMATS.keys())) + + orig_height = int_or_none(gif_data.get('height')) + aspect_ratio = try_get(gif_data, lambda x: orig_height / x['width']) + + formats = [] + for format_id, height in self._FORMATS.items(): + video_url = gif_data['urls'].get(format_id) + if not video_url: + continue + height = min(orig_height, height or orig_height) + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'width': height * aspect_ratio if aspect_ratio else None, + 'height': height, + 'quality': quality(format_id), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'webpage_url': f'https://redgifs.com/watch/{video_id}', + 'ie_key': RedGifsIE.ie_key(), + 'extractor': 'RedGifs', + 'title': ' '.join(gif_data.get('tags') or []) or 'RedGifs', + 'timestamp': int_or_none(gif_data.get('createDate')), + 'uploader': gif_data.get('userName'), + 'duration': int_or_none(gif_data.get('duration')), + 'view_count': int_or_none(gif_data.get('views')), + 'like_count': int_or_none(gif_data.get('likes')), + 'categories': gif_data.get('tags') or [], + 'tags': gif_data.get('tags'), + 'age_limit': 18, + 'formats': formats, + } + + def _call_api(self, ep, video_id, *args, **kwargs): + data = self._download_json( + f'https://api.redgifs.com/v2/{ep}', video_id, *args, **kwargs) + if 'error' in data: + raise 
ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id) + return data + + def _fetch_page(self, ep, video_id, query, page): + query['page'] = page + 1 + data = self._call_api( + ep, video_id, query=query, note=f'Downloading JSON metadata page {page + 1}') + + for entry in data['gifs']: + yield self._parse_gif_data(entry) + + def _prepare_api_query(self, query, fields): + api_query = [ + (field_name, query.get(field_name, (default,))[0]) + for field_name, default in fields.items()] + + return {key: val for key, val in api_query if val is not None} + + def _paged_entries(self, ep, item_id, query, fields): + page = int_or_none(query.get('page', (None,))[0]) + page_fetcher = functools.partial( + self._fetch_page, ep, item_id, self._prepare_api_query(query, fields)) + return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE) + + +class RedGifsIE(RedGifsBaseInfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/watch/|thumbs2\.redgifs\.com/)(?P[^-/?#\.]+)' + _TESTS = [{ + 'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent', + 'info_dict': { + 'id': 'squeakyhelplesswisent', + 'ext': 'mp4', + 'title': 'Hotwife Legs Thick', + 'timestamp': 1636287915, + 'upload_date': '20211107', + 'uploader': 'ignored52', + 'duration': 16, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0', + 'info_dict': { + 'id': 'squeakyhelplesswisent', + 'ext': 'mp4', + 'title': 'Hotwife Legs Thick', + 'timestamp': 1636287915, + 'upload_date': '20211107', + 'uploader': 'ignored52', + 'duration': 16, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url).lower() + video_info = self._call_api( + f'gifs/{video_id}', video_id, note='Downloading video info') + return self._parse_gif_data(video_info['gif']) + + 
+class RedGifsSearchIE(RedGifsBaseInfoExtractor): + IE_DESC = 'Redgifs search' + _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P[^#]+)' + _PAGE_SIZE = 80 + _TESTS = [ + { + 'url': 'https://www.redgifs.com/browse?tags=Lesbian', + 'info_dict': { + 'id': 'tags=Lesbian', + 'title': 'Lesbian', + 'description': 'RedGifs search for Lesbian, ordered by trending' + }, + 'playlist_mincount': 100, + }, + { + 'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian', + 'info_dict': { + 'id': 'type=g&order=latest&tags=Lesbian', + 'title': 'Lesbian', + 'description': 'RedGifs search for Lesbian, ordered by latest' + }, + 'playlist_mincount': 100, + }, + { + 'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian&page=2', + 'info_dict': { + 'id': 'type=g&order=latest&tags=Lesbian&page=2', + 'title': 'Lesbian', + 'description': 'RedGifs search for Lesbian, ordered by latest' + }, + 'playlist_count': 80, + } + ] + + def _real_extract(self, url): + query_str = self._match_valid_url(url).group('query') + query = compat_parse_qs(query_str) + if not query.get('tags'): + raise ExtractorError('Invalid query tags', expected=True) + + tags = query.get('tags')[0] + order = query.get('order', ('trending',))[0] + + query['search_text'] = [tags] + entries = self._paged_entries('gifs/search', query_str, query, { + 'search_text': None, + 'order': 'trending', + 'type': None, + }) + + return self.playlist_result( + entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}') + + +class RedGifsUserIE(RedGifsBaseInfoExtractor): + IE_DESC = 'Redgifs user' + _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P[^/?#]+)(?:\?(?P[^#]+))?' 
+ _PAGE_SIZE = 30 + _TESTS = [ + { + 'url': 'https://www.redgifs.com/users/lamsinka89', + 'info_dict': { + 'id': 'lamsinka89', + 'title': 'lamsinka89', + 'description': 'RedGifs user lamsinka89, ordered by recent' + }, + 'playlist_mincount': 100, + }, + { + 'url': 'https://www.redgifs.com/users/lamsinka89?page=3', + 'info_dict': { + 'id': 'lamsinka89?page=3', + 'title': 'lamsinka89', + 'description': 'RedGifs user lamsinka89, ordered by recent' + }, + 'playlist_count': 30, + }, + { + 'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g', + 'info_dict': { + 'id': 'lamsinka89?order=best&type=g', + 'title': 'lamsinka89', + 'description': 'RedGifs user lamsinka89, ordered by best' + }, + 'playlist_mincount': 100, + } + ] + + def _real_extract(self, url): + username, query_str = self._match_valid_url(url).group('username', 'query') + playlist_id = f'{username}?{query_str}' if query_str else username + + query = compat_parse_qs(query_str) + order = query.get('order', ('recent',))[0] + + entries = self._paged_entries(f'users/{username}/search', playlist_id, query, { + 'order': 'recent', + 'type': None, + }) + + return self.playlist_result( + entries, playlist_id, username, f'RedGifs user {username}, ordered by {order}') diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py index a1ca791caa..7fee54fee4 100644 --- a/yt_dlp/extractor/redtube.py +++ b/yt_dlp/extractor/redtube.py @@ -17,17 +17,20 @@ class RedTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P[0-9]+)' _TESTS = [{ - 'url': 'http://www.redtube.com/66418', - 'md5': 'fc08071233725f26b8f014dba9590005', + 'url': 'https://www.redtube.com/38864951', + 'md5': '4fba70cbca3aefd25767ab4b523c9878', 'info_dict': { - 'id': '66418', + 'id': '38864951', 'ext': 'mp4', - 'title': 'Sucked on a toilet', - 'upload_date': '20110811', - 'duration': 596, + 'title': 'Public Sex on the Balcony in Freezing Paris! 
Amateur Couple LeoLulu', + 'description': 'Watch video Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu on Redtube, home of free Blowjob porn videos and Blonde sex movies online. Video length: (10:46) - Uploaded by leolulu - Verified User - Starring Pornstar: Leolulu', + 'upload_date': '20210111', + 'timestamp': 1610343109, + 'duration': 646, 'view_count': int, 'age_limit': 18, - } + 'thumbnail': r're:https://\wi-ph\.rdtcdn\.com/videos/.+/.+\.jpg', + }, }, { 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', 'only_matching': True, @@ -84,27 +87,38 @@ def _real_extract(self, url): r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, 'media definitions', default='{}'), video_id, fatal=False) - if medias and isinstance(medias, list): - for media in medias: + for media in medias if isinstance(medias, list) else []: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + format_id = media.get('format') + quality = media.get('quality') + if format_id == 'hls' or (format_id == 'mp4' and not quality): + more_media = self._download_json(format_url, video_id, fatal=False) + else: + more_media = [media] + for media in more_media if isinstance(more_media, list) else []: format_url = url_or_none(media.get('videoUrl')) if not format_url: continue - if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': + format_id = media.get('format') + if format_id == 'hls' or determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', + entry_protocol='m3u8_native', m3u8_id=format_id or 'hls', fatal=False)) continue format_id = media.get('quality') formats.append({ 'url': format_url, + 'ext': 'mp4', 'format_id': format_id, 'height': int_or_none(format_id), }) if not formats: video_url = self._html_search_regex( r'', webpage, 'video URL') - formats.append({'url': video_url}) + formats.append({'url': video_url, 'ext': 
'mp4'}) self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) diff --git a/yt_dlp/extractor/rmcdecouverte.py b/yt_dlp/extractor/rmcdecouverte.py index 422d47ae9f..8bfce34169 100644 --- a/yt_dlp/extractor/rmcdecouverte.py +++ b/yt_dlp/extractor/rmcdecouverte.py @@ -26,7 +26,6 @@ class RMCDecouverteIE(InfoExtractor): 'upload_date': '20210428', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py index 2c815bda63..652fdd116c 100644 --- a/yt_dlp/extractor/roosterteeth.py +++ b/yt_dlp/extractor/roosterteeth.py @@ -1,74 +1,32 @@ # coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + join_nonempty, + LazyList, + parse_qs, str_or_none, + traverse_obj, + url_or_none, urlencode_postdata, + urljoin, + update_url_query, ) -class RoosterTeethIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P[^/?#&]+)' +class RoosterTeethBaseIE(InfoExtractor): _NETRC_MACHINE = 'roosterteeth' - _TESTS = [{ - 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'md5': 'e2bd7764732d785ef797700a2489f212', - 'info_dict': { - 'id': '9156', - 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'ext': 'mp4', - 'title': 'Million Dollars, But... The Game Announcement', - 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', - 'thumbnail': r're:^https?://.*\.png$', - 'series': 'Million Dollars, But...', - 'episode': 'Million Dollars, But... 
The Game Announcement', - }, - }, { - 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', - 'md5': 'fe8d9d976b272c18a24fe7f1f5830084', - 'info_dict': { - 'id': '31', - 'display_id': 'rwby-bonus-25', - 'title': 'Volume 2, World of Remnant 3', - 'description': 'md5:8d58d3270292ea11da00ea712bbfb009', - 'episode': 'Volume 2, World of Remnant 3', - 'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246', - 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', - 'ext': 'mp4', - }, - }, { - 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', - 'only_matching': True, - }, { - 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', - 'only_matching': True, - }, { - 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', - 'only_matching': True, - }, { - 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', - 'only_matching': True, - }, { - # only available for FIRST members - 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', - 'only_matching': True, - }, { - 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'only_matching': True, - }] - _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/' + _API_BASE = 'https://svod-be.roosterteeth.com' + _API_BASE_URL = f'{_API_BASE}/api/v1' def _login(self): username, password = self._get_login_info() if username is None: return + if self._get_cookies(self._API_BASE_URL).get('rt_access_token'): + return try: self._download_json( @@ -90,13 +48,95 @@ def _login(self): self.report_warning(msg) def _real_initialize(self): - if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): - return self._login() + def _extract_video_info(self, data): + thumbnails 
= [] + for image in traverse_obj(data, ('included', 'images')): + if image.get('type') not in ('episode_image', 'bonus_feature_image'): + continue + thumbnails.extend([{ + 'id': name, + 'url': url, + } for name, url in (image.get('attributes') or {}).items() if url_or_none(url)]) + + attributes = data.get('attributes') or {} + title = traverse_obj(attributes, 'title', 'display_title') + sub_only = attributes.get('is_sponsors_only') + + return { + 'id': str(data.get('id')), + 'display_id': attributes.get('slug'), + 'title': title, + 'description': traverse_obj(attributes, 'description', 'caption'), + 'series': attributes.get('show_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': attributes.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(attributes.get('number')), + 'episode_id': str_or_none(data.get('uuid')), + 'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + 'thumbnails': thumbnails, + 'availability': self._availability( + needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only, + is_private=False, is_unlisted=False), + 'tags': attributes.get('genres') + } + + +class RoosterTeethIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'info_dict': { + 'id': '9156', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But... The Game Announcement', + 'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5', + 'thumbnail': r're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... 
The Game Announcement', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', + 'info_dict': { + 'id': '40432', + 'display_id': 'rwby-bonus-25', + 'title': 'Grimm', + 'description': 'md5:f30ff570741213418a8d2c19868b93ab', + 'episode': 'Grimm', + 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1', + 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', + 'ext': 'mp4', + }, + 'params': {'skip_download': True}, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }, { + 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'only_matching': True, + }] + def _real_extract(self, url): display_id = self._match_id(url) - api_episode_url = self._EPISODE_BASE_URL + display_id + api_episode_url = f'{self._API_BASE_URL}/watch/{display_id}' try: video_data = self._download_json( @@ -118,36 +158,62 @@ def _real_extract(self, url): episode = self._download_json( api_episode_url, display_id, 'Downloading episode JSON metadata')['data'][0] - attributes = episode['attributes'] - title = attributes.get('title') or attributes['display_title'] - video_id = compat_str(episode['id']) - - thumbnails = [] - for image in episode.get('included', {}).get('images', 
[]): - if image.get('type') in ('episode_image', 'bonus_feature_image'): - img_attributes = image.get('attributes') or {} - for k in ('thumb', 'small', 'medium', 'large'): - img_url = img_attributes.get(k) - if img_url: - thumbnails.append({ - 'id': k, - 'url': img_url, - }) return { - 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': attributes.get('description') or attributes.get('caption'), - 'thumbnails': thumbnails, - 'series': attributes.get('show_title'), - 'season_number': int_or_none(attributes.get('season_number')), - 'season_id': attributes.get('season_id'), - 'episode': title, - 'episode_number': int_or_none(attributes.get('number')), - 'episode_id': str_or_none(episode.get('uuid')), 'formats': formats, - 'channel_id': attributes.get('channel_id'), - 'duration': int_or_none(attributes.get('length')), - 'subtitles': subtitles + 'subtitles': subtitles, + **self._extract_video_info(episode) } + + +class RoosterTeethSeriesIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/series/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://roosterteeth.com/series/rwby?season=7', + 'playlist_count': 13, + 'info_dict': { + 'id': 'rwby-7', + 'title': 'RWBY - Season 7', + } + }, { + 'url': 'https://roosterteeth.com/series/role-initiative', + 'playlist_mincount': 16, + 'info_dict': { + 'id': 'role-initiative', + 'title': 'Role Initiative', + } + }, { + 'url': 'https://roosterteeth.com/series/let-s-play-minecraft?season=9', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'let-s-play-minecraft-9', + 'title': 'Let\'s Play Minecraft - Season 9', + } + }] + + def _entries(self, series_id, season_number): + display_id = join_nonempty(series_id, season_number) + # TODO: extract bonus material + for data in self._download_json( + f'{self._API_BASE_URL}/shows/{series_id}/seasons?order=asc&order_by', display_id)['data']: + idx = traverse_obj(data, ('attributes', 'number')) + if season_number and idx != season_number: + continue + 
season_url = update_url_query(urljoin(self._API_BASE, data['links']['episodes']), {'per_page': 1000}) + season = self._download_json(season_url, display_id, f'Downloading season {idx} JSON metadata')['data'] + for episode in season: + yield self.url_result( + f'https://www.roosterteeth.com{episode["canonical_links"]["self"]}', + RoosterTeethIE.ie_key(), + **self._extract_video_info(episode)) + + def _real_extract(self, url): + series_id = self._match_id(url) + season_number = traverse_obj(parse_qs(url), ('season', 0), expected_type=int_or_none) + + entries = LazyList(self._entries(series_id, season_number)) + return self.playlist_result( + entries, + join_nonempty(series_id, season_number), + join_nonempty(entries[0].get('series'), season_number, delim=' - Season ')) diff --git a/yt_dlp/extractor/rtbf.py b/yt_dlp/extractor/rtbf.py index f9979d0a49..4b61fdb17e 100644 --- a/yt_dlp/extractor/rtbf.py +++ b/yt_dlp/extractor/rtbf.py @@ -85,8 +85,6 @@ def _real_extract(self, url): title = data['title'] is_live = data.get('isLive') - if is_live: - title = self._live_title(title) height_re = r'-(\d+)p\.' 
formats = [] diff --git a/yt_dlp/extractor/rtrfm.py b/yt_dlp/extractor/rtrfm.py new file mode 100644 index 0000000000..93d51e8ed7 --- /dev/null +++ b/yt_dlp/extractor/rtrfm.py @@ -0,0 +1,67 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTRFMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtrfm\.com\.au/(?:shows|show-episode)/(?P[^/?\#&]+)' + _TESTS = [ + { + 'url': 'https://rtrfm.com.au/shows/breakfast/', + 'md5': '46168394d3a5ce237cf47e85d0745413', + 'info_dict': { + 'id': 'breakfast-2021-11-16', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': r're:^Breakfast with Taylah \d{4}-\d{2}-\d{2}$', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + 'skip': 'ID and md5 changes daily', + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2021-11-11/', + 'md5': '396bedf1e40f96c62b30d4999202a790', + 'info_dict': { + 'id': 'breakfast-2021-11-11', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2021-11-11', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2020-06-01/', + 'md5': '594027f513ec36a24b15d65007a24dff', + 'info_dict': { + 'id': 'breakfast-2020-06-01', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2020-06-01', + 'description': r're:^Breakfast with Taylah ', + }, + 'skip': 'This audio has expired', + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show, date, title = self._search_regex( + r'''\.playShow(?:From)?\(['"](?P[^'"]+)['"],\s*['"](?P[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P[^'"]+)['"]''', + webpage, 'details', group=('show', 'date', 'title')) + url = self._download_json( + 'https://restreams.rtrfm.com.au/rzz', + show, 'Downloading MP3 URL', query={'n': show, 'd': date})['u'] + # This is the only indicator of an error until trying to 
download the URL and + # downloads of mp4 URLs always fail (403 for current episodes, 404 for missing). + if '.mp4' in url: + url = None + self.raise_no_formats('Expired or no episode on this date', expected=True) + return { + 'id': '%s-%s' % (show, date), + 'title': '%s %s' % (title, date), + 'series': title, + 'url': url, + 'release_date': date, + 'description': self._og_search_description(webpage), + } diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 59832eeac8..af1bb943d5 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -18,6 +18,7 @@ remove_end, remove_start, std_headers, + try_get, ) _bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) @@ -160,7 +161,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'thumbnail': info.get('image'), 'subtitles': subtitles, @@ -178,6 +179,93 @@ def _get_subtitles(self, video_id, sub_file): for s in subs) +class RTVEAudioIE(RTVEALaCartaIE): + IE_NAME = 'rtve.es:audio' + IE_DESC = 'RTVE audio' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', + 'md5': 'ae06d27bff945c4e87a50f89f6ce48ce', + 'info_dict': { + 'id': '5889192', + 'ext': 'mp3', + 'title': 'Códigos informáticos', + 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'duration': 349.440, + 'series': 'A hombros de gigantes', + }, + }, { + 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', + 'md5': '072855ab89a9450e0ba314c717fa5ebc', + 'info_dict': { + 'id': '5791165', + 'ext': 'mp3', + 'title': 'Ignatius Farray', + 'thumbnail': r're:https?://.+/1613243011863.jpg', + 'duration': 3559.559, + 'series': 'En Radio 3' + }, + }, { + 'url': 
'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', + 'md5': '0eadab248cc8dd193fa5765712e84d5c', + 'info_dict': { + 'id': '6082623', + 'ext': 'mp3', + 'title': 'Capítulo 26 y último: La muerte de Victor', + 'thumbnail': r're:https?://.+/1632147445707.jpg', + 'duration': 3174.086, + 'series': 'Frankenstein o el moderno Prometeo' + }, + }] + + def _extract_png_formats(self, audio_id): + """ + This function retrieves media related png thumbnail which obfuscate + valuable information about the media. This information is decrypted + via base class _decrypt_url function providing media quality and + media url + """ + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/audios/%s.png' % + (self._manager, audio_id), + audio_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, audio_url in self._decrypt_url(png): + ext = determine_ext(audio_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + audio_url, audio_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + audio_url, audio_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': audio_url, + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + audio_id = self._match_id(url) + info = self._download_json( + 'https://www.rtve.es/api/audios/%s.json' % audio_id, + audio_id)['page']['items'][0] + + return { + 'id': audio_id, + 'title': info['title'].strip(), + 'thumbnail': info.get('thumbnail'), + 'duration': float_or_none(info.get('duration'), 1000), + 'series': try_get(info, lambda x: x['programInfo']['title']), + 'formats': self._extract_png_formats(audio_id), + } + + class RTVEInfantilIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:infantil' IE_DESC = 
'RTVE infantil' @@ -230,7 +318,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title), + 'title': title, 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, } diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index b526de76bc..49c1f44851 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -1,15 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_str, compat_HTTPError from ..utils import ( determine_ext, int_or_none, parse_iso8601, try_get, + ExtractorError, ) @@ -75,3 +77,36 @@ def _real_extract(self, url): 'channel_url': author.get('url'), 'duration': int_or_none(video.get('duration')), } + + +class RumbleChannelIE(InfoExtractor): + _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))' + + _TESTS = [{ + 'url': 'https://rumble.com/c/Styxhexenhammer666', + 'playlist_mincount': 1160, + 'info_dict': { + 'id': 'Styxhexenhammer666', + }, + }, { + 'url': 'https://rumble.com/user/goldenpoodleharleyeuna', + 'playlist_count': 4, + 'info_dict': { + 'id': 'goldenpoodleharleyeuna', + }, + }] + + def entries(self, url, playlist_id): + for page in itertools.count(1): + try: + webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + break + raise + for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage): + yield self.url_result('https://rumble.com' + video_url) + + def _real_extract(self, url): + url, playlist_id = self._match_valid_url(url).groups() + return self.playlist_result(self.entries(url, playlist_id), playlist_id=playlist_id) diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index d027412c48..2f753b41f2 100644 --- 
a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -230,9 +230,9 @@ def _real_extract(self, url): return self._extract_playlist(self._match_id(url)) -class RutubeChannelIE(RutubePlaylistBaseIE): - IE_NAME = 'rutube:channel' - IE_DESC = 'Rutube channels' +class RutubeTagsIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:tags' + IE_DESC = 'Rutube tags' _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/tags/video/1800/', @@ -312,3 +312,18 @@ def _real_extract(self, url): playlist_kind = qs['pl_type'][0] playlist_id = qs['pl_id'][0] return self._extract_playlist(playlist_id, item_kind=playlist_kind) + + +class RutubeChannelIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:channel' + IE_DESC = 'Rutube channel' + _VALID_URL = r'https?://rutube\.ru/channel/(?P<id>\d+)/videos' + _TESTS = [{ + 'url': 'https://rutube.ru/channel/639184/videos/', + 'info_dict': { + 'id': '639184', + }, + 'playlist_mincount': 133, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py index 7e0de994a4..3de86b2328 100644 --- a/yt_dlp/extractor/rutv.py +++ b/yt_dlp/extractor/rutv.py @@ -201,7 +201,7 @@ def _real_extract(self, url): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, 'view_count': view_count, diff --git a/yt_dlp/extractor/safari.py b/yt_dlp/extractor/safari.py index c92e8849bd..cca4464ca8 100644 --- a/yt_dlp/extractor/safari.py +++ b/yt_dlp/extractor/safari.py @@ -193,7 +193,12 @@ def _real_extract(self, url): part = self._download_json( url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), 'Downloading part JSON') - return self.url_result(part['web_url'], SafariIE.ie_key()) + web_url = part['web_url'] + if 'library/view' in web_url: + web_url = web_url.replace('library/view', 'videos') + natural_keys = part['natural_key'] + 
web_url = f'{web_url.rsplit("/", 1)[0]}/{natural_keys[0]}-{natural_keys[1][:-5]}' + return self.url_result(web_url, SafariIE.ie_key()) class SafariCourseIE(SafariBaseIE): diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 0a806ee4e4..4090f6385d 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -10,7 +10,14 @@ class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?sbs\.com\.au/(?: + ondemand(?: + /video/(?:single/)?| + /movie/[^/]+/| + .*?\bplay=|/watch/ + )|news/(?:embeds/)?video/ + )(?P<id>[0-9]+)''' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -46,6 +53,13 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931', + 'only_matching': True, + }, { + 'note': 'Live stream', + 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', + 'only_matching': True, }] def _real_extract(self, url): @@ -75,4 +89,5 @@ def _real_extract(self, url): 'ie_key': 'ThePlatform', 'id': video_id, 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), + 'is_live': player_params.get('streamType') == 'live', } diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py new file mode 100644 index 0000000000..6f4240422a --- /dev/null +++ b/yt_dlp/extractor/senategov.py @@ -0,0 +1,213 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_qs, + unsmuggle_url, +) + +_COMMITTEES = { + 'ag': ('76440', 'http://ag-f.akamaihd.net'), + 'aging': ('76442', 
'http://aging-f.akamaihd.net'), + 'approps': ('76441', 'http://approps-f.akamaihd.net'), + 'arch': ('', 'http://ussenate-f.akamaihd.net'), + 'armed': ('76445', 'http://armed-f.akamaihd.net'), + 'banking': ('76446', 'http://banking-f.akamaihd.net'), + 'budget': ('76447', 'http://budget-f.akamaihd.net'), + 'cecc': ('76486', 'http://srs-f.akamaihd.net'), + 'commerce': ('80177', 'http://commerce1-f.akamaihd.net'), + 'csce': ('75229', 'http://srs-f.akamaihd.net'), + 'dpc': ('76590', 'http://dpc-f.akamaihd.net'), + 'energy': ('76448', 'http://energy-f.akamaihd.net'), + 'epw': ('76478', 'http://epw-f.akamaihd.net'), + 'ethics': ('76449', 'http://ethics-f.akamaihd.net'), + 'finance': ('76450', 'http://finance-f.akamaihd.net'), + 'foreign': ('76451', 'http://foreign-f.akamaihd.net'), + 'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'), + 'help': ('76452', 'http://help-f.akamaihd.net'), + 'indian': ('76455', 'http://indian-f.akamaihd.net'), + 'intel': ('76456', 'http://intel-f.akamaihd.net'), + 'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'), + 'jccic': ('85180', 'http://jccic-f.akamaihd.net'), + 'jec': ('76458', 'http://jec-f.akamaihd.net'), + 'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'), + 'rpc': ('76591', 'http://rpc-f.akamaihd.net'), + 'rules': ('76460', 'http://rules-f.akamaihd.net'), + 'saa': ('76489', 'http://srs-f.akamaihd.net'), + 'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'), + 'srs': ('75229', 'http://srs-f.akamaihd.net'), + 'uscc': ('76487', 'http://srs-f.akamaihd.net'), + 'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'), +} + + +class SenateISVPIE(InfoExtractor): + _IE_NAME = 'senate.gov:isvp' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 
'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(self._match_valid_url(url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'<title>([^<]+)', webpage, video_id) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + + stream_num, domain = _COMMITTEES[committee] + + formats = [] + if video_type == 'arch': + filename = video_id if '.' 
in video_id else video_id + '.mp4' + m3u8_url = compat_urlparse.urljoin(domain, 'i/' + filename + '/master.m3u8') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8') + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P(?:-p|-b)).m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } + + +class SenateGovIE(InfoExtractor): + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov' + _TESTS = [{ + 'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'info_dict': { + 'id': 'help090920', + 'display_id': 'vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health', + 'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.appropriations.senate.gov/hearings/watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'info_dict': { + 'id': 'appropsA051518', + 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'title': 'Review of the FY2019 Budget Request for the U.S. 
Army', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'info_dict': { + 'id': 'banking041521', + 'display_id': '21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization', + 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._generic_id(url) + webpage = self._download_webpage(url, display_id) + parse_info = parse_qs(self._search_regex( + r'