From fe7866d0ed6bfa3904ce12b049a3424fdc0ea1fa Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan.ytdlp@gmail.com>
Date: Wed, 24 Aug 2022 05:42:16 +0530
Subject: [PATCH] Add option `--use-extractors`

Deprecates `--force-generic-extractor`

Closes #3234, Closes #2044

Related: #4307, #1791
---
 README.md                  |  9 ++++++++-
 yt_dlp/YoutubeDL.py        | 41 +++++++++++++++++++++++---------------
 yt_dlp/__init__.py         |  1 +
 yt_dlp/extractor/common.py | 13 ++++++++++++
 yt_dlp/options.py          | 12 ++++++++++-
 5 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 7cfeec4f1..aab20c079 100644
--- a/README.md
+++ b/README.md
@@ -375,7 +375,13 @@ ## General Options:
     --list-extractors               List all supported extractors and exit
     --extractor-descriptions        Output descriptions of all supported
                                     extractors and exit
-    --force-generic-extractor       Force extraction to use the generic extractor
+    --use-extractors, --ies NAMES   Extractor names to use separated by commas.
+                                    You can also use regexes, "all", "default"
+                                    and "end" (end URL matching); e.g. --ies
+                                    "holodex.*,end,youtube". Prefix the name
+                                    with a "-" to exclude it, e.g. --ies
+                                    default,-generic. Use --list-extractors for
+                                    a list of available extractor names
     --default-search PREFIX         Use this prefix for unqualified URLs. E.g.
                                     "gvsearch2:python" downloads two videos from
                                     google videos for the search term "python".
@@ -2058,6 +2064,7 @@ #### Redundant options
 #### Not recommended
 While these options still work, their use is not recommended since there are other alternatives to achieve the same
 
+    --force-generic-extractor        --ies generic,default
     --exec-before-download CMD       --exec "before_dl:CMD"
     --no-exec-before-download        --no-exec
     --all-formats                    -f all
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 872e0bdc3..a3d562042 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -29,6 +29,7 @@
 from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
 from .downloader.rtmp import rtmpdump_version
 from .extractor import gen_extractor_classes, get_info_extractor
+from .extractor.common import UnsupportedURLIE
 from .extractor.openload import PhantomJSwrapper
 from .minicurses import format_text
 from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
@@ -237,7 +238,7 @@ class YoutubeDL:
                        Default is 'only_download' for CLI, but False for API
     skip_playlist_after_errors: Number of allowed failures until the rest of
                        the playlist is skipped
-    force_generic_extractor: Force downloader to use the generic extractor
+    allowed_extractors:  List of regexes to match against extractor names that are allowed
     overwrites:        Overwrite all video and metadata files if True,
                        overwrite only non-video files if None
                        and don't overwrite any file if False
@@ -477,6 +478,8 @@ class YoutubeDL:
 
     The following options are deprecated and may be removed in the future:
 
+    force_generic_extractor: Force downloader to use the generic extractor
+                       - Use allowed_extractors = ['generic', 'default']
     playliststart:     - Use playlist_items
                        Playlist item to start at.
     playlistend:       - Use playlist_items
@@ -758,13 +761,6 @@ def add_info_extractor(self, ie):
             self._ies_instances[ie_key] = ie
             ie.set_downloader(self)
 
-    def _get_info_extractor_class(self, ie_key):
-        ie = self._ies.get(ie_key)
-        if ie is None:
-            ie = get_info_extractor(ie_key)
-            self.add_info_extractor(ie)
-        return ie
-
     def get_info_extractor(self, ie_key):
         """
         Get an instance of an IE with name ie_key, it will try to get one from
@@ -781,8 +777,19 @@ def add_default_info_extractors(self):
         """
         Add the InfoExtractors returned by gen_extractors to the end of the list
         """
-        for ie in gen_extractor_classes():
-            self.add_info_extractor(ie)
+        all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
+        all_ies['end'] = UnsupportedURLIE()
+        try:
+            ie_names = orderedSet_from_options(
+                self.params.get('allowed_extractors', ['default']), {
+                    'all': list(all_ies),
+                    'default': [name for name, ie in all_ies.items() if ie._ENABLED],
+                }, use_regex=True)
+        except re.error as e:
+            raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
+        for name in ie_names:
+            self.add_info_extractor(all_ies[name])
+        self.write_debug(f'Loaded {len(ie_names)} extractors')
 
     def add_post_processor(self, pp, when='post_process'):
         """Add a PostProcessor object to the end of the chain."""
@@ -1413,11 +1420,11 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
             ie_key = 'Generic'
 
         if ie_key:
-            ies = {ie_key: self._get_info_extractor_class(ie_key)}
+            ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
         else:
             ies = self._ies
 
-        for ie_key, ie in ies.items():
+        for key, ie in ies.items():
             if not ie.suitable(url):
                 continue
 
@@ -1426,14 +1433,16 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                                     'and will probably not work.')
 
             temp_id = ie.get_temp_id(url)
-            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
-                self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
+            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
+                self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive')
                 if self.params.get('break_on_existing', False):
                     raise ExistingVideoReached()
                 break
-            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
+            return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
         else:
-            self.report_error('no suitable InfoExtractor for URL %s' % url)
+            extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
+            self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
+                              tb=False if extractors_restricted else None)
 
     def _handle_extraction_exceptions(func):
         @functools.wraps(func)
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index 317dd2623..e9234e6f4 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -766,6 +766,7 @@ def parse_options(argv=None):
         'windowsfilenames': opts.windowsfilenames,
         'ignoreerrors': opts.ignoreerrors,
         'force_generic_extractor': opts.force_generic_extractor,
+        'allowed_extractors': opts.allowed_extractors or ['default'],
         'ratelimit': opts.ratelimit,
         'throttledratelimit': opts.throttledratelimit,
         'overwrites': opts.overwrites,
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index a534703e5..6337a13a4 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -480,6 +480,9 @@ class InfoExtractor:
     will be used by geo restriction bypass mechanism similarly
     to _GEO_COUNTRIES.
 
+    The _ENABLED attribute should be set to False for IEs that
+    are disabled by default and must be explicitly enabled.
+
     The _WORKING attribute should be set to False for broken IEs
     in order to warn the users and skip the tests.
     """
@@ -491,6 +494,7 @@ class InfoExtractor:
     _GEO_COUNTRIES = None
     _GEO_IP_BLOCKS = None
     _WORKING = True
+    _ENABLED = True
     _NETRC_MACHINE = None
     IE_DESC = None
     SEARCH_KEY = None
@@ -3941,3 +3945,12 @@ def _search_results(self, query):
     @classproperty
     def SEARCH_KEY(cls):
         return cls._SEARCH_KEY
+
+
+class UnsupportedURLIE(InfoExtractor):
+    _VALID_URL = '.*'
+    _ENABLED = False
+    IE_DESC = False
+
+    def _real_extract(self, url):
+        raise UnsupportedError(url)
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 0cddb7fd5..bee531d1b 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -353,10 +353,20 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs):
         '--extractor-descriptions',
         action='store_true', dest='list_extractor_descriptions', default=False,
         help='Output descriptions of all supported extractors and exit')
+    general.add_option(
+        '--use-extractors', '--ies',
+        action='callback', dest='allowed_extractors', metavar='NAMES', type='str',
+        default=[], callback=_list_from_options_callback,
+        help=(
+            'Extractor names to use separated by commas. '
+            'You can also use regexes, "all", "default" and "end" (end URL matching); '
+            'e.g. --ies "holodex.*,end,youtube". '
+            'Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. '
+            'Use --list-extractors for a list of available extractor names'))
     general.add_option(
         '--force-generic-extractor',
         action='store_true', dest='force_generic_extractor', default=False,
-        help='Force extraction to use the generic extractor')
+        help=optparse.SUPPRESS_HELP)
     general.add_option(
         '--default-search',
         dest='default_search', metavar='PREFIX',