另一种ValueError:无效模式:'**'只能是整个路径组成部分
我正在尝试基于 Hugging Face 上的 infoslack/mistral-7b-arxiv-paper-chunked
数据集搭建一个简单的 RAG(检索增强生成)流程。但是在加载数据集时遇到了错误,具体是 ValueError: Invalid pattern: '**' can only be an entire path component
,与另一个已有问题中描述的错误相同。不过,那个问题下给出的各种解决方案在我这里都不起作用。
这是我的代码:
from datasets import load_dataset
dataset = load_dataset("infoslack/mistral-7b-arxiv-paper-chunked", split="train")
完整的错误信息是:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[21], line 3
1 from datasets import load_dataset
----> 3 dataset = load_dataset("infoslack/mistral-7b-arxiv-paper-chunked", split="train")
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\load.py:1773, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
1768 verification_mode = VerificationMode(
1769 (verification_mode or VerificationMode.BASIC_CHECKS) if not save_infos else VerificationMode.ALL_CHECKS
1770 )
1772 # Create a dataset builder
-> 1773 builder_instance = load_dataset_builder(
1774 path=path,
1775 name=name,
1776 data_dir=data_dir,
1777 data_files=data_files,
1778 cache_dir=cache_dir,
1779 features=features,
1780 download_config=download_config,
1781 download_mode=download_mode,
1782 revision=revision,
1783 use_auth_token=use_auth_token,
1784 storage_options=storage_options,
1785 **config_kwargs,
1786 )
1788 # Return iterable dataset in case of streaming
1789 if streaming:
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\load.py:1502, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, storage_options, **config_kwargs)
1500 download_config = download_config.copy() if download_config else DownloadConfig()
1501 download_config.use_auth_token = use_auth_token
-> 1502 dataset_module = dataset_module_factory(
1503 path,
1504 revision=revision,
1505 download_config=download_config,
1506 download_mode=download_mode,
1507 data_dir=data_dir,
1508 data_files=data_files,
1509 )
1511 # Get dataset builder class from the processing script
1512 builder_cls = import_main_class(dataset_module.module_path)
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\load.py:1219, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1214 if isinstance(e1, FileNotFoundError):
1215 raise FileNotFoundError(
1216 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1217 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
1218 ) from None
-> 1219 raise e1 from None
1220 else:
1221 raise FileNotFoundError(
1222 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1223 )
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\load.py:1203, in dataset_module_factory(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1188 return HubDatasetModuleFactoryWithScript(
1189 path,
1190 revision=revision,
(...)
1193 dynamic_modules_path=dynamic_modules_path,
1194 ).get_module()
1195 else:
1196 return HubDatasetModuleFactoryWithoutScript(
1197 path,
1198 revision=revision,
1199 data_dir=data_dir,
1200 data_files=data_files,
1201 download_config=download_config,
1202 download_mode=download_mode,
-> 1203 ).get_module()
1204 except (
1205 Exception
1206 ) as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
1207 try:
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\load.py:769, in HubDatasetModuleFactoryWithoutScript.get_module(self)
759 def get_module(self) -> DatasetModule:
760 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
761 self.name,
762 revision=self.revision,
763 token=self.download_config.use_auth_token,
764 timeout=100.0,
765 )
766 patterns = (
767 sanitize_patterns(self.data_files)
768 if self.data_files is not None
--> 769 else get_data_patterns_in_dataset_repository(hfh_dataset_info, self.data_dir)
770 )
771 data_files = DataFilesDict.from_hf_repo(
772 patterns,
773 dataset_info=hfh_dataset_info,
774 base_path=self.data_dir,
775 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
776 )
777 split_modules = {
778 split: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
779 for split, data_files_list in data_files.items()
780 }
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\data_files.py:662, in get_data_patterns_in_dataset_repository(dataset_info, base_path)
660 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info, base_path=base_path)
661 try:
--> 662 return _get_data_files_patterns(resolver)
663 except FileNotFoundError:
664 raise EmptyDatasetError(
665 f"The dataset repository at '{dataset_info.id}' doesn't contain any data files"
666 ) from None
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\data_files.py:223, in _get_data_files_patterns(pattern_resolver)
221 try:
222 for pattern in patterns:
--> 223 data_files = pattern_resolver(pattern)
224 if len(data_files) > 0:
225 non_empty_splits.append(split)
File ~\AppData\Local\anaconda3\Lib\site-packages\datasets\data_files.py:473, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, base_path, allowed_extensions)
471 else:
472 base_path = "/"
--> 473 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
474 matched_paths = [
475 filepath
476 for filepath in glob_iter
(...)
483 )
484 ] # ignore .ipynb and __pycache__, but keep /../
485 if allowed_extensions is not None:
File ~\AppData\Roaming\Python\Python311\site-packages\fsspec\spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
602 depth = None
604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
607 pattern = re.compile(pattern)
609 out = {
610 p: info
611 for p, info in sorted(allpaths.items())
(...)
618 )
619 }
File ~\AppData\Roaming\Python\Python311\site-packages\fsspec\utils.py:734, in glob_translate(pat)
732 continue
733 elif "**" in part:
--> 734 raise ValueError(
735 "Invalid pattern: '**' can only be an entire path component"
736 )
737 if part:
738 results.extend(_translate(part, f"{not_sep}*", not_sep))
ValueError: Invalid pattern: '**' can only be an entire path component
1 个回答
0
在通过 Anaconda 运行 Python 时这个问题仍然存在,所用版本为 Python 3.11.5 ('base'),解释器路径是 ~\AppData\Local\anaconda3\python.exe
。不过,改用 Python 3.12.2
后问题就消失了。因此,这看起来是特定 Python 版本与这些软件包(datasets / fsspec)之间的兼容性问题;通常升级 datasets 包(pip install -U datasets)也能解决此错误。