| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
- from collections import defaultdict
- from typing import Optional, Union
- import pandas as pd
- from modelscope.hub.api import HubApi
- from modelscope.msdatasets.context.dataset_context_config import \
- DatasetContextConfig
- from modelscope.utils.constant import DEFAULT_DATASET_REVISION, MetaDataFields
- from modelscope.utils.logger import get_logger
- logger = get_logger()
def format_dataset_structure(dataset_structure):
    """
    Drop the splits that declare neither a meta file nor a data file.

    Args:
        dataset_structure (dict): Mapping of split name to its info dict,
            e.g. {'train': {'meta': 'my_train.csv', 'file': 'pictures.zip'}}.

    Returns:
        dict: The entries of `dataset_structure` whose info dict has a truthy
            'meta' or a truthy 'file' value, in their original order.
    """
    kept = {}
    for split_name, split_info in dataset_structure.items():
        has_meta = split_info.get('meta')
        # 'file' is only consulted when 'meta' is missing or empty.
        if has_meta or split_info.get('file'):
            kept[split_name] = split_info
    return kept
def get_target_dataset_structure(dataset_structure: dict,
                                 subset_name: Optional[str] = None,
                                 split: Optional[str] = None):
    """
    Pick the requested subset (and optionally a single split) out of a
    dataset structure.

    Args:
        dataset_structure (dict): Dataset Structure, like
            {
                "default":{
                    "train":{
                        "meta":"my_train.csv",
                        "file":"pictures.zip"
                    }
                },
                "subsetA":{
                    "test":{
                        "meta":"mytest.csv",
                        "file":"pictures.zip"
                    }
                }
            }
        subset_name (str, optional): Defining the subset_name of the dataset.
            May be omitted only when the structure holds exactly one subset.
        split (str, optional): Which split of the data to load. When omitted,
            every split of the chosen subset is returned.

    Returns:
        target_subset_name (str): Name of the chosen subset.
        target_dataset_structure (dict): Structure of the chosen split(s), like
            {
                "test":{
                    "meta":"mytest.csv",
                    "file":"pictures.zip"
                }
            }

    Raises:
        ValueError: If the structure is empty and no subset_name is given, if
            subset_name is not a key of the structure, if subset_name is
            omitted while several subsets exist, or if split is not one of the
            usable splits of the chosen subset.
    """
    # verify dataset subset
    if not subset_name and not dataset_structure:
        # Without this guard the `next(iter(...))` below would leak a bare
        # StopIteration to the caller instead of a meaningful error.
        raise ValueError(
            'No subset available: the dataset structure is empty.')

    if (subset_name and subset_name not in dataset_structure) or (
            not subset_name and len(dataset_structure.keys()) > 1):
        raise ValueError(
            f'subset_name {subset_name} not found. Available: {dataset_structure.keys()}'
        )

    target_subset_name = subset_name
    if not subset_name:
        # Exactly one subset exists at this point, so take it as the default.
        target_subset_name = next(iter(dataset_structure.keys()))
        logger.info(
            f'No subset_name specified, defaulting to the {target_subset_name}'
        )

    # verify dataset split
    # Only splits that declare a meta file or a data file are selectable.
    target_dataset_structure = format_dataset_structure(
        dataset_structure[target_subset_name])
    if split and split not in target_dataset_structure:
        raise ValueError(
            f'split {split} not found. Available: {target_dataset_structure.keys()}'
        )
    if split:
        target_dataset_structure = {split: target_dataset_structure[split]}

    return target_subset_name, target_dataset_structure
def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool,
                         dataset_name: str, namespace: str,
                         version: str) -> list:
    """
    Collect the object keys of a dataset as reported by the hub.

    Args:
        hub_api (class HubApi): HubApi instance used to query the listing.
        max_limit (int): Max number of objects, forwarded to the hub API.
        is_recursive (bool): Whether to list objects recursively.
        dataset_name (str): Dataset name.
        namespace (str): Namespace.
        version (str): Dataset version, forwarded as the `revision` argument.

    Returns:
        res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...]
            Entries whose 'Key' is missing or empty are left out.
    """
    # The listing is always requested with `is_filter_dir=True`.
    listing = hub_api.list_oss_dataset_objects(
        dataset_name=dataset_name,
        namespace=namespace,
        max_limit=max_limit,
        is_recursive=is_recursive,
        is_filter_dir=True,
        revision=version)

    candidate_keys = (entry.get('Key') for entry in listing)
    return [key for key in candidate_keys if key]
def contains_dir(file_map) -> bool:
    """
    Tell whether any split of the file map points at a directory.

    A value counts as a directory when it is a string that does not end with
    '.zip'; non-string values are ignored.

    Args:
        file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'}

    Returns:
        True if input contains at least one directory, False otherwise.
    """
    return any(
        isinstance(location, str) and not location.endswith('.zip')
        for location in file_map.values())
def get_subdir_hash_from_split(split: Union[str, list], version: str) -> str:
    """
    Build the relative sub-directory path for a version and its split(s).

    Args:
        split (str or list): A single split name or a list of split names.
        version (str): Dataset version, used as the leading path component.

    Returns:
        str: `version` joined (via os.path.join) with the split names
            concatenated by '_'.
    """
    split_names = [split] if isinstance(split, str) else split
    joined_splits = '_'.join(split_names)
    return os.path.join(version, joined_splits)
def get_split_list(split: Union[str, list]) -> list:
    """
    Unify the split to list-format.

    Args:
        split (str or list): A single split name or a list of split names.

    Returns:
        list: `[split]` when a string is given, otherwise the list itself
            (the same object, not a copy).

    Raises:
        TypeError: If `split` is neither a str nor a list.
    """
    if isinstance(split, str):
        return [split]
    if isinstance(split, list):
        return split
    # Raising a plain string is itself a TypeError in Python 3 and hides the
    # intended message, so raise a proper exception carrying that message.
    raise TypeError(
        f'Expected format of split: str or list, but got {type(split)}.')
def get_split_objects_map(file_map, objects):
    """
    Group oss object keys by the dataset split whose directory contains them.

    Args:
        file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both of train and val
            are dirs.
        objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png']

    Returns:
        A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'],
            'validation':['val/003/3_38.png']}
        Every split of `file_map` is present, with an empty list when no
        object falls under its directory.
    """
    grouped = {split_name: [] for split_name in file_map}

    for object_key in objects:
        for split_name, location in file_map.items():
            # Normalise to exactly one trailing slash so that 'train' does not
            # also capture keys under e.g. 'training/'.
            dir_prefix = location.rstrip('/') + '/'
            if object_key.startswith(dir_prefix):
                grouped[split_name].append(object_key)

    return grouped
def get_dataset_files(subset_split_into: dict,
                      dataset_name: str,
                      namespace: str,
                      context_config: DatasetContextConfig,
                      revision: Optional[str] = DEFAULT_DATASET_REVISION):
    """
    Resolve the meta files and data files of every split of a subset.

    Args:
        subset_split_into (dict): Mapping of split name to its info dict. The
            keys 'meta', 'file', 'args' and 'custom' of each info dict are read.
        dataset_name (str): Dataset name.
        namespace (str): Namespace.
        context_config (DatasetContextConfig): Supplies the meta cache dir
            (`data_meta_config.meta_cache_dir`) and `config_kwargs`, from which
            the csv 'delimiter' is taken (default ',').
        revision (str, optional): Dataset revision.

    Return:
        meta_map: Structure of meta files (.csv), the meta file name will be replaced by url, like
            {
               "test": "https://xxx/mytest.csv"
            }
        file_map: Structure of data files (.zip), like
            {
                "test": "pictures.zip"
            }
            For a split flagged as big data the value is the list read from the
            meta csv; when any split points at a directory, the values are the
            lists of objects found under each split's directory.
        args_map: The 'args' entry of every split (None when absent).
        custom_type_map: The 'custom' entry of every split ('' when absent).
    """
    meta_map = defaultdict(dict)
    file_map = defaultdict(dict)
    args_map = defaultdict(dict)
    custom_type_map = defaultdict(dict)

    modelscope_api = HubApi()
    meta_cache_dir = context_config.data_meta_config.meta_cache_dir

    # First pass: collect the per-split declarations.
    for split_name, split_info in subset_split_into.items():
        custom_type_map[split_name] = split_info.get('custom', '')
        meta_file_name = split_info.get('meta', '')
        meta_map[split_name] = modelscope_api.get_dataset_file_url_origin(
            meta_file_name, dataset_name, namespace, revision)
        if split_info.get('file'):
            file_map[split_name] = split_info['file']
        args_map[split_name] = split_info.get('args')

    objects = []
    # Second pass: for splits flagged as big data, take the object list
    # straight from the meta csv instead of listing the remote storage.
    for split_name, split_args in args_map.items():
        if not (split_args
                and split_args.get(MetaDataFields.ARGS_BIG_DATA)):
            continue

        meta_csv_file_path = HubApi.fetch_meta_files_from_url(
            meta_map[split_name], meta_cache_dir)
        csv_delimiter = context_config.config_kwargs.get('delimiter', ',')
        csv_df = pd.read_csv(
            meta_csv_file_path,
            iterator=False,
            delimiter=csv_delimiter,
            escapechar='\\')

        file_columns = csv_df.columns[csv_df.columns.str.contains(
            ':FILE')].to_list()
        if file_columns:
            target_col = file_columns[0]
        else:
            # No ':FILE' column: report it and fall back to the first column.
            logger.error(
                f'No column contains ":FILE" in {meta_csv_file_path}.')
            target_col = csv_df.columns[0]

        objects = csv_df[target_col].to_list()
        file_map[split_name] = objects

    # More general but low-efficiency: list everything stored for the dataset.
    if not objects:
        objects = list_dataset_objects(
            hub_api=modelscope_api,
            max_limit=-1,
            is_recursive=True,
            dataset_name=dataset_name,
            namespace=namespace,
            version=revision)

    # Directory-based splits are expanded into the objects they contain.
    if contains_dir(file_map):
        file_map = get_split_objects_map(file_map, objects)

    return meta_map, file_map, args_map, custom_type_map
|