| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673 |
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import bisect
- import math
- import warnings
- from typing import Iterable
- import paddle
- from ... import framework
class Dataset:
    """
    Abstract base class describing the interface of map-style datasets.

    A map-style dataset is one whose samples can be fetched with a given key
    (index). Every map-style dataset should derive from `paddle.io.Dataset`
    and override the following methods:

    :code:`__getitem__`: return the sample stored at the given index. It is
    needed when :code:`paddle.io.DataLoader` reads samples from the dataset.

    :code:`__len__`: return the number of samples in the dataset. It is needed
    by some implementations of :code:`paddle.io.BatchSampler`.

    See also :code:`paddle.io.DataLoader`.

    Both methods raise ``NotImplementedError`` in this base class.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.io import Dataset

            >>> # define a random dataset
            >>> class RandomDataset(Dataset):
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([784]).astype('float32')
            ...         label = np.random.randint(0, 9, (1, )).astype('int64')
            ...         return image, label
            ...
            ...     def __len__(self):
            ...         return self.num_samples
            ...
            >>> dataset = RandomDataset(10)
            >>> for i in range(len(dataset)):
            ...     image, label = dataset[i]
            ...     # do something
    """

    def __init__(self):
        pass

    def __getitem__(self, idx):
        # Subclasses must override this; the base class only reports who forgot.
        message = "'{}' not implement in class {}".format(
            '__getitem__', type(self).__name__
        )
        raise NotImplementedError(message)

    def __len__(self):
        # Subclasses must override this; the base class only reports who forgot.
        message = "'{}' not implement in class {}".format(
            '__len__', type(self).__name__
        )
        raise NotImplementedError(message)
class IterableDataset(Dataset):
    """
    An abstract class to encapsulate methods and behaviors of iterable datasets.

    All datasets in iterable-style (can only get sample one by one sequentially, like
    a Python iterator) should be a subclass of :ref:`api_paddle_io_IterableDataset` . All subclasses should
    implement following methods:

    :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :ref:`api_paddle_io_DataLoader` .

    .. note::
        Do not implement :code:`__getitem__` and :code:`__len__` in an IterableDataset; they should not be called either.

    see :ref:`api_paddle_io_DataLoader` .

    Raises:
        NotImplementedError: from :code:`__iter__` when a subclass does not override it.
        RuntimeError: from :code:`__getitem__` and :code:`__len__`, which are not
            supported by iterable-style datasets.

    Examples:

        .. code-block:: python
            :name: code-example1

            >>> import numpy as np
            >>> from paddle.io import IterableDataset

            >>> # define a random dataset
            >>> class RandomDataset(IterableDataset):
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __iter__(self):
            ...         for i in range(self.num_samples):
            ...             image = np.random.random([784]).astype('float32')
            ...             label = np.random.randint(0, 9, (1, )).astype('int64')
            ...             yield image, label
            ...
            >>> dataset = RandomDataset(10)
            >>> for img, label in dataset:
            ...     # do something
            ...     ...

    When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and
    will yield whole dataset samples, which means samples in dataset will be repeated in
    :attr:`num_workers` times. If it is required for each sample to yield only once, there
    are two methods to configure different copy in each worker process to avoid duplicate data
    among workers as follows. In both the methods, worker information that can be obtained in
    a worker process by `paddle.io.get_worker_info` will be needed.

    splitting data copy in each worker in :code:`__iter__`

        .. code-block:: python
            :name: code-example2

            >>> import math
            >>> import paddle
            >>> import numpy as np
            >>> from paddle.io import IterableDataset, DataLoader, get_worker_info

            >>> class SplitedIterableDataset(IterableDataset):
            ...     def __init__(self, start, end):
            ...         self.start = start
            ...         self.end = end
            ...
            ...     def __iter__(self):
            ...         worker_info = get_worker_info()
            ...         if worker_info is None:
            ...             iter_start = self.start
            ...             iter_end = self.end
            ...         else:
            ...             per_worker = int(
            ...                 math.ceil((self.end - self.start) / float(
            ...                     worker_info.num_workers)))
            ...             worker_id = worker_info.id
            ...             iter_start = self.start + worker_id * per_worker
            ...             iter_end = min(iter_start + per_worker, self.end)
            ...
            ...         for i in range(iter_start, iter_end):
            ...             yield np.array([i])
            ...
            >>> dataset = SplitedIterableDataset(start=2, end=9)
            >>> dataloader = DataLoader(
            ...     dataset,
            ...     num_workers=2,
            ...     batch_size=1,
            ...     drop_last=True)
            ...
            >>> for data in dataloader:
            ...     print(data) # doctest: +SKIP("The output depends on the environment.")
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[2]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[3]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[4]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[5]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[6]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[7]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[8]])

    splitting data copy in each worker by :code:`worker_init_fn`

        .. code-block:: python
            :name: code-example3

            >>> import math
            >>> import paddle
            >>> import numpy as np
            >>> from paddle.io import IterableDataset, DataLoader, get_worker_info

            >>> class RangeIterableDataset(IterableDataset):
            ...     def __init__(self, start, end):
            ...         self.start = start
            ...         self.end = end
            ...
            ...     def __iter__(self):
            ...         for i in range(self.start, self.end):
            ...             yield np.array([i])
            ...
            >>> dataset = RangeIterableDataset(start=2, end=9)

            >>> def worker_init_fn(worker_id):
            ...     worker_info = get_worker_info()
            ...
            ...     dataset = worker_info.dataset
            ...     start = dataset.start
            ...     end = dataset.end
            ...     num_per_worker = int(
            ...         math.ceil((end - start) / float(worker_info.num_workers)))
            ...
            ...     worker_id = worker_info.id
            ...     dataset.start = start + worker_id * num_per_worker
            ...     dataset.end = min(dataset.start + num_per_worker, end)
            ...
            >>> dataloader = DataLoader(
            ...     dataset,
            ...     num_workers=2,
            ...     batch_size=1,
            ...     drop_last=True,
            ...     worker_init_fn=worker_init_fn)
            ...
            >>> for data in dataloader:
            ...     print(data) # doctest: +SKIP("The output depends on the environment.")
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[2]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[3]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[4]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[5]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[6]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[7]])
            Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
                [[8]])

    """

    def __init__(self):
        pass

    def __iter__(self):
        # Subclasses must provide the actual sample stream.
        raise NotImplementedError(
            "'{}' not implement in class "
            "{}".format('__iter__', self.__class__.__name__)
        )

    def __getitem__(self, idx):
        # Random access is not part of the iterable-style contract.
        # The trailing space keeps "IterableDataset" and the class name apart.
        raise RuntimeError(
            "'{}' should not be called for IterableDataset "
            "{}".format('__getitem__', self.__class__.__name__)
        )

    def __len__(self):
        # The sample count is not defined for an iterable-style dataset.
        # The trailing space keeps "IterableDataset" and the class name apart.
        raise RuntimeError(
            "'{}' should not be called for IterableDataset "
            "{}".format('__len__', self.__class__.__name__)
        )
class TensorDataset(Dataset):
    """
    Dataset defined by a list of tensors.

    Each tensor should be in shape of [N, ...], while N is the sample number,
    and each tensor contains a field of sample, :code:`TensorDataset` retrieve
    each sample by indexing tensors in the 1st dimension.

    Args:
        tensors(list|tuple): A list/tuple of tensors with same shape in the 1st dimension.

    Returns:
        Dataset: a Dataset instance wrapping tensors.

    Raises:
        RuntimeError: if ``framework.in_dynamic_mode()`` reports False at
            construction time.
        AssertionError: if the tensors do not share the same size in the
            1st dimension.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> from paddle.io import TensorDataset


            >>> input_np = np.random.random([2, 3, 4]).astype('float32')
            >>> input = paddle.to_tensor(input_np)
            >>> label_np = np.random.random([2, 1]).astype('int32')
            >>> label = paddle.to_tensor(label_np)

            >>> dataset = TensorDataset([input, label])

            >>> for i in range(len(dataset)):
            ...     input, label = dataset[i]
            ...     # do something
    """

    def __init__(self, tensors):
        if not framework.in_dynamic_mode():
            raise RuntimeError(
                "TensorDataset can only be used in imperative mode"
            )
        # Every field must provide one entry per sample, so the leading
        # dimension of each tensor has to match that of the first tensor.
        assert all(
            tensor.shape[0] == tensors[0].shape[0] for tensor in tensors
        ), "tensors not have same shape of the 1st dimension"
        self.tensors = tensors

    def __getitem__(self, index):
        # One sample is the tuple of the index-th slice of every tensor.
        return tuple(tensor[index] for tensor in self.tensors)

    def __len__(self):
        # The sample count is the leading dimension shared by all tensors.
        return self.tensors[0].shape[0]
def to_list(value):
    """
    Normalize ``value`` into a list.

    Args:
        value: Any object, or None.

    Returns:
        A new list holding the items of ``value`` when it is a list or tuple,
        None when ``value`` is None, and the single-element list ``[value]``
        for anything else.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    # None passes through untouched; every other object gets wrapped.
    return value if value is None else [value]
class ComposeDataset(Dataset):
    """
    A map-style dataset that joins the fields of several datasets.

    The i-th sample of the composed dataset is the tuple formed by
    concatenating the fields of the i-th sample of every input dataset, in
    the order the datasets were given. All input datasets must be map-style
    and have the same length.

    Args:
        datasets(list of Dataset): List of datasets to be composed.

    Returns:
        Dataset: A Dataset which composes fields of multiple datasets.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> from paddle.io import Dataset, ComposeDataset


            >>> # define a random dataset
            >>> class RandomDataset(Dataset):
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([32]).astype('float32')
            ...         label = np.random.randint(0, 9, (1, )).astype('int64')
            ...         return image, label
            ...
            ...     def __len__(self):
            ...         return self.num_samples
            ...
            >>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)])
            >>> for i in range(len(dataset)):
            ...     image1, label1, image2, label2 = dataset[i]
            ...     # do something
    """

    def __init__(self, datasets):
        self.datasets = list(datasets)
        assert len(self.datasets) > 0, "input datasets should not be empty"

        # Walk the datasets once, comparing each length with its predecessor.
        previous = None
        for current in self.datasets:
            assert isinstance(
                current, Dataset
            ), "each input dataset should be paddle.io.Dataset"
            assert not isinstance(
                current, IterableDataset
            ), "paddle.io.IterableDataset not supported"
            if previous is not None:
                assert len(current) == len(
                    previous
                ), "lengths of datasets should be same"
            previous = current

    def __len__(self):
        # All members share one length, so the first one is representative.
        first = self.datasets[0]
        return len(first)

    def __getitem__(self, idx):
        # Flatten the fields of every member's idx-th sample into one tuple.
        return tuple(
            field
            for member in self.datasets
            for field in to_list(member[idx])
        )
class ChainDataset(IterableDataset):
    """
    An iterable-style dataset that chains several iterable-style datasets.

    Iterating over it yields every sample of the first dataset, then every
    sample of the second one, and so on. Each input must be an instance of
    :ref:`api_paddle_io_IterableDataset`.

    Args:
        datasets(list of IterableDatasets): List of datasets to be chained.

    Returns:
        paddle.io.IterableDataset: A Dataset which chains fields of multiple datasets.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> from paddle.io import IterableDataset, ChainDataset


            >>> # define a random dataset
            >>> class RandomDataset(IterableDataset):
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __iter__(self):
            ...         for i in range(10):
            ...             image = np.random.random([32]).astype('float32')
            ...             label = np.random.randint(0, 9, (1, )).astype('int64')
            ...             yield image, label
            ...
            >>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)])
            >>> for image, label in iter(dataset):
            ...     # do something
            ...     ...
    """

    def __init__(self, datasets):
        self.datasets = list(datasets)
        assert self.datasets, "input datasets should not be empty"
        for member in self.datasets:
            assert isinstance(
                member, IterableDataset
            ), "ChainDataset only support paddle.io.IterableDataset"

    def __iter__(self):
        # Exhaust each member in turn, in the order they were supplied.
        for member in self.datasets:
            yield from member
class Subset(Dataset):
    """
    View of a dataset restricted to the specified indices.

    Args:
        dataset (Dataset): The whole Dataset.
        indices (sequence): Indices in the whole set selected for subset.

    Returns:
        Dataset: A Dataset exposing only the selected samples of the original
        dataset, in the order given by ``indices``.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.io import Subset

            >>> # example 1:
            >>> a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
            >>> print(list(a))
            [1, 3]

            >>> # example 2:
            >>> b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
            >>> print(list(b))
            [2, 2]
    """

    def __init__(self, dataset, indices):
        self.indices = indices
        self.dataset = dataset

    def __getitem__(self, idx):
        # Translate the subset position into a position in the full dataset.
        source_position = self.indices[idx]
        return self.dataset[source_position]

    def __len__(self):
        # The subset holds exactly one sample per selected index.
        selected = self.indices
        return len(selected)
def random_split(dataset, lengths, generator=None):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    ``lengths`` may hold absolute sizes that sum to ``len(dataset)``, or
    fractions that sum to 1. Fractions are converted to sizes by flooring
    ``len(dataset) * fraction``; the samples left over are then handed out one
    at a time to the splits in round-robin order, starting from the first.

    Args:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths or fractions of splits to be produced
        generator (Generator, optional): Accepted for API compatibility. This
            implementation does not read it; the permutation comes from
            ``paddle.randperm``. Default is None.

    Returns:
        Datasets: A list of subset Datasets, which are the non-overlapping subsets of the original Dataset.

    Raises:
        ValueError: if a fraction lies outside [0, 1], or if the lengths do
            not sum to the length of ``dataset``.

    Examples:

        .. code-block:: python

            >>> import paddle

            >>> paddle.seed(2023)
            >>> a_list = paddle.io.random_split(range(10), [3, 7])
            >>> print(len(a_list))
            2

            >>> # output of the first subset
            >>> for idx, v in enumerate(a_list[0]):
            ...     print(idx, v) # doctest: +SKIP("The output depends on the environment.")
            0 7
            1 6
            2 5

            >>> # output of the second subset
            >>> for idx, v in enumerate(a_list[1]):
            ...     print(idx, v) # doctest: +SKIP("The output depends on the environment.")
            0 1
            1 9
            2 4
            3 2
            4 0
            5 3
            6 8
    """
    requested_total = sum(lengths)
    if math.isclose(requested_total, 1) and requested_total <= 1:
        # ``lengths`` holds fractions: turn them into absolute sample counts.
        counts = []
        for position, fraction in enumerate(lengths):
            if fraction < 0 or fraction > 1:
                raise ValueError(
                    f"Fraction at index {position} is not between 0 and 1"
                )
            counts.append(int(math.floor(len(dataset) * fraction)))

        # Flooring may leave samples unassigned; spread them round-robin.
        leftover = len(dataset) - sum(counts)
        for extra in range(leftover):
            counts[extra % len(counts)] += 1

        lengths = counts
        for position, count in enumerate(lengths):
            if count == 0:
                warnings.warn(
                    f"Length of split at index {position} is 0. "
                    f"This might result in an empty dataset."
                )

    # Cannot verify that dataset is Sized
    if sum(lengths) != len(dataset):  # type: ignore
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    # TODO(@Joejiong): support Variable or Tensor type with .tolist class member function.
    # For example var.item() and var.tolist()
    shuffled = paddle.randperm(sum(lengths)).tolist()

    # Each split takes the slice of the permutation that ends at its running total.
    return [
        Subset(dataset, shuffled[stop - size : stop])
        for stop, size in zip(_accumulate(lengths), lengths)
    ]
- def _accumulate(iterable, fn=lambda x, y: x + y):
- """
- Return running totals
- Args:
- iterable: any iterable object for example dataset.
- y (x): one element in the iterable object.
- fn (x, y): Defaults to lambdax.
- Yields:
- yields total from beginning iterator to current iterator.
- Example code:
- .. code-block:: python
- >>> list(_accumulate([1, 2, 3, 4, 5]))
- [1, 3, 6, 10, 15]
- >>> import operator
- >>> list(_accumulate([1, 2, 3, 4, 5], operator.mul))
- [1, 2, 6, 24, 120]
- """
- it = iter(iterable)
- try:
- total = next(it)
- except StopIteration:
- return
- yield total
- for element in it:
- total = fn(total, element)
- yield total
class ConcatDataset(Dataset):
    """
    Map-style dataset formed by concatenating several datasets end to end.

    This class is useful to assemble different existing datasets. Indices
    ``0 .. len(datasets[0]) - 1`` address the first dataset, the following
    indices address the second one, and so on. Negative indices count from
    the end of the concatenation.

    Args:
        datasets (sequence): List of datasets to be concatenated

    Returns:
        Dataset: A Dataset which concatenated by multiple datasets.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> from paddle.io import Dataset, ConcatDataset


            >>> # define a random dataset
            >>> class RandomDataset(Dataset):
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([32]).astype('float32')
            ...         label = np.random.randint(0, 9, (1, )).astype('int64')
            ...         return image, label
            ...
            ...     def __len__(self):
            ...         return self.num_samples
            ...
            >>> dataset = ConcatDataset([RandomDataset(10), RandomDataset(10)])
            >>> for i in range(len(dataset)):
            ...     image, label = dataset[i]
            ...     # do something
    """

    @staticmethod
    def cumsum(sequence):
        # Running totals of the lengths of the items in ``sequence``.
        totals = []
        running = 0
        for item in sequence:
            running += len(item)
            totals.append(running)
        return totals

    def __init__(self, datasets: Iterable[Dataset]):
        self.datasets = list(datasets)
        assert (
            len(self.datasets) > 0
        ), 'datasets should not be an empty iterable'
        for member in self.datasets:
            assert not isinstance(
                member, IterableDataset
            ), "ConcatDataset does not support IterableDataset"
        # cumulative_sizes[k] is the number of samples in datasets[0..k].
        self.cumulative_sizes = self.cumsum(self.datasets)

    def __len__(self):
        # The last running total is the overall sample count.
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        if idx < 0:
            # Map a negative index onto its non-negative equivalent.
            if -idx > len(self):
                raise ValueError(
                    "absolute value of index should not exceed dataset length"
                )
            idx = len(self) + idx

        # The first running total strictly greater than idx marks the owner.
        owner = bisect.bisect_right(self.cumulative_sizes, idx)
        # Subtract the samples held by all datasets that precede the owner.
        local_idx = (
            idx if owner == 0 else idx - self.cumulative_sizes[owner - 1]
        )
        return self.datasets[owner][local_idx]
|