training.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. # Copyright 2021 The HuggingFace Team. All rights reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import numpy as np
  15. import torch
  16. from torch.utils.data import DataLoader
  17. from accelerate.utils.dataclasses import DistributedType
  18. class RegressionDataset:
  19. def __init__(self, a=2, b=3, length=64, seed=None):
  20. rng = np.random.default_rng(seed)
  21. self.length = length
  22. self.x = rng.normal(size=(length,)).astype(np.float32)
  23. self.y = a * self.x + b + rng.normal(scale=0.1, size=(length,)).astype(np.float32)
  24. def __len__(self):
  25. return self.length
  26. def __getitem__(self, i):
  27. return {"x": self.x[i], "y": self.y[i]}
  28. class RegressionModel(torch.nn.Module):
  29. def __init__(self, a=0, b=0, double_output=False):
  30. super().__init__()
  31. self.a = torch.nn.Parameter(torch.tensor(a).float())
  32. self.b = torch.nn.Parameter(torch.tensor(b).float())
  33. self.first_batch = True
  34. def forward(self, x=None):
  35. if self.first_batch:
  36. print(f"Model dtype: {self.a.dtype}, {self.b.dtype}. Input dtype: {x.dtype}")
  37. self.first_batch = False
  38. return x * self.a + self.b
  39. def mocked_dataloaders(accelerator, batch_size: int = 16):
  40. from datasets import load_dataset
  41. from transformers import AutoTokenizer
  42. tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
  43. data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
  44. datasets = load_dataset("csv", data_files=data_files)
  45. label_list = datasets["train"].unique("label")
  46. label_to_id = {v: i for i, v in enumerate(label_list)}
  47. def tokenize_function(examples):
  48. # max_length=None => use the model max length (it's actually the default)
  49. outputs = tokenizer(
  50. examples["sentence1"], examples["sentence2"], truncation=True, max_length=None, padding="max_length"
  51. )
  52. if "label" in examples:
  53. outputs["labels"] = [label_to_id[l] for l in examples["label"]]
  54. return outputs
  55. # Apply the method we just defined to all the examples in all the splits of the dataset
  56. tokenized_datasets = datasets.map(
  57. tokenize_function,
  58. batched=True,
  59. remove_columns=["sentence1", "sentence2", "label"],
  60. )
  61. def collate_fn(examples):
  62. # On TPU it's best to pad everything to the same length or training will be very slow.
  63. if accelerator.distributed_type == DistributedType.XLA:
  64. return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
  65. return tokenizer.pad(examples, padding="longest", return_tensors="pt")
  66. # Instantiate dataloaders.
  67. train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=2)
  68. eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
  69. return train_dataloader, eval_dataloader
  70. def mocked_dataloaders_for_autoregressive_models(accelerator, batch_size: int = 16):
  71. from datasets import load_dataset
  72. from transformers import AutoTokenizer
  73. tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M")
  74. tokenizer.pad_token = tokenizer.eos_token
  75. data_files = {"train": "tests/test_samples/MRPC/train.csv", "validation": "tests/test_samples/MRPC/dev.csv"}
  76. datasets = load_dataset("csv", data_files=data_files)
  77. def tokenize_function(examples):
  78. # max_length=None => use the model max length (it's actually the default)
  79. outputs = tokenizer(examples["sentence1"], truncation=True, max_length=None, return_attention_mask=False)
  80. return outputs
  81. # Apply the method we just defined to all the examples in all the splits of the dataset
  82. # starting with the main process first:
  83. with accelerator.main_process_first():
  84. tokenized_datasets = datasets.map(
  85. tokenize_function,
  86. batched=True,
  87. remove_columns=["sentence1", "sentence2", "label"],
  88. )
  89. def collate_fn(examples):
  90. # On TPU it's best to pad everything to the same length or training will be very slow.
  91. max_length = (
  92. 128
  93. if accelerator.distributed_type == DistributedType.XLA
  94. else max([len(e["input_ids"]) for e in examples])
  95. )
  96. # When using mixed precision we want round multiples of 8/16
  97. if accelerator.mixed_precision == "fp8":
  98. pad_to_multiple_of = 16
  99. elif accelerator.mixed_precision != "no":
  100. pad_to_multiple_of = 8
  101. else:
  102. pad_to_multiple_of = None
  103. batch = tokenizer.pad(
  104. examples,
  105. padding="max_length",
  106. max_length=max_length + 1,
  107. pad_to_multiple_of=pad_to_multiple_of,
  108. return_tensors="pt",
  109. )
  110. batch["labels"] = batch["input_ids"][:, 1:]
  111. batch["input_ids"] = batch["input_ids"][:, :-1]
  112. batch["labels"] = torch.where(batch["labels"] == tokenizer.pad_token_id, -100, batch["labels"])
  113. return batch
  114. # Instantiate dataloaders.
  115. train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=False, collate_fn=collate_fn, batch_size=2)
  116. eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=1)
  117. return train_dataloader, eval_dataloader