# trainer.py 273 KB
  1. # Copyright 2020-present the HuggingFace Inc. team.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. The Trainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task.
  16. """
  17. import contextlib
  18. import copy
  19. import functools
  20. import glob
  21. import importlib.metadata
  22. import inspect
  23. import json
  24. import math
  25. import os
  26. import random
  27. import re
  28. import shutil
  29. import sys
  30. import tempfile
  31. import time
  32. import warnings
  33. from collections.abc import Iterator, Mapping
  34. from functools import partial
  35. from pathlib import Path
  36. from typing import TYPE_CHECKING, Any, Callable, Optional, Union
  37. # Integrations must be imported before ML frameworks:
  38. # ruff: isort: off
  39. from .integrations import (
  40. get_reporting_integration_callbacks,
  41. )
  42. # ruff: isort: on
  43. import huggingface_hub.utils as hf_hub_utils
  44. import numpy as np
  45. import safetensors.torch
  46. import torch
  47. import torch.distributed as dist
  48. from huggingface_hub import ModelCard, create_repo, upload_folder
  49. from packaging import version
  50. from torch import nn
  51. from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler
  52. from . import __version__
  53. from .configuration_utils import PretrainedConfig
  54. from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
  55. from .debug_utils import DebugOption, DebugUnderflowOverflow
  56. from .feature_extraction_sequence_utils import SequenceFeatureExtractor
  57. from .feature_extraction_utils import FeatureExtractionMixin
  58. from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
  59. from .image_processing_utils import BaseImageProcessor
  60. from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available
  61. from .integrations.tpu import tpu_spmd_dataloader
  62. from .modelcard import TrainingSummary
  63. from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
  64. from .models.auto.modeling_auto import (
  65. MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
  66. MODEL_MAPPING_NAMES,
  67. )
  68. from .optimization import Adafactor, get_scheduler
  69. from .processing_utils import ProcessorMixin
  70. from .pytorch_utils import (
  71. is_torch_greater_or_equal_than_2_3,
  72. )
  73. from .tokenization_utils_base import PreTrainedTokenizerBase
  74. from .trainer_callback import (
  75. CallbackHandler,
  76. DefaultFlowCallback,
  77. ExportableState,
  78. PrinterCallback,
  79. ProgressCallback,
  80. TrainerCallback,
  81. TrainerControl,
  82. TrainerState,
  83. )
  84. from .trainer_pt_utils import (
  85. DistributedTensorGatherer,
  86. EvalLoopContainer,
  87. IterableDatasetShard,
  88. LabelSmoother,
  89. LayerWiseDummyOptimizer,
  90. LengthGroupedSampler,
  91. SequentialDistributedSampler,
  92. distributed_broadcast_scalars,
  93. distributed_concat,
  94. find_batch_size,
  95. get_model_param_count,
  96. get_module_class_from_name,
  97. get_parameter_names,
  98. nested_concat,
  99. nested_detach,
  100. nested_numpify,
  101. nested_xla_mesh_reduce,
  102. reissue_pt_warnings,
  103. remove_dummy_checkpoint,
  104. set_rng_state_for_device,
  105. )
  106. from .trainer_utils import (
  107. PREFIX_CHECKPOINT_DIR,
  108. BestRun,
  109. EvalLoopOutput,
  110. EvalPrediction,
  111. HPSearchBackend,
  112. HubStrategy,
  113. PredictionOutput,
  114. RemoveColumnsCollator,
  115. SaveStrategy,
  116. TrainerMemoryTracker,
  117. TrainOutput,
  118. check_target_module_exists,
  119. default_compute_objective,
  120. denumpify_detensorize,
  121. enable_full_determinism,
  122. find_executable_batch_size,
  123. get_last_checkpoint,
  124. has_length,
  125. neftune_post_forward_hook,
  126. number_of_arguments,
  127. seed_worker,
  128. set_seed,
  129. speed_metrics,
  130. )
  131. from .training_args import OptimizerNames, ParallelMode, TrainingArguments
  132. from .utils import (
  133. ADAPTER_CONFIG_NAME,
  134. ADAPTER_SAFE_WEIGHTS_NAME,
  135. ADAPTER_WEIGHTS_NAME,
  136. CONFIG_NAME,
  137. GENERATION_CONFIG_NAME,
  138. SAFE_WEIGHTS_INDEX_NAME,
  139. SAFE_WEIGHTS_NAME,
  140. WEIGHTS_INDEX_NAME,
  141. WEIGHTS_NAME,
  142. XLA_FSDPV2_MIN_VERSION,
  143. PushInProgress,
  144. PushToHubMixin,
  145. can_return_loss,
  146. check_torch_load_is_safe,
  147. find_labels,
  148. is_accelerate_available,
  149. is_apollo_torch_available,
  150. is_bitsandbytes_available,
  151. is_datasets_available,
  152. is_galore_torch_available,
  153. is_grokadamw_available,
  154. is_in_notebook,
  155. is_liger_kernel_available,
  156. is_lomo_available,
  157. is_peft_available,
  158. is_sagemaker_dp_enabled,
  159. is_sagemaker_mp_enabled,
  160. is_schedulefree_available,
  161. is_torch_hpu_available,
  162. is_torch_mlu_available,
  163. is_torch_mps_available,
  164. is_torch_musa_available,
  165. is_torch_neuroncore_available,
  166. is_torch_npu_available,
  167. is_torch_optimi_available,
  168. is_torch_xla_available,
  169. is_torch_xpu_available,
  170. is_torchao_available,
  171. logging,
  172. strtobool,
  173. )
  174. from .utils.deprecation import deprecate_kwarg
  175. from .utils.import_utils import requires
  176. from .utils.quantization_config import QuantizationMethod
  177. DEFAULT_CALLBACKS = [DefaultFlowCallback]
  178. DEFAULT_PROGRESS_CALLBACK = ProgressCallback
  179. if is_in_notebook():
  180. from .utils.notebook import NotebookProgressCallback
  181. DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback
  182. if is_datasets_available():
  183. import datasets
  184. if is_torch_xla_available():
  185. import torch_xla.core.xla_model as xm
  186. import torch_xla.debug.metrics as met
  187. import torch_xla.runtime as xr
  188. from torch_xla import __version__ as XLA_VERSION
  189. IS_XLA_FSDPV2_POST_2_2 = version.parse(XLA_VERSION) >= version.parse(XLA_FSDPV2_MIN_VERSION)
  190. if IS_XLA_FSDPV2_POST_2_2:
  191. import torch_xla.distributed.spmd as xs
  192. else:
  193. IS_XLA_FSDPV2_POST_2_2 = False
  194. if is_sagemaker_mp_enabled():
  195. import smdistributed.modelparallel.torch as smp
  196. from smdistributed.modelparallel import __version__ as SMP_VERSION
  197. IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
  198. from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat
  199. else:
  200. IS_SAGEMAKER_MP_POST_1_10 = False
  201. if is_peft_available():
  202. from peft import PeftModel
  203. if is_accelerate_available():
  204. from accelerate import Accelerator, skip_first_batches
  205. from accelerate import __version__ as accelerate_version
  206. from accelerate.state import AcceleratorState
  207. from accelerate.utils import (
  208. AutocastKwargs,
  209. DistributedDataParallelKwargs,
  210. DistributedType,
  211. load_fsdp_model,
  212. load_fsdp_optimizer,
  213. save_fsdp_model,
  214. save_fsdp_optimizer,
  215. )
  216. DATA_SAMPLERS = [RandomSampler]
  217. if version.parse(accelerate_version) > version.parse("1.3.0"):
  218. from accelerate.utils import TorchTensorParallelPlugin
  219. from accelerate.data_loader import SeedableRandomSampler
  220. DATA_SAMPLERS += [SeedableRandomSampler]
  221. if is_deepspeed_available():
  222. from accelerate.utils import DeepSpeedSchedulerWrapper
  223. if is_accelerate_available("0.28.0"):
  224. from accelerate.utils import DataLoaderConfiguration
  225. def _is_peft_model(model):
  226. if is_peft_available():
  227. classes_to_check = (PeftModel,)
  228. # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0: https://github.com/huggingface/transformers/pull/28321
  229. if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"):
  230. from peft import PeftMixedModel
  231. classes_to_check = (*classes_to_check, PeftMixedModel)
  232. return isinstance(model, classes_to_check)
  233. return False
  234. def _get_fsdp_ckpt_kwargs():
  235. # TODO: @AjayP13, @younesbelkada replace this check with version check at the next `accelerate` release
  236. if is_accelerate_available() and "adapter_only" in list(inspect.signature(save_fsdp_model).parameters):
  237. return {"adapter_only": True}
  238. else:
  239. return {}
  240. def safe_globals():
  241. # Starting from version 2.4 PyTorch introduces a check for the objects loaded
  242. # with torch.load(weights_only=True). Starting from 2.6 weights_only=True becomes
  243. # a default and requires allowlisting of objects being loaded.
  244. # See: https://github.com/pytorch/pytorch/pull/137602
  245. # See: https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals
  246. # See: https://github.com/huggingface/accelerate/pull/3036
  247. if version.parse(torch.__version__).release < version.parse("2.6").release:
  248. return contextlib.nullcontext()
  249. np_core = np._core if version.parse(np.__version__) >= version.parse("2.0.0") else np.core
  250. allowlist = [np_core.multiarray._reconstruct, np.ndarray, np.dtype]
  251. # numpy >1.25 defines numpy.dtypes.UInt32DType, but below works for
  252. # all versions of numpy
  253. allowlist += [type(np.dtype(np.uint32))]
  254. return torch.serialization.safe_globals(allowlist)
if TYPE_CHECKING:
    # Import guarded by `TYPE_CHECKING`: only needed for type annotations.
    import optuna

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)


# Name of the files used for checkpointing
TRAINING_ARGS_NAME = "training_args.bin"
TRAINER_STATE_NAME = "trainer_state.json"
OPTIMIZER_NAME = "optimizer.pt"
SCALER_NAME = "scaler.pt"
OPTIMIZER_NAME_BIN = "optimizer.bin"
SCHEDULER_NAME = "scheduler.pt"
# NOTE(review): unlike the names above this one has no file extension - presumably a directory/prefix; confirm.
FSDP_MODEL_NAME = "pytorch_model_fsdp"
  266. @requires(
  267. backends=(
  268. "torch",
  269. "accelerate",
  270. )
  271. )
  272. class Trainer:
  273. """
  274. Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
  275. Args:
  276. model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
  277. The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.
  278. <Tip>
  279. [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
  280. your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
  281. models.
  282. </Tip>
  283. args ([`TrainingArguments`], *optional*):
  284. The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
  285. `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
  286. data_collator (`DataCollator`, *optional*):
  287. The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
  288. default to [`default_data_collator`] if no `processing_class` is provided, an instance of
  289. [`DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or tokenizer.
  290. train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*):
  291. The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
  292. `model.forward()` method are automatically removed.
  293. Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
  294. distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
  295. `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
  296. manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
  297. sets the seed of the RNGs used.
  298. eval_dataset (Union[`torch.utils.data.Dataset`, dict[str, `torch.utils.data.Dataset`], `datasets.Dataset`], *optional*):
  299. The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
  300. `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
  301. dataset prepending the dictionary key to the metric name.
  302. processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
  303. Processing class used to process the data. If provided, will be used to automatically process the inputs
  304. for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
  305. reuse the fine-tuned model.
  306. This supersedes the `tokenizer` argument, which is now deprecated.
  307. model_init (`Callable[[], PreTrainedModel]`, *optional*):
  308. A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
  309. from a new instance of the model as given by this function.
  310. The function may have zero arguments, or a single one containing the optuna/Ray Tune/SigOpt trial object, to
  311. be able to choose different architectures according to hyper parameters (such as layer count, sizes of
  312. inner layers, dropout probabilities etc).
  313. compute_loss_func (`Callable`, *optional*):
  314. A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
  315. batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) used by [`Trainer`].
  316. compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
  317. The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
  318. a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to
  319. `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered
  320. after the last eval batch to signal that the function needs to calculate and return the global summary
  321. statistics rather than accumulating the batch-level statistics
  322. callbacks (List of [`TrainerCallback`], *optional*):
  323. A list of callbacks to customize the training loop. Will add those to the list of default callbacks
  324. detailed in [here](callback).
  325. If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
  326. optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
  327. A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
  328. model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
  329. optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], dict[str, Any]]`, *optional*):
  330. A tuple containing the optimizer class and keyword arguments to use.
  331. Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument.
  332. Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer.
  333. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
  334. A function that preprocesses the logits right before caching them at each evaluation step. Must take two
  335. tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
  336. by this function will be reflected in the predictions received by `compute_metrics`.
  337. Note that the labels (second parameter) will be `None` if the dataset does not have them.
  338. Important attributes:
  339. - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
  340. subclass.
  341. - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
  342. original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
  343. the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
  344. model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
  345. - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
  346. data parallelism, this means some of the model layers are split on different GPUs).
  347. - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set
  348. to `False` if model parallel or deepspeed is used, or if the default
  349. `TrainingArguments.place_model_on_device` is overridden to return `False` .
  350. - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
  351. in `train`)
  352. """
  353. # Those are used as methods of the Trainer in examples.
  354. from .trainer_pt_utils import _get_learning_rate, log_metrics, metrics_format, save_metrics, save_state
  355. @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True)
  356. def __init__(
  357. self,
  358. model: Union[PreTrainedModel, nn.Module, None] = None,
  359. args: Optional[TrainingArguments] = None,
  360. data_collator: Optional[DataCollator] = None,
  361. train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
  362. eval_dataset: Optional[Union[Dataset, dict[str, Dataset], "datasets.Dataset"]] = None,
  363. processing_class: Optional[
  364. Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
  365. ] = None,
  366. model_init: Optional[Callable[..., PreTrainedModel]] = None,
  367. compute_loss_func: Optional[Callable] = None,
  368. compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
  369. callbacks: Optional[list[TrainerCallback]] = None,
  370. optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
  371. optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None,
  372. preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
  373. ):
  374. if args is None:
  375. output_dir = "tmp_trainer"
  376. logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
  377. args = TrainingArguments(output_dir=output_dir)
  378. if args.batch_eval_metrics and compute_metrics is not None:
  379. if "compute_result" not in inspect.signature(compute_metrics).parameters:
  380. raise ValueError(
  381. "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`"
  382. " boolean argument which will be triggered after the last batch of the eval set to signal that the"
  383. " summary statistics should be returned by the function."
  384. )
  385. if args.eval_strategy is not None and args.eval_strategy != "no" and eval_dataset is None:
  386. raise ValueError(
  387. f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. "
  388. )
  389. if args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end:
  390. if args.metric_for_best_model is None:
  391. raise ValueError(
  392. "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`."
  393. )
  394. self.args = args
  395. self.compute_loss_func = compute_loss_func
  396. # Seed must be set before instantiating the model when using model
  397. enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
  398. self.hp_name = None
  399. self.deepspeed = None
  400. self.is_in_train = False
  401. self.model = model
  402. self.create_accelerator_and_postprocess()
  403. # memory metrics - must set up as early as possible
  404. self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
  405. self._memory_tracker.start()
  406. # set the correct log level depending on the node
  407. log_level = args.get_process_log_level()
  408. logging.set_verbosity(log_level)
  409. # force device and distributed setup init explicitly
  410. args._setup_devices
  411. if model is None:
  412. if model_init is not None:
  413. self.model_init = model_init
  414. model = self.call_model_init()
  415. else:
  416. raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument")
  417. else:
  418. if model_init is not None:
  419. warnings.warn(
  420. "`Trainer` requires either a `model` or `model_init` argument, but not both. `model_init` will"
  421. " overwrite your model when calling the `train` method. This will become a fatal error in the next"
  422. " release.",
  423. FutureWarning,
  424. )
  425. self.model_init = model_init
  426. if model.__class__.__name__ in MODEL_MAPPING_NAMES:
  427. raise ValueError(
  428. f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
  429. "computes hidden states and does not accept any labels. You should choose a model with a head "
  430. "suitable for your task like any of the `AutoModelForXxx` listed at "
  431. "https://huggingface.co/docs/transformers/model_doc/auto"
  432. )
  433. if getattr(model, "is_parallelizable", False) and getattr(model, "model_parallel", False):
  434. self.is_model_parallel = True
  435. else:
  436. self.is_model_parallel = False
  437. if getattr(model, "hf_device_map", None) is not None:
  438. devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
  439. if len(devices) > 1:
  440. self.is_model_parallel = True
  441. elif len(devices) == 1:
  442. self.is_model_parallel = self.args.device != torch.device(devices[0])
  443. else:
  444. self.is_model_parallel = False
  445. # warn users
  446. if self.is_model_parallel:
  447. logger.info(
  448. "You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set"
  449. " to `True` to avoid any unexpected behavior such as device placement mismatching."
  450. )
  451. if self.args.use_liger_kernel:
  452. if is_liger_kernel_available():
  453. from liger_kernel.transformers import _apply_liger_kernel_to_instance
  454. # Prepare kernel config - use provided config or default (empty dict for default behavior)
  455. kernel_config = self.args.liger_kernel_config if self.args.liger_kernel_config is not None else {}
  456. if isinstance(model, PreTrainedModel):
  457. # Patch the model with liger kernels. Use the specified or default kernel configurations.
  458. _apply_liger_kernel_to_instance(model=model, **kernel_config)
  459. elif hasattr(model, "get_base_model") and isinstance(model.get_base_model(), PreTrainedModel):
  460. # Patch the base model with liger kernels where model is a PeftModel. Use the specified or default kernel configurations.
  461. _apply_liger_kernel_to_instance(model=model.get_base_model(), **kernel_config)
  462. else:
  463. logger.warning(
  464. "The model is not an instance of PreTrainedModel. No liger kernels will be applied."
  465. )
  466. else:
  467. raise ImportError(
  468. "You have set `use_liger_kernel` to `True` but liger-kernel >= 0.3.0 is not available. "
  469. "Please install it with `pip install liger-kernel`"
  470. )
  471. _is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr(
  472. model, "_hf_peft_config_loaded", False
  473. )
  474. _quantization_method_supports_training = (
  475. getattr(model, "hf_quantizer", None) is not None and model.hf_quantizer.is_trainable
  476. )
  477. _is_model_quantized_and_qat_trainable = getattr(model, "hf_quantizer", None) is not None and getattr(
  478. model.hf_quantizer, "is_qat_trainable", False
  479. )
  480. # Filter out quantized + compiled models
  481. if _is_quantized_and_base_model and hasattr(model, "_orig_mod"):
  482. raise ValueError(
  483. "You cannot fine-tune quantized model with `torch.compile()` make sure to pass a non-compiled model when fine-tuning a quantized model with PEFT"
  484. )
  485. # At this stage the model is already loaded
  486. if _is_quantized_and_base_model and not _is_peft_model(model) and not _is_model_quantized_and_qat_trainable:
  487. raise ValueError(
  488. "You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of"
  489. " the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft"
  490. " for more details"
  491. )
  492. elif _is_quantized_and_base_model and not _quantization_method_supports_training:
  493. raise ValueError(
  494. f"The model you are trying to fine-tune is quantized with {model.hf_quantizer.quantization_config.quant_method}"
  495. " but that quantization method do not support training. Please open an issue on GitHub: https://github.com/huggingface/transformers"
  496. f" to request the support for training support for {model.hf_quantizer.quantization_config.quant_method}"
  497. )
  498. self.is_fsdp_xla_enabled = args.fsdp_config["xla"]
  499. if len(args.fsdp) > 0:
  500. if self.is_deepspeed_enabled:
  501. raise ValueError(
  502. "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags."
  503. )
  504. if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED:
  505. raise ValueError("Using fsdp only works in distributed training.")
  506. # one place to sort out whether to place the model on device or not
  507. # postpone switching model to cuda when:
  508. # 1. MP - since we are trying to fit a much bigger than 1 gpu model
  509. # 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway,
  510. # and we only use deepspeed for training at the moment
  511. # 3. full bf16 or fp16 eval - since the model needs to be cast to the right dtype first
  512. # 4. FSDP - same as MP
  513. self.place_model_on_device = args.place_model_on_device
  514. if (
  515. self.is_model_parallel
  516. or self.is_deepspeed_enabled
  517. or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train)
  518. or self.is_fsdp_xla_enabled
  519. or self.is_fsdp_enabled
  520. ):
  521. self.place_model_on_device = False
  522. default_collator = (
  523. DataCollatorWithPadding(processing_class)
  524. if processing_class is not None
  525. and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
  526. else default_data_collator
  527. )
  528. self.data_collator = data_collator if data_collator is not None else default_collator
  529. self.train_dataset = train_dataset
  530. self.eval_dataset = eval_dataset
  531. self.processing_class = processing_class
  532. # Bnb Quantized models doesn't support `.to` operation.
  533. if (
  534. self.place_model_on_device
  535. and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES
  536. ):
  537. self._move_model_to_device(model, args.device)
  538. # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
  539. if self.is_model_parallel:
  540. self.args._n_gpu = 1
  541. # later use `self.model is self.model_wrapped` to check if it's wrapped or not
  542. self.model_wrapped = model
  543. self.model = model
  544. # Just in case the model was wrapped outside of the `Trainer`
  545. unwrapped_model = self.accelerator.unwrap_model(model)
  546. # We also unwrap peft model
  547. if _is_peft_model(unwrapped_model):
  548. if hasattr(unwrapped_model, "get_base_model"):
  549. unwrapped_model = unwrapped_model.get_base_model()
  550. elif hasattr(unwrapped_model, "base_model") and hasattr(unwrapped_model.base_model, "model"):
  551. unwrapped_model = unwrapped_model.base_model.model
  552. else:
  553. raise AttributeError("Cannot extract base model safely from this PEFT wrapper.")
  554. # Check if the model has explicit setup for loss kwargs,
  555. # if not, check if `**kwargs` are in model.forward
  556. if hasattr(unwrapped_model, "accepts_loss_kwargs"):
  557. self.model_accepts_loss_kwargs = unwrapped_model.accepts_loss_kwargs
  558. else:
  559. forward_params = inspect.signature(unwrapped_model.forward).parameters
  560. self.model_accepts_loss_kwargs = any(
  561. k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values()
  562. )
  563. self.neftune_noise_alpha = args.neftune_noise_alpha
  564. self.compute_metrics = compute_metrics
  565. self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
  566. self.optimizer, self.lr_scheduler = optimizers
  567. self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs
  568. if self.optimizer_cls_and_kwargs is not None and self.optimizer is not None:
  569. raise RuntimeError("Passing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.")
  570. if model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None):
  571. raise RuntimeError(
  572. "Passing a `model_init` is incompatible with providing the `optimizers` argument. "
  573. "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
  574. )
  575. if is_torch_xla_available() and self.optimizer is not None:
  576. for param in self.model.parameters():
  577. model_device = param.device
  578. break
  579. for param_group in self.optimizer.param_groups:
  580. if len(param_group["params"]) > 0:
  581. optimizer_device = param_group["params"][0].device
  582. break
  583. if model_device != optimizer_device:
  584. raise ValueError(
  585. "The model and the optimizer parameters are not on the same device, which probably means you"
  586. " created an optimizer around your model **before** putting on the device and passing it to the"
  587. " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and"
  588. " `model.to(xm.xla_device())` is performed before the optimizer creation in your script."
  589. )
  590. if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and (
  591. self.optimizer is not None or self.lr_scheduler is not None
  592. ):
  593. raise RuntimeError(
  594. "Passing `optimizers` is not allowed if PyTorch FSDP is enabled. "
  595. "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
  596. )
  597. default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
  598. callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
  599. self.callback_handler = CallbackHandler(
  600. callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
  601. )
  602. self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
  603. # Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
  604. self._loggers_initialized = False
  605. # Create distant repo and output directory if needed
  606. self.hub_model_id = None
  607. if self.args.push_to_hub:
  608. self.init_hf_repo()
  609. if self.args.should_save:
  610. os.makedirs(self.args.output_dir, exist_ok=True)
  611. if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
  612. raise TypeError("The `data_collator` should be a simple callable (function, class with `__call__`).")
  613. if args.max_steps > 0 and args.num_train_epochs > 0:
  614. logger.info("max_steps is given, it will override any value given in num_train_epochs")
  615. if train_dataset is not None and not has_length(train_dataset) and args.max_steps <= 0:
  616. raise ValueError(
  617. "The train_dataset does not implement __len__, max_steps has to be specified. "
  618. "The number of steps needs to be known in advance for the learning rate scheduler."
  619. )
  620. if (
  621. train_dataset is not None
  622. and isinstance(train_dataset, torch.utils.data.IterableDataset)
  623. and args.group_by_length
  624. ):
  625. raise ValueError("the `--group_by_length` option is only available for `Dataset`, not `IterableDataset")
  626. self._signature_columns = None
  627. # Mixed precision setup
  628. self.use_apex = False
  629. self.use_cpu_amp = False
  630. # Mixed precision setup for SageMaker Model Parallel
  631. if is_sagemaker_mp_enabled():
  632. # BF16 + model parallelism in SageMaker: currently not supported, raise an error
  633. if args.bf16:
  634. raise ValueError("SageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead ")
  635. if IS_SAGEMAKER_MP_POST_1_10:
  636. # When there's mismatch between SMP config and trainer argument, use SMP config as truth
  637. if args.fp16 != smp.state.cfg.fp16:
  638. logger.warning(
  639. f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, "
  640. f"but FP16 provided in trainer argument is {args.fp16}, "
  641. f"setting to {smp.state.cfg.fp16}"
  642. )
  643. args.fp16 = smp.state.cfg.fp16
  644. else:
  645. # smp < 1.10 does not support fp16 in trainer.
  646. if hasattr(smp.state.cfg, "fp16"):
  647. logger.warning(
  648. f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, "
  649. "but SageMaker Model Parallelism < 1.10 does not support FP16 in trainer."
  650. )
  651. if (args.fp16 or args.bf16) and args.half_precision_backend == "auto":
  652. if args.device == torch.device("cpu"):
  653. if args.fp16:
  654. if not is_torch_greater_or_equal_than_2_3:
  655. raise ValueError("Tried to use `fp16` but it is not supported on cpu")
  656. else:
  657. args.half_precision_backend = "cpu_amp"
  658. logger.info(f"Using {args.half_precision_backend} half precision backend")
  659. if (args.fp16 or args.bf16) and not (self.is_deepspeed_enabled or is_sagemaker_mp_enabled()):
  660. # deepspeed and SageMaker Model Parallel manage their own half precision
  661. if args.half_precision_backend == "cpu_amp":
  662. self.use_cpu_amp = True
  663. self.amp_dtype = torch.bfloat16
  664. elif args.half_precision_backend == "apex":
  665. self.use_apex = True
  666. # Label smoothing
  667. if self.args.label_smoothing_factor != 0:
  668. self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
  669. else:
  670. self.label_smoother = None
  671. # Check for multi-label classification incompatibility
  672. if self.args.label_smoothing_factor > 0:
  673. if getattr(self.model.config, "problem_type", None) == "multi_label_classification":
  674. warnings.warn(
  675. "Label smoothing is not compatible with multi-label classification. "
  676. "Disabling label smoothing for this training run.",
  677. UserWarning,
  678. )
  679. self.label_smoother = None
  680. self.control = TrainerControl()
  681. self.state = TrainerState(
  682. is_local_process_zero=self.is_local_process_zero(),
  683. is_world_process_zero=self.is_world_process_zero(),
  684. stateful_callbacks=[
  685. cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
  686. ],
  687. )
  688. # Internal variable to count flos in each process, will be accumulated in `self.state.total_flos` then
  689. # returned to 0 every time flos need to be logged
  690. self.current_flos = 0
  691. self.hp_search_backend = None
  692. model_to_inspect = self.model
  693. if _is_peft_model(self.model):
  694. if hasattr(self.model, "get_base_model"):
  695. model_to_inspect = self.model.get_base_model()
  696. else:
  697. # PeftMixedModel do not provide a `get_base_model` method
  698. model_to_inspect = self.model.base_model.model
  699. default_label_names = find_labels(model_to_inspect.__class__)
  700. self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
  701. self.can_return_loss = can_return_loss(model_to_inspect.__class__)
  702. self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)
  703. # Internal variables to help with automatic batch size reduction
  704. self._train_batch_size = args.train_batch_size
  705. self._created_lr_scheduler = False
  706. # very last
  707. self._memory_tracker.stop_and_update_metrics()
  708. self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
  709. if self.is_fsdp_xla_v2_enabled:
  710. if not IS_XLA_FSDPV2_POST_2_2:
  711. raise ValueError("FSDPv2 requires `torch_xla` 2.2 or higher.")
  712. # Prepare the SPMD mesh that is going to be used by the data loader and the FSDPv2 wrapper.
  713. # Tensor axis is just a placeholder where it will not be used in FSDPv2.
  714. num_devices = xr.global_runtime_device_count()
  715. xs.set_global_mesh(xs.Mesh(np.array(range(num_devices)), (num_devices, 1), axis_names=("fsdp", "tensor")))
  716. self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled
    @property
    def tokenizer(self) -> Optional[PreTrainedTokenizerBase]:
        """Deprecated alias: logs a deprecation warning and returns `self.processing_class`."""
        logger.warning("Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.")
        return self.processing_class

    @tokenizer.setter
    def tokenizer(self, processing_class) -> None:
        """Deprecated setter: logs a deprecation warning and stores the value in `self.processing_class`."""
        logger.warning(
            "Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead."
        )
        self.processing_class = processing_class
  727. def _activate_neftune(self, model):
  728. r"""
  729. Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper:
  730. https://huggingface.co/papers/2310.05914
  731. """
  732. unwrapped_model = self.accelerator.unwrap_model(model)
  733. if _is_peft_model(unwrapped_model):
  734. embeddings = unwrapped_model.base_model.model.get_input_embeddings()
  735. else:
  736. embeddings = unwrapped_model.get_input_embeddings()
  737. del unwrapped_model
  738. embeddings.neftune_noise_alpha = self.neftune_noise_alpha
  739. hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)
  740. self.neftune_hook_handle = hook_handle
  741. return model
  742. def _deactivate_neftune(self, model):
  743. """
  744. Deactivates the neftune method. Make sure to call `_activate_neftune` first.
  745. """
  746. if not hasattr(self, "neftune_hook_handle"):
  747. raise ValueError("Neftune is not activated make sure to call `trainer._activate_neftune()` first")
  748. unwrapped_model = self.accelerator.unwrap_model(model)
  749. if _is_peft_model(unwrapped_model):
  750. embeddings = unwrapped_model.base_model.model.get_input_embeddings()
  751. else:
  752. embeddings = unwrapped_model.get_input_embeddings()
  753. self.neftune_hook_handle.remove()
  754. del embeddings.neftune_noise_alpha, unwrapped_model
    def add_callback(self, callback):
        """
        Add a callback to the current list of [`~transformers.TrainerCallback`].

        Delegates to `self.callback_handler.add_callback`.

        Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                first case, will instantiate a member of that class.
        """
        self.callback_handler.add_callback(callback)
    def pop_callback(self, callback):
        """
        Remove a callback from the current list of [`~transformers.TrainerCallback`] and returns it.

        If the callback is not found, returns `None` (and no error is raised).

        Delegates to `self.callback_handler.pop_callback`.

        Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                first case, will pop the first member of that class found in the list of callbacks.

        Returns:
            [`~transformers.TrainerCallback`]: The callback removed, if found.
        """
        return self.callback_handler.pop_callback(callback)
    def remove_callback(self, callback):
        """
        Remove a callback from the current list of [`~transformers.TrainerCallback`].

        Delegates to `self.callback_handler.remove_callback`.

        Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                first case, will remove the first member of that class found in the list of callbacks.
        """
        self.callback_handler.remove_callback(callback)
  785. def _move_model_to_device(self, model, device):
  786. if getattr(model, "hf_device_map", None) is not None:
  787. logger.warning(
  788. "The model is already on multiple devices. Skipping the move to device specified in `args`."
  789. )
  790. return
  791. model = model.to(device)
  792. # Moving a model to an XLA device disconnects the tied weights, so we have to retie them.
  793. if self.args.parallel_mode == ParallelMode.TPU and hasattr(model, "tie_weights"):
  794. model.tie_weights()
  795. def _align_special_tokens(self):
  796. """
  797. Aligns the special tokens of the tokenizer with the model configs.
  798. A new tokens may be defined in the tokenizer for fine-tuning purposes, e.g. an "end of turn" token may be
  799. added on chat models. In that case, we want the model configs to be aligned with the tokenizer, so that all
  800. downstream uses work as expected. This alignment should happen before training, to ensure the prediction step
  801. uses the new tokens as well.
  802. """
  803. if isinstance(self.processing_class, ProcessorMixin):
  804. tokenizer: PreTrainedTokenizerBase = self.processing_class.tokenizer
  805. else:
  806. tokenizer = self.processing_class
  807. model_has_generation_config = (
  808. hasattr(self.model, "generation_config") and self.model.generation_config is not None
  809. )
  810. updated_tokens = {}
  811. # 1 - Align EOS token. EOS is more complex than the others, as `generation_config` may hold more than one EOS
  812. # token.
  813. tokenizer_has_new_eos = tokenizer.eos_token_id != self.model.config.eos_token_id
  814. if model_has_generation_config:
  815. # `generation_config.eos_token_id` is None: direct comparison
  816. if self.model.generation_config.eos_token_id is None:
  817. tokenizer_has_new_eos |= tokenizer.eos_token_id != self.model.generation_config.eos_token_id
  818. else:
  819. # `generation_config.eos_token_id` is an `int`: convert it to list (and continue below)
  820. if isinstance(self.model.generation_config.eos_token_id, int):
  821. self.model.generation_config.eos_token_id = [self.model.generation_config.eos_token_id]
  822. # `generation_config.eos_token_id` is a `list`: check if the tokenizer's EOS token is in the list
  823. tokenizer_has_new_eos |= tokenizer.eos_token_id not in self.model.generation_config.eos_token_id
  824. if tokenizer_has_new_eos:
  825. updated_tokens["eos_token_id"] = tokenizer.eos_token_id
  826. self.model.config.eos_token_id = tokenizer.eos_token_id
  827. # The generation config may hold more than one EOS token. We preserve the original EOS tokens: any of the
  828. # EOS tokens defined here will halt generation.
  829. if model_has_generation_config:
  830. all_eos_tokens = [tokenizer.eos_token_id]
  831. if self.model.generation_config.eos_token_id is not None:
  832. all_eos_tokens += list(self.model.generation_config.eos_token_id)
  833. self.model.generation_config.eos_token_id = [token for token in all_eos_tokens if token is not None]
  834. # 2 - Align BOS
  835. tokenizer_has_new_bos = tokenizer.bos_token_id != self.model.config.bos_token_id
  836. if model_has_generation_config:
  837. tokenizer_has_new_bos |= tokenizer.bos_token_id != self.model.generation_config.bos_token_id
  838. if tokenizer_has_new_bos:
  839. updated_tokens["bos_token_id"] = tokenizer.bos_token_id
  840. self.model.config.bos_token_id = tokenizer.bos_token_id
  841. if model_has_generation_config:
  842. self.model.generation_config.bos_token_id = tokenizer.bos_token_id
  843. # 3 - Align PAD
  844. tokenizer_has_new_pad = tokenizer.pad_token_id != self.model.config.pad_token_id
  845. if model_has_generation_config:
  846. tokenizer_has_new_pad |= tokenizer.pad_token_id != self.model.generation_config.pad_token_id
  847. if tokenizer_has_new_pad:
  848. updated_tokens["pad_token_id"] = tokenizer.pad_token_id
  849. self.model.config.pad_token_id = tokenizer.pad_token_id
  850. if model_has_generation_config:
  851. self.model.generation_config.pad_token_id = tokenizer.pad_token_id
  852. # 4 - Warn users about the changes
  853. if len(updated_tokens) > 0:
  854. logger.warning(
  855. "The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. "
  856. "The model config and generation config were aligned accordingly, being updated with the tokenizer's "
  857. f"values. Updated tokens: {updated_tokens}."
  858. )
  859. def _set_signature_columns_if_needed(self):
  860. if self._signature_columns is None:
  861. # Inspect model forward signature to keep only the arguments it accepts.
  862. model_to_inspect = self.model
  863. if _is_peft_model(self.model):
  864. if hasattr(self.model, "get_base_model"):
  865. model_to_inspect = self.model.get_base_model()
  866. else:
  867. # PeftMixedModel do not provide a `get_base_model` method
  868. model_to_inspect = self.model.base_model.model
  869. signature = inspect.signature(model_to_inspect.forward)
  870. self._signature_columns = list(signature.parameters.keys())
  871. # Labels may be named label or label_ids, the default data collator handles that.
  872. self._signature_columns += list(set(["label", "label_ids"] + self.label_names))
  873. def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
  874. if not self.args.remove_unused_columns:
  875. return dataset
  876. self._set_signature_columns_if_needed()
  877. signature_columns = self._signature_columns
  878. ignored_columns = list(set(dataset.column_names) - set(signature_columns))
  879. if len(ignored_columns) > 0:
  880. dset_description = "" if description is None else f"in the {description} set"
  881. logger.info(
  882. f"The following columns {dset_description} don't have a corresponding argument in "
  883. f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
  884. f" If {', '.join(ignored_columns)} are not expected by `{self.model.__class__.__name__}.forward`, "
  885. " you can safely ignore this message."
  886. )
  887. columns = [k for k in signature_columns if k in dataset.column_names]
  888. if len(columns) == 0:
  889. raise ValueError(
  890. f"No columns in the dataset match the model's forward method signature: ({', '.join(signature_columns)}). "
  891. f"The following columns have been ignored: [{', '.join(ignored_columns)}]. "
  892. "Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`."
  893. )
  894. if version.parse(datasets.__version__) < version.parse("1.4.0"):
  895. dataset.set_format(
  896. type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"]
  897. )
  898. return dataset
  899. else:
  900. return dataset.remove_columns(ignored_columns)
  901. def _get_collator_with_removed_columns(
  902. self, data_collator: Callable, description: Optional[str] = None
  903. ) -> Callable:
  904. """Wrap the data collator in a callable removing unused columns."""
  905. if not self.args.remove_unused_columns:
  906. return data_collator
  907. self._set_signature_columns_if_needed()
  908. signature_columns = self._signature_columns
  909. remove_columns_collator = RemoveColumnsCollator(
  910. data_collator=data_collator,
  911. signature_columns=signature_columns,
  912. logger=logger,
  913. description=description,
  914. model_name=self.model.__class__.__name__,
  915. )
  916. return remove_columns_collator
  917. def _get_train_sampler(self, train_dataset: Optional[Dataset] = None) -> Optional[torch.utils.data.Sampler]:
  918. if train_dataset is None:
  919. train_dataset = self.train_dataset
  920. if train_dataset is None or not has_length(train_dataset):
  921. return None
  922. # Build the sampler.
  923. if self.args.group_by_length:
  924. if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
  925. lengths = (
  926. train_dataset[self.args.length_column_name]
  927. if self.args.length_column_name in train_dataset.column_names
  928. else None
  929. )
  930. else:
  931. lengths = None
  932. model_input_name = (
  933. self.processing_class.model_input_names[0] if self.processing_class is not None else None
  934. )
  935. return LengthGroupedSampler(
  936. self.args.train_batch_size * self.args.gradient_accumulation_steps,
  937. dataset=train_dataset,
  938. lengths=lengths,
  939. model_input_name=model_input_name,
  940. )
  941. else:
  942. return RandomSampler(train_dataset)
  943. def _get_dataloader(
  944. self,
  945. dataset: Dataset,
  946. description: str,
  947. batch_size: int,
  948. sampler_fn: Optional[Callable[[Dataset], torch.utils.data.Sampler]] = None,
  949. is_training: bool = False,
  950. dataloader_key: Optional[str] = None,
  951. ) -> DataLoader:
  952. """Create a [`~torch.utils.data.DataLoader`] from the given dataset."""
  953. data_collator = self.data_collator
  954. if is_datasets_available() and isinstance(dataset, datasets.Dataset):
  955. dataset = self._remove_unused_columns(dataset, description=description)
  956. else:
  957. data_collator = self._get_collator_with_removed_columns(self.data_collator, description=description)
  958. dataloader_params = {
  959. "batch_size": batch_size,
  960. "collate_fn": data_collator,
  961. "num_workers": self.args.dataloader_num_workers,
  962. "pin_memory": self.args.dataloader_pin_memory,
  963. "persistent_workers": self.args.dataloader_persistent_workers,
  964. }
  965. if not isinstance(dataset, torch.utils.data.IterableDataset):
  966. if sampler_fn is not None:
  967. dataloader_params["sampler"] = sampler_fn(dataset)
  968. dataloader_params["drop_last"] = self.args.dataloader_drop_last
  969. dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
  970. if is_training:
  971. dataloader_params["worker_init_fn"] = partial(
  972. seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index
  973. )
  974. dataloader = self.accelerator.prepare(DataLoader(dataset, **dataloader_params))
  975. # Store the prepared dataloader for subsequent evaluations if using persistent workers.
  976. if dataloader_key is not None and self.args.dataloader_persistent_workers:
  977. if hasattr(self, "_eval_dataloaders"):
  978. self._eval_dataloaders[dataloader_key] = dataloader
  979. else:
  980. self._eval_dataloaders = {dataloader_key: dataloader}
  981. return dataloader
  982. def get_train_dataloader(self) -> DataLoader:
  983. """
  984. Returns the training [`~torch.utils.data.DataLoader`].
  985. Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
  986. training if necessary) otherwise.
  987. Subclass and override this method if you want to inject some custom behavior.
  988. """
  989. if self.train_dataset is None:
  990. raise ValueError("Trainer: training requires a train_dataset.")
  991. return self._get_dataloader(
  992. dataset=self.train_dataset,
  993. description="Training",
  994. batch_size=self._train_batch_size,
  995. sampler_fn=self._get_train_sampler,
  996. is_training=True,
  997. )
  998. def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]:
  999. if eval_dataset is None or not has_length(eval_dataset):
  1000. return None
  1001. # Build the sampler.
  1002. # Deprecated code
  1003. if self.args.use_legacy_prediction_loop:
  1004. if is_torch_xla_available():
  1005. return SequentialDistributedSampler(
  1006. eval_dataset, num_replicas=xr.world_size(), rank=xr.global_ordinal()
  1007. )
  1008. elif is_sagemaker_mp_enabled():
  1009. return SequentialDistributedSampler(
  1010. eval_dataset,
  1011. num_replicas=smp.dp_size(),
  1012. rank=smp.dp_rank(),
  1013. batch_size=self.args.per_device_eval_batch_size,
  1014. )
  1015. else:
  1016. return SequentialSampler(eval_dataset)
  1017. if self.args.group_by_length:
  1018. if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
  1019. lengths = (
  1020. eval_dataset[self.args.length_column_name]
  1021. if self.args.length_column_name in eval_dataset.column_names
  1022. else None
  1023. )
  1024. else:
  1025. lengths = None
  1026. model_input_name = (
  1027. self.processing_class.model_input_names[0] if self.processing_class is not None else None
  1028. )
  1029. return LengthGroupedSampler(
  1030. self.args.eval_batch_size,
  1031. dataset=eval_dataset,
  1032. lengths=lengths,
  1033. model_input_name=model_input_name,
  1034. )
  1035. if self.args.world_size <= 1:
  1036. return SequentialSampler(eval_dataset)
  1037. else:
  1038. return None
  1039. def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader:
  1040. """
  1041. Returns the evaluation [`~torch.utils.data.DataLoader`].
  1042. Subclass and override this method if you want to inject some custom behavior.
  1043. Args:
  1044. eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*):
  1045. If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed.
  1046. """
  1047. if eval_dataset is None and self.eval_dataset is None:
  1048. raise ValueError("Trainer: evaluation requires an eval_dataset.")
  1049. # If we have persistent workers, don't do a fork bomb especially as eval datasets
  1050. # don't change during training
  1051. dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval"
  1052. if (
  1053. hasattr(self, "_eval_dataloaders")
  1054. and dataloader_key in self._eval_dataloaders
  1055. and self.args.dataloader_persistent_workers
  1056. ):
  1057. return self._eval_dataloaders[dataloader_key]
  1058. eval_dataset = (
  1059. self.eval_dataset[eval_dataset]
  1060. if isinstance(eval_dataset, str)
  1061. else eval_dataset
  1062. if eval_dataset is not None
  1063. else self.eval_dataset
  1064. )
  1065. return self._get_dataloader(
  1066. dataset=eval_dataset,
  1067. description="Evaluation",
  1068. batch_size=self.args.eval_batch_size,
  1069. sampler_fn=self._get_eval_sampler,
  1070. dataloader_key=dataloader_key,
  1071. )
  1072. def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
  1073. """
  1074. Returns the test [`~torch.utils.data.DataLoader`].
  1075. Subclass and override this method if you want to inject some custom behavior.
  1076. Args:
  1077. test_dataset (`torch.utils.data.Dataset`, *optional*):
  1078. The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
  1079. `model.forward()` method are automatically removed. It must implement `__len__`.
  1080. """
  1081. return self._get_dataloader(
  1082. dataset=test_dataset,
  1083. description="test",
  1084. batch_size=self.args.eval_batch_size,
  1085. sampler_fn=self._get_eval_sampler,
  1086. )
  1087. def create_optimizer_and_scheduler(self, num_training_steps: int):
  1088. """
  1089. Setup the optimizer and the learning rate scheduler.
  1090. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
  1091. Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
  1092. `create_scheduler`) in a subclass.
  1093. """
  1094. self.create_optimizer()
  1095. if IS_SAGEMAKER_MP_POST_1_10 and smp.state.cfg.fp16:
  1096. # If smp >= 1.10 and fp16 is enabled, we unwrap the optimizer
  1097. optimizer = self.optimizer.optimizer
  1098. else:
  1099. optimizer = self.optimizer
  1100. self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
  1101. def get_decay_parameter_names(self, model) -> list[str]:
  1102. """
  1103. Get all parameter names that weight decay will be applied to.
  1104. This function filters out parameters in two ways:
  1105. 1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
  1106. 2. By parameter name patterns (containing 'bias', or variation of 'norm')
  1107. """
  1108. forbidden_name_patterns = [r"bias", r"layernorm", r"rmsnorm", r"(?:^|\.)norm(?:$|\.)", r"_norm(?:$|\.)"]
  1109. decay_parameters = get_parameter_names(model, [nn.LayerNorm], forbidden_name_patterns)
  1110. return decay_parameters
  1111. def create_optimizer(self):
  1112. """
  1113. Setup the optimizer.
  1114. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
  1115. Trainer's init through `optimizers`, or subclass and override this method in a subclass.
  1116. """
  1117. opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
  1118. if self.optimizer is None:
  1119. decay_parameters = self.get_decay_parameter_names(opt_model)
  1120. optimizer_grouped_parameters = [
  1121. {
  1122. "params": [
  1123. p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
  1124. ],
  1125. "weight_decay": self.args.weight_decay,
  1126. },
  1127. {
  1128. "params": [
  1129. p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
  1130. ],
  1131. "weight_decay": 0.0,
  1132. },
  1133. ]
  1134. if self.optimizer_cls_and_kwargs is not None:
  1135. optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
  1136. else:
  1137. optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model)
  1138. # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs`
  1139. # e.g. for GaLore optimizer.
  1140. if "params" in optimizer_kwargs:
  1141. optimizer_grouped_parameters = optimizer_kwargs.pop("params")
  1142. # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs`
  1143. # e.g. for LOMO optimizer.
  1144. if "model" in optimizer_kwargs:
  1145. optimizer_grouped_parameters = optimizer_kwargs.pop("model")
  1146. # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict`
  1147. # to avoid arguments conflicts.
  1148. if "optimizer_dict" in optimizer_kwargs:
  1149. optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict")
  1150. self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
  1151. if "bitsandbytes" in str(optimizer_cls) and optimizer_kwargs.get("optim_bits", None) == 8:
  1152. import bitsandbytes
  1153. manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
  1154. skipped = 0
  1155. for module in opt_model.modules():
  1156. if isinstance(module, nn.Embedding):
  1157. skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
  1158. logger.info(f"skipped {module}: {skipped / 2**20}M params")
  1159. manager.register_module_override(module, "weight", {"optim_bits": 32})
  1160. logger.debug(f"bitsandbytes: will optimize {module} in fp32")
  1161. logger.info(f"skipped: {skipped / 2**20}M params")
  1162. if is_sagemaker_mp_enabled():
  1163. self.optimizer = smp.DistributedOptimizer(self.optimizer)
  1164. return self.optimizer
  1165. def get_num_trainable_parameters(self):
  1166. """
  1167. Get the number of trainable parameters.
  1168. """
  1169. return sum(p.numel() for p in self.model.parameters() if p.requires_grad)
  1170. def get_learning_rates(self):
  1171. """
  1172. Returns the learning rate of each parameter from self.optimizer.
  1173. """
  1174. if self.optimizer is None:
  1175. raise ValueError("Trainer optimizer is None, please make sure you have setup the optimizer before.")
  1176. return [group["lr"] for group in self.optimizer.param_groups]
  1177. def get_optimizer_group(self, param: Optional[Union[str, torch.nn.parameter.Parameter]] = None):
  1178. """
  1179. Returns optimizer group for a parameter if given, else returns all optimizer groups for params.
  1180. Args:
  1181. param (`str` or `torch.nn.parameter.Parameter`, *optional*):
  1182. The parameter for which optimizer group needs to be returned.
  1183. """
  1184. if self.optimizer is None:
  1185. raise ValueError("Trainer optimizer is None, please make sure you have setup the optimizer before.")
  1186. if param is not None:
  1187. for group in self.optimizer.param_groups:
  1188. if param in group["params"]:
  1189. return group
  1190. return [group["params"] for group in self.optimizer.param_groups]
  1191. @staticmethod
  1192. def get_optimizer_cls_and_kwargs(
  1193. args: TrainingArguments, model: Optional[PreTrainedModel] = None
  1194. ) -> tuple[Any, Any]:
  1195. """
  1196. Returns the optimizer class and optimizer parameters based on the training arguments.
  1197. Args:
  1198. args (`transformers.training_args.TrainingArguments`):
  1199. The training arguments for the training session.
  1200. """
  1201. # parse args.optim_args
  1202. optim_args = {}
  1203. if args.optim_args:
  1204. for mapping in args.optim_args.replace(" ", "").split(","):
  1205. key, value = mapping.split("=")
  1206. optim_args[key] = value
  1207. optimizer_kwargs = {"lr": args.learning_rate}
  1208. adam_kwargs = {
  1209. "betas": (args.adam_beta1, args.adam_beta2),
  1210. "eps": args.adam_epsilon,
  1211. }
  1212. def setup_low_rank_optimizer(
  1213. optimizer_name: str,
  1214. optimizer_mapping: dict[str, Any],
  1215. optim_kwargs: dict[str, Any],
  1216. is_layerwise_supported: bool = True,
  1217. ) -> tuple[Any, Any]:
  1218. """
  1219. Helper function to set up low-rank optimizers like GaLore and Apollo.
  1220. Args:
  1221. optimizer_name (str): Name of the optimizer.
  1222. optimizer_mapping (dict): Mapping of optimizer names to their classes.
  1223. optim_kwargs (dict): Keyword arguments for the optimizer.
  1224. is_layerwise_supported (bool): Whether layerwise optimization is supported.
  1225. Returns:
  1226. tuple[Any, Any]: Optimizer class and updated optimizer kwargs.
  1227. """
  1228. is_layerwise = optimizer_name.lower().endswith("layerwise")
  1229. if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED and is_layerwise_supported:
  1230. raise NotImplementedError(f"Layer-wise {optimizer_name} does not support DDP at this time")
  1231. optimizer_cls = optimizer_mapping[optimizer_name]
  1232. if args.optim_target_modules is None:
  1233. raise ValueError(f"You need to define `optim_target_modules` to use {optimizer_name} optimizers")
  1234. if not isinstance(args.optim_target_modules, (list, str)):
  1235. raise TypeError(
  1236. f"`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: {args.optim_target_modules}"
  1237. )
  1238. if model is None:
  1239. raise ValueError(f"You need to pass a model to initialize {optimizer_name} optimizer.")
  1240. all_linear = (
  1241. isinstance(args.optim_target_modules, str)
  1242. and args.optim_target_modules.replace("_", "-") == "all-linear"
  1243. )
  1244. target_params_names = []
  1245. for module_name, module in model.named_modules():
  1246. target_module_exists, is_regex = check_target_module_exists(
  1247. args.optim_target_modules, module_name, return_is_regex=True
  1248. )
  1249. if not isinstance(module, nn.Linear):
  1250. if target_module_exists and not is_regex:
  1251. logger.warning(
  1252. f"{module_name} matched but ignored. {optimizer_name} only supports linear layers."
  1253. )
  1254. continue
  1255. if not target_module_exists and not all_linear:
  1256. continue
  1257. target_params_names.append(module_name + ".weight")
  1258. if len(target_params_names) == 0:
  1259. raise ValueError(f"No target modules found for {optimizer_name} ({args.optim_target_modules}).")
  1260. target_params = [p for n, p in model.named_parameters() if n in target_params_names]
  1261. non_target_params = [p for n, p in model.named_parameters() if n not in target_params_names]
  1262. optim_kwargs.update(optim_args)
  1263. param_groups = [
  1264. {"params": non_target_params},
  1265. {"params": target_params, **optim_kwargs},
  1266. ]
  1267. if is_layerwise:
  1268. if args.gradient_accumulation_steps != 1:
  1269. raise ValueError(f"Layerwise {optimizer_name} does not support gradient accumulation!")
  1270. optimizer_dict = {}
  1271. for param in non_target_params:
  1272. optimizer_dict[param] = optimizer_cls([{"params": [param]}], **optimizer_kwargs)
  1273. for param in target_params:
  1274. optimizer_dict[param] = optimizer_cls([{"params": [param], **optim_kwargs}], **optimizer_kwargs)
  1275. def optimizer_hook(param):
  1276. if param.grad is not None:
  1277. optimizer_dict[param].step()
  1278. optimizer_dict[param].zero_grad()
  1279. for param in model.parameters():
  1280. if param.requires_grad:
  1281. param.register_post_accumulate_grad_hook(optimizer_hook)
  1282. optimizer_cls = LayerWiseDummyOptimizer
  1283. optimizer_kwargs.update({"optimizer_dict": optimizer_dict})
  1284. optimizer_kwargs.update({"params": param_groups})
  1285. return optimizer_cls, optimizer_kwargs
  1286. if args.optim == OptimizerNames.ADAFACTOR:
  1287. optimizer_cls = Adafactor
  1288. optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})
  1289. elif args.optim in [OptimizerNames.ADAMW_TORCH, OptimizerNames.ADAMW_TORCH_FUSED]:
  1290. from torch.optim import AdamW
  1291. optimizer_cls = AdamW
  1292. optimizer_kwargs.update(adam_kwargs)
  1293. if args.optim == OptimizerNames.ADAMW_TORCH_FUSED:
  1294. optimizer_kwargs.update({"fused": True})
  1295. elif args.optim == OptimizerNames.ADAMW_TORCH_XLA:
  1296. try:
  1297. from torch_xla.amp.syncfree import AdamW
  1298. optimizer_cls = AdamW
  1299. optimizer_kwargs.update(adam_kwargs)
  1300. except ImportError:
  1301. raise ValueError("Trainer failed to import syncfree AdamW from torch_xla.")
  1302. elif args.optim == OptimizerNames.ADAMW_TORCH_NPU_FUSED:
  1303. try:
  1304. from torch_npu.optim import NpuFusedAdamW
  1305. optimizer_cls = NpuFusedAdamW
  1306. optimizer_kwargs.update(adam_kwargs)
  1307. except ImportError:
  1308. raise ValueError("Trainer failed to import FusedAdamW from torch_npu.")
  1309. elif args.optim == OptimizerNames.ADAMW_APEX_FUSED:
  1310. try:
  1311. from apex.optimizers import FusedAdam
  1312. optimizer_cls = FusedAdam
  1313. optimizer_kwargs.update(adam_kwargs)
  1314. except ImportError:
  1315. raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!")
  1316. elif args.optim in [
  1317. OptimizerNames.ADAMW_BNB,
  1318. OptimizerNames.ADAMW_8BIT,
  1319. OptimizerNames.PAGED_ADAMW,
  1320. OptimizerNames.PAGED_ADAMW_8BIT,
  1321. OptimizerNames.ADEMAMIX,
  1322. OptimizerNames.ADEMAMIX_8BIT,
  1323. OptimizerNames.PAGED_ADEMAMIX,
  1324. OptimizerNames.PAGED_ADEMAMIX_8BIT,
  1325. OptimizerNames.LION,
  1326. OptimizerNames.LION_8BIT,
  1327. OptimizerNames.PAGED_LION,
  1328. OptimizerNames.PAGED_LION_8BIT,
  1329. OptimizerNames.RMSPROP_BNB,
  1330. OptimizerNames.RMSPROP_8BIT,
  1331. OptimizerNames.RMSPROP_32BIT,
  1332. ]:
  1333. try:
  1334. from bitsandbytes.optim import AdamW, Lion, RMSprop
  1335. is_paged = False
  1336. optim_bits = 32
  1337. optimizer_cls = None
  1338. additional_optim_kwargs = adam_kwargs
  1339. if "paged" in args.optim:
  1340. is_paged = True
  1341. if "8bit" in args.optim:
  1342. optim_bits = 8
  1343. if "adam" in args.optim:
  1344. optimizer_cls = AdamW
  1345. elif "lion" in args.optim:
  1346. optimizer_cls = Lion
  1347. additional_optim_kwargs = {"betas": (args.adam_beta1, args.adam_beta2)}
  1348. elif "rmsprop" in args.optim:
  1349. optimizer_cls = RMSprop
  1350. # Above we pass all `adam_kwargs` to the optimizer, here
  1351. # we only pass `optim_args` which can be passed by the user.
  1352. additional_optim_kwargs = optim_args
  1353. elif "ademamix" in args.optim:
  1354. if is_bitsandbytes_available() and version.parse(
  1355. importlib.metadata.version("bitsandbytes")
  1356. ) < version.parse("0.44.0"):
  1357. raise ValueError(
  1358. "The AdEMAMix optimizer is not supported by your current version of `bitsandbytes`. "
  1359. "Please install `bitsandbytes` >= 0.44.0."
  1360. )
  1361. from bitsandbytes.optim import AdEMAMix
  1362. optimizer_cls = AdEMAMix
  1363. additional_optim_kwargs = {
  1364. "betas": (
  1365. float(optim_args.get("beta1", args.adam_beta1)),
  1366. float(optim_args.get("beta2", args.adam_beta2)),
  1367. float(optim_args.get("beta3", 0.9999)),
  1368. ),
  1369. "alpha": float(optim_args.get("alpha", 5.0)),
  1370. "eps": float(optim_args.get("eps", args.adam_epsilon)),
  1371. }
  1372. if "t_alpha" in optim_args:
  1373. additional_optim_kwargs["t_alpha"] = int(optim_args["t_alpha"])
  1374. if "t_beta3" in optim_args:
  1375. additional_optim_kwargs["t_beta3"] = int(optim_args["t_beta3"])
  1376. bnb_kwargs = {"optim_bits": optim_bits}
  1377. if "rmsprop" not in args.optim:
  1378. bnb_kwargs["is_paged"] = is_paged
  1379. optimizer_kwargs.update(additional_optim_kwargs)
  1380. optimizer_kwargs.update(bnb_kwargs)
  1381. except ImportError:
  1382. raise ValueError("Trainer tried to instantiate bnb optimizer but `bitsandbytes` is not installed!")
  1383. if is_bitsandbytes_available() and version.parse(
  1384. importlib.metadata.version("bitsandbytes")
  1385. ) < version.parse("0.41.1"):
  1386. logger.warning(
  1387. "You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. "
  1388. "It is recommended to update your version as a major bug has been fixed in 8-bit optimizers."
  1389. )
  1390. elif args.optim == OptimizerNames.ADAMW_ANYPRECISION:
  1391. try:
  1392. from torchdistx.optimizers import AnyPrecisionAdamW
  1393. optimizer_cls = AnyPrecisionAdamW
  1394. optimizer_kwargs.update(adam_kwargs)
  1395. # TODO Change dtypes back to M=FP32, Var = BF16, Kahan = False once they can be cast together in torchdistx.
  1396. optimizer_kwargs.update(
  1397. {
  1398. "use_kahan_summation": strtobool(optim_args.get("use_kahan_summation", "False")),
  1399. "momentum_dtype": getattr(torch, optim_args.get("momentum_dtype", "float32")),
  1400. "variance_dtype": getattr(torch, optim_args.get("variance_dtype", "float32")),
  1401. "compensation_buffer_dtype": getattr(
  1402. torch, optim_args.get("compensation_buffer_dtype", "bfloat16")
  1403. ),
  1404. }
  1405. )
  1406. except ImportError:
  1407. raise ValueError("Please install https://github.com/pytorch/torchdistx")
  1408. elif args.optim == OptimizerNames.SGD:
  1409. optimizer_cls = torch.optim.SGD
  1410. elif args.optim == OptimizerNames.ADAGRAD:
  1411. optimizer_cls = torch.optim.Adagrad
  1412. elif args.optim == OptimizerNames.RMSPROP:
  1413. optimizer_cls = torch.optim.RMSprop
  1414. elif args.optim in [
  1415. OptimizerNames.GALORE_ADAMW,
  1416. OptimizerNames.GALORE_ADAMW_8BIT,
  1417. OptimizerNames.GALORE_ADAFACTOR,
  1418. OptimizerNames.GALORE_ADAMW_LAYERWISE,
  1419. OptimizerNames.GALORE_ADAMW_8BIT_LAYERWISE,
  1420. OptimizerNames.GALORE_ADAFACTOR_LAYERWISE,
  1421. ]:
  1422. if not is_galore_torch_available():
  1423. raise ImportError(
  1424. "You need to install `galore_torch` in order to use GaLore optimizers"
  1425. " install it with `pip install git+https://github.com/jiaweizzhao/GaLore`"
  1426. )
  1427. from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit
  1428. optimizer_mapping = {
  1429. OptimizerNames.GALORE_ADAMW: GaLoreAdamW,
  1430. OptimizerNames.GALORE_ADAMW_8BIT: GaLoreAdamW8bit,
  1431. OptimizerNames.GALORE_ADAFACTOR: GaLoreAdafactor,
  1432. OptimizerNames.GALORE_ADAMW_LAYERWISE: GaLoreAdamW,
  1433. OptimizerNames.GALORE_ADAMW_8BIT_LAYERWISE: GaLoreAdamW8bit,
  1434. OptimizerNames.GALORE_ADAFACTOR_LAYERWISE: GaLoreAdafactor,
  1435. }
  1436. galore_optim_kwargs = {
  1437. "rank": int(optim_args.pop("rank", 128)),
  1438. "update_proj_gap": int(optim_args.pop("update_proj_gap", 200)),
  1439. "scale": float(optim_args.pop("scale", 0.25)),
  1440. "proj_type": optim_args.pop("proj_type", "std"),
  1441. }
  1442. optimizer_cls, optimizer_kwargs = setup_low_rank_optimizer(
  1443. args.optim, optimizer_mapping, galore_optim_kwargs
  1444. )
  1445. if args.optim == OptimizerNames.GALORE_ADAFACTOR:
  1446. optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})
  1447. elif args.optim in [
  1448. OptimizerNames.APOLLO_ADAMW,
  1449. OptimizerNames.APOLLO_ADAMW_LAYERWISE,
  1450. ]:
  1451. if not is_apollo_torch_available():
  1452. raise ImportError(
  1453. "You need to install `apollo_torch` in order to use APOLLO optimizers"
  1454. " install it with `pip install git+https://github.com/zhuhanqing/APOLLO`"
  1455. )
  1456. from apollo_torch import APOLLOAdamW
  1457. optimizer_mapping = {
  1458. OptimizerNames.APOLLO_ADAMW: APOLLOAdamW,
  1459. OptimizerNames.APOLLO_ADAMW_LAYERWISE: APOLLOAdamW,
  1460. }
  1461. apollo_optim_kwargs = {
  1462. "rank": int(optim_args.pop("rank", 128)),
  1463. "proj": optim_args.pop("proj", "random"),
  1464. "scale_type": optim_args.pop("scale_type", "channel"),
  1465. "update_proj_gap": int(optim_args.pop("update_proj_gap", 200)),
  1466. "scale": float(optim_args.pop("scale", 1.0)),
  1467. "proj_type": optim_args.pop("proj_type", "std"),
  1468. }
  1469. apollo_optim_kwargs.update(adam_kwargs)
  1470. optimizer_cls, optimizer_kwargs = setup_low_rank_optimizer(
  1471. args.optim, optimizer_mapping, apollo_optim_kwargs
  1472. )
  1473. elif args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
  1474. if not is_lomo_available():
  1475. raise ImportError(
  1476. "You need to install `lomo_optim` in order to use LOMO optimizers"
  1477. " install it with `pip install lomo-optim`"
  1478. )
  1479. if not is_accelerate_available("0.30.0"):
  1480. raise ImportError("You need to have `accelerate>=0.30.0` to be able to use LOMO optimizers")
  1481. if model is None:
  1482. raise ValueError("You need to pass a `model` in order to correctly initialize a LOMO optimizer.")
  1483. from lomo_optim import AdaLomo, Lomo
  1484. if "ada" in args.optim:
  1485. optimizer_cls = AdaLomo
  1486. else:
  1487. optimizer_cls = Lomo
  1488. optimizer_kwargs.update({"model": model})
  1489. elif args.optim == OptimizerNames.GROKADAMW:
  1490. if not is_grokadamw_available():
  1491. raise ValueError("Please install grokadamw with `pip install grokadamw`")
  1492. from grokadamw import GrokAdamW
  1493. optimizer_cls = GrokAdamW
  1494. optimizer_kwargs.update(
  1495. {
  1496. "alpha_init": float(optim_args.get("alpha_init", 0.98)),
  1497. "lamb": float(optim_args.get("lamb", 2.0)),
  1498. "gamma": float(optim_args.get("gamma", 0.1)),
  1499. "grokking_signal_decay_rate": float(optim_args.get("grokking_signal_decay_rate", 0.1)),
  1500. "gradient_clipping": float(optim_args.get("gradient_clipping", 1.0)),
  1501. }
  1502. )
  1503. elif args.optim in [
  1504. OptimizerNames.ADAMW_TORCH_4BIT,
  1505. OptimizerNames.ADAMW_TORCH_8BIT,
  1506. ]:
  1507. if not is_torchao_available() or version.parse(importlib.metadata.version("torchao")) < version.parse(
  1508. "0.4.0"
  1509. ):
  1510. raise ImportError(
  1511. "You need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers."
  1512. "Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/ao"
  1513. )
  1514. if version.parse(importlib.metadata.version("torch")) <= version.parse("2.4"):
  1515. raise ImportError(
  1516. "You need to have `torch>2.4` in order to use torch 4-bit optimizers. "
  1517. "Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly."
  1518. )
  1519. if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.11.0"):
  1520. # https://github.com/pytorch/ao/pull/2159
  1521. from torchao.optim import AdamW4bit, AdamW8bit
  1522. else:
  1523. from torchao.prototype.low_bit_optim import AdamW4bit, AdamW8bit
  1524. if args.optim == OptimizerNames.ADAMW_TORCH_4BIT:
  1525. optimizer_cls = AdamW4bit
  1526. elif args.optim == OptimizerNames.ADAMW_TORCH_8BIT:
  1527. optimizer_cls = AdamW8bit
  1528. else:
  1529. raise ValueError("Invalid optimizer")
  1530. optimizer_kwargs.update(adam_kwargs)
  1531. elif args.optim in [
  1532. OptimizerNames.SCHEDULE_FREE_RADAM,
  1533. OptimizerNames.SCHEDULE_FREE_ADAMW,
  1534. OptimizerNames.SCHEDULE_FREE_SGD,
  1535. ]:
  1536. if not is_schedulefree_available():
  1537. raise ImportError(
  1538. "You need to install `schedulefree` in order to use schedulefree optimizers. "
  1539. "Install it with `pip install schedulefree.`"
  1540. )
  1541. if not is_accelerate_available("0.30.0"):
  1542. raise ImportError("You need to have `accelerate>=0.30.0` to be able to use schedulefree optimizers")
  1543. from schedulefree import AdamWScheduleFree, SGDScheduleFree
  1544. additional_optim_kwargs = {}
  1545. require_warmup = True
  1546. if args.optim == OptimizerNames.SCHEDULE_FREE_RADAM:
  1547. if not is_schedulefree_available("1.4.0"):
  1548. raise ImportError(
  1549. "You need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. "
  1550. "Install it with `pip install schedulefree.`"
  1551. )
  1552. from schedulefree import RAdamScheduleFree
  1553. optimizer_cls = RAdamScheduleFree
  1554. additional_optim_kwargs = adam_kwargs
  1555. require_warmup = False
  1556. elif args.optim == OptimizerNames.SCHEDULE_FREE_ADAMW:
  1557. optimizer_cls = AdamWScheduleFree
  1558. additional_optim_kwargs = adam_kwargs
  1559. elif args.optim == OptimizerNames.SCHEDULE_FREE_SGD:
  1560. optimizer_cls = SGDScheduleFree
  1561. else:
  1562. raise ValueError("Invalid schedulefree optimizer")
  1563. additional_optim_kwargs["weight_decay"] = args.weight_decay
  1564. if require_warmup:
  1565. additional_optim_kwargs["warmup_steps"] = args.warmup_steps
  1566. additional_optim_kwargs.update(
  1567. {
  1568. "weight_lr_power": float(optim_args.get("weight_lr_power", 2.0)),
  1569. "r": float(optim_args.get("r", 0.0)),
  1570. }
  1571. )
  1572. optimizer_kwargs.update(additional_optim_kwargs)
  1573. elif args.optim == OptimizerNames.STABLE_ADAMW:
  1574. if not is_torch_optimi_available():
  1575. raise ImportError(
  1576. "You need to install `torch-optimi` in order to use stable_adamw optimizers. "
  1577. "Install it with `pip install torch-optimi`."
  1578. )
  1579. from optimi import StableAdamW
  1580. max_lr = optim_args.pop("max_lr", None)
  1581. if max_lr is not None:
  1582. max_lr = float(max_lr)
  1583. kahan_sum = optim_args.pop("kahan_sum", None)
  1584. if kahan_sum is not None:
  1585. kahan_sum = bool(kahan_sum)
  1586. adam_kwargs["weight_decay"] = args.weight_decay
  1587. stable_adamw_kwargs = {
  1588. "decouple_lr": bool(optim_args.pop("decouple_lr", False)),
  1589. "max_lr": max_lr,
  1590. "kahan_sum": kahan_sum,
  1591. }
  1592. optimizer_cls = StableAdamW
  1593. optimizer_kwargs.update(adam_kwargs)
  1594. optimizer_kwargs.update(stable_adamw_kwargs)
  1595. else:
  1596. raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}")
  1597. return optimizer_cls, optimizer_kwargs
  1598. def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
  1599. """
  1600. Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
  1601. passed as an argument.
  1602. Args:
  1603. num_training_steps (int): The number of training steps to do.
  1604. """
  1605. if self.lr_scheduler is None:
  1606. self.lr_scheduler = get_scheduler(
  1607. self.args.lr_scheduler_type,
  1608. optimizer=self.optimizer if optimizer is None else optimizer,
  1609. num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
  1610. num_training_steps=num_training_steps,
  1611. scheduler_specific_kwargs=self.args.lr_scheduler_kwargs,
  1612. )
  1613. self._created_lr_scheduler = True
  1614. return self.lr_scheduler
  1615. def num_examples(self, dataloader: DataLoader) -> int:
  1616. """
  1617. Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. When
  1618. dataloader.dataset does not exist or has no length, estimates as best it can
  1619. """
  1620. try:
  1621. dataset = dataloader.dataset
  1622. # Special case for IterableDatasetShard, we need to dig deeper
  1623. if isinstance(dataset, IterableDatasetShard):
  1624. return len(dataloader.dataset.dataset)
  1625. return len(dataloader.dataset)
  1626. except (NameError, AttributeError, TypeError): # no dataset or length, estimate by length of dataloader
  1627. return len(dataloader) * self.args.per_device_train_batch_size
  1628. @staticmethod
  1629. def num_tokens(train_dl: DataLoader, max_steps: Optional[int] = None) -> int:
  1630. """
  1631. Helper to get number of tokens in a [`~torch.utils.data.DataLoader`] by enumerating dataloader.
  1632. """
  1633. train_tokens = 0
  1634. try:
  1635. for batch in train_dl:
  1636. tokens = batch["input_ids"].numel()
  1637. if max_steps is not None:
  1638. return tokens * max_steps
  1639. train_tokens += tokens
  1640. except KeyError:
  1641. logger.warning("Cannot get num_tokens from dataloader")
  1642. return train_tokens
  1643. def _hp_search_setup(self, trial: Union["optuna.Trial", dict[str, Any]]):
  1644. """HP search setup code"""
  1645. self._trial = trial
  1646. if self.hp_search_backend is None or trial is None:
  1647. return
  1648. if self.hp_search_backend == HPSearchBackend.OPTUNA:
  1649. params = self.hp_space(trial)
  1650. elif self.hp_search_backend == HPSearchBackend.RAY:
  1651. params = trial
  1652. params.pop("wandb", None)
  1653. elif self.hp_search_backend == HPSearchBackend.SIGOPT:
  1654. params = {k: int(v) if isinstance(v, str) else v for k, v in trial.assignments.items()}
  1655. elif self.hp_search_backend == HPSearchBackend.WANDB:
  1656. params = trial
  1657. for key, value in params.items():
  1658. if not hasattr(self.args, key):
  1659. logger.warning(
  1660. f"Trying to set {key} in the hyperparameter search but there is no corresponding field in"
  1661. " `TrainingArguments`."
  1662. )
  1663. continue
  1664. old_attr = getattr(self.args, key, None)
  1665. # Casting value to the proper type
  1666. if old_attr is not None:
  1667. value = type(old_attr)(value)
  1668. setattr(self.args, key, value)
  1669. if self.hp_search_backend == HPSearchBackend.OPTUNA:
  1670. logger.info(f"Trial: {trial.params}")
  1671. if self.hp_search_backend == HPSearchBackend.SIGOPT:
  1672. logger.info(f"SigOpt Assignments: {trial.assignments}")
  1673. if self.hp_search_backend == HPSearchBackend.WANDB:
  1674. logger.info(f"W&B Sweep parameters: {trial}")
  1675. if self.is_deepspeed_enabled:
  1676. if self.args.deepspeed is None:
  1677. raise ValueError("For sweeps with deepspeed, `args.deepspeed` must be set")
  1678. self.accelerator.free_memory()
  1679. # Rebuild the deepspeed config to reflect the updated training parameters
  1680. from accelerate.utils import DeepSpeedPlugin
  1681. from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig
  1682. self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed)
  1683. self.args.hf_deepspeed_config.trainer_config_process(self.args)
  1684. self.args.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.args.hf_deepspeed_config)
  1685. # From 1.0 on, we need to fully wipe the DS plugin when doing sweeps.
  1686. # Simply calling `_reset_state` is enough and doesn't need a version pin.
  1687. AcceleratorState()._reset_state()
  1688. self.create_accelerator_and_postprocess()
  1689. def _report_to_hp_search(self, trial: Union["optuna.Trial", dict[str, Any]], step: int, metrics: dict[str, float]):
  1690. if self.hp_search_backend is None or trial is None:
  1691. return
  1692. metrics = metrics.copy()
  1693. self.objective = self.compute_objective(metrics)
  1694. if self.hp_search_backend == HPSearchBackend.OPTUNA:
  1695. import optuna
  1696. if hasattr(trial, "study") and not trial.study._is_multi_objective():
  1697. trial.report(self.objective, step)
  1698. if trial.should_prune():
  1699. self.callback_handler.on_train_end(self.args, self.state, self.control)
  1700. raise optuna.TrialPruned()
  1701. elif self.hp_search_backend == HPSearchBackend.RAY:
  1702. import ray.train
  1703. with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
  1704. checkpoint = None
  1705. if self.control.should_save:
  1706. self._tune_save_checkpoint(checkpoint_dir=temp_checkpoint_dir)
  1707. checkpoint = ray.train.Checkpoint.from_directory(temp_checkpoint_dir)
  1708. metrics["objective"] = self.objective
  1709. ray.train.report(metrics, checkpoint=checkpoint)
  1710. def _tune_save_checkpoint(self, checkpoint_dir: str):
  1711. output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
  1712. self.save_model(output_dir, _internal_call=True)
  1713. if self.args.should_save:
  1714. # Update the `TrainerControl` state to where we are currently
  1715. self.state.stateful_callbacks["TrainerControl"] = self.control.state()
  1716. self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
  1717. torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
  1718. torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
  1719. def call_model_init(self, trial=None):
  1720. model_init_argcount = number_of_arguments(self.model_init)
  1721. if model_init_argcount == 0:
  1722. model = self.model_init()
  1723. elif model_init_argcount == 1:
  1724. model = self.model_init(trial)
  1725. else:
  1726. raise RuntimeError("model_init should have 0 or 1 argument.")
  1727. if model is None:
  1728. raise RuntimeError("model_init should not return None.")
  1729. return model
  1730. def torch_jit_model_eval(self, model, dataloader, training=False):
  1731. if not training:
  1732. if dataloader is None:
  1733. logger.warning("failed to use PyTorch jit mode due to current dataloader is none.")
  1734. return model
  1735. example_batch = next(iter(dataloader))
  1736. example_batch = self._prepare_inputs(example_batch)
  1737. try:
  1738. jit_model = copy.copy(model)
  1739. jit_model.eval()
  1740. original_forward = jit_model.__dict__.pop("_original_forward", None)
  1741. # remove mixed precision hooks from the model
  1742. if original_forward:
  1743. jit_model.forward = original_forward
  1744. autocast_handler = AutocastKwargs(cache_enabled=False)
  1745. with self.accelerator.autocast(autocast_handler=autocast_handler), torch.no_grad():
  1746. if isinstance(example_batch, dict):
  1747. jit_model = torch.jit.trace(jit_model, example_kwarg_inputs=example_batch, strict=False)
  1748. else:
  1749. jit_model = torch.jit.trace(
  1750. jit_model,
  1751. example_kwarg_inputs={key: example_batch[key] for key in example_batch},
  1752. strict=False,
  1753. )
  1754. jit_model = torch.jit.freeze(jit_model)
  1755. with torch.no_grad():
  1756. jit_model(**example_batch)
  1757. jit_model(**example_batch)
  1758. model = jit_model
  1759. self.use_cpu_amp = False
  1760. except (RuntimeError, TypeError, ValueError, NameError, IndexError) as e:
  1761. logger.warning(f"failed to use PyTorch jit mode due to: {e}.")
  1762. return model
  1763. def compare_trainer_and_checkpoint_args(self, training_args, trainer_state):
  1764. attributes_map = {
  1765. "logging_steps": "logging_steps",
  1766. "eval_steps": "eval_steps",
  1767. "save_steps": "save_steps",
  1768. }
  1769. has_warning = False
  1770. warning_str = "Warning: The following arguments do not match the ones in the `trainer_state.json` within the checkpoint directory: "
  1771. for arg_attr, state_attr in attributes_map.items():
  1772. arg_value = getattr(training_args, arg_attr, None)
  1773. state_value = getattr(trainer_state, state_attr, None)
  1774. if arg_value is not None and state_value is not None and arg_value != state_value:
  1775. warning_str += f"\n\t{arg_attr}: {arg_value} (from args) != {state_value} (from trainer_state.json)"
  1776. has_warning = True
  1777. # train bs is special as we need to account for multi-GPU
  1778. train_bs_args = training_args.per_device_train_batch_size
  1779. train_bs_state = trainer_state.train_batch_size // max(1, training_args.n_gpu)
  1780. if train_bs_args != train_bs_state:
  1781. warning_str += f"\n\tper_device_train_batch_size: {train_bs_args} (from args) != {train_bs_state} (from trainer_state.json)"
  1782. has_warning = True
  1783. if has_warning:
  1784. logger.warning_once(warning_str)
def _wrap_model(self, model, training=True, dataloader=None):
    """
    Wrap `model` for the active parallelism / mixed-precision setup and return the result.

    Args:
        model: The model to wrap.
        training (`bool`, *optional*, defaults to `True`):
            When `False`, apex initialization is skipped and the method returns before any of the
            distributed-training wrapping.
        dataloader (*optional*):
            Forwarded to `torch_jit_model_eval` when `args.jit_mode_eval` is set.

    Returns:
        The wrapped model, or `model` itself when no wrapping applies or it is already wrapped.
    """
    if is_sagemaker_mp_enabled():
        # Wrapping the base model twice in a DistributedModel will raise an error.
        if isinstance(self.model_wrapped, smp.model.DistributedModel):
            return self.model_wrapped
        return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)

    # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
    if self.accelerator.unwrap_model(model, keep_torch_compile=False) is not model:
        return model

    # Mixed precision training with apex
    if self.use_apex and training:
        from apex import amp

        model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

    # Multi-gpu training (should be after apex fp16 initialization) / 8bit models does not support DDP
    if self.args.n_gpu > 1 and not getattr(model, "is_loaded_in_8bit", False):
        model = nn.DataParallel(model)

    if self.args.jit_mode_eval:
        # Time the jit attempt; the duration (rounded to 4 decimals) is kept on the trainer.
        start_time = time.time()
        model = self.torch_jit_model_eval(model, dataloader, training)
        self.jit_compilation_time = round(time.time() - start_time, 4)

    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.
    if not training:
        return model

    # Distributed training (should be after apex fp16 initialization)
    # Distributed training using PyTorch FSDP
    if self.is_fsdp_xla_enabled:
        try:
            from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
            from torch_xla.distributed.fsdp import checkpoint_module
            from torch_xla.distributed.fsdp.wrap import (
                size_based_auto_wrap_policy,
                transformer_auto_wrap_policy,
            )

            if self.is_fsdp_xla_v2_enabled:
                from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
                    SpmdFullyShardedDataParallel as FSDPv2,
                )
        except ImportError:
            # NOTE(review): re-raised without `from`, so the original error is only kept as implicit context.
            raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")
        auto_wrap_policy = None
        auto_wrapper_callable = None
        # The model's `_no_split_modules` is the default when the FSDP config names no layer classes.
        default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None)
        fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get(
            "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
        )

        # A positive `min_num_params` selects the size-based policy and takes precedence over class names.
        if self.args.fsdp_config["min_num_params"] > 0:
            auto_wrap_policy = functools.partial(
                size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["min_num_params"]
            )
        elif fsdp_transformer_layer_cls_to_wrap is not None:
            transformer_cls_to_wrap = set()
            for layer_class in fsdp_transformer_layer_cls_to_wrap:
                transformer_cls = get_module_class_from_name(model, layer_class)
                if transformer_cls is None:
                    raise Exception("Could not find the transformer layer class to wrap in the model.")
                else:
                    transformer_cls_to_wrap.add(transformer_cls)

            auto_wrap_policy = functools.partial(
                transformer_auto_wrap_policy,
                # Transformer layer class to wrap
                transformer_layer_cls=transformer_cls_to_wrap,
            )
        fsdp_kwargs = self.args.xla_fsdp_config
        if self.args.fsdp_config["xla_fsdp_grad_ckpt"]:
            if model.config.use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
                )
                model.config.use_cache = False

            # Apply gradient checkpointing to auto-wrapped sub-modules if specified
            def auto_wrapper_callable(m, *args, **kwargs):
                target_cls = FSDP if not self.is_fsdp_xla_v2_enabled else FSDPv2
                return target_cls(checkpoint_module(m), *args, **kwargs)

        # Wrap the base model with an outer FSDP wrapper
        if self.is_fsdp_xla_v2_enabled:

            def shard_output(output, mesh):
                # Pick the tensor to shard: the output itself, its first element, or its logits.
                from .modeling_outputs import CausalLMOutputWithPast

                real_output = None
                if isinstance(output, torch.Tensor):
                    real_output = output
                elif isinstance(output, tuple):
                    real_output = output[0]
                elif isinstance(output, CausalLMOutputWithPast):
                    real_output = output.logits

                if real_output is None:
                    raise ValueError("Something went wrong, the output of the model shouldn't be `None`")
                xs.mark_sharding(real_output, mesh, ("fsdp", None, None))

            self.model = model = FSDPv2(
                model,
                shard_output=shard_output,
                auto_wrap_policy=auto_wrap_policy,
                auto_wrapper_callable=auto_wrapper_callable,
            )
        else:
            self.model = model = FSDP(
                model,
                auto_wrap_policy=auto_wrap_policy,
                auto_wrapper_callable=auto_wrapper_callable,
                **fsdp_kwargs,
            )

        # Patch `xm.optimizer_step` should not reduce gradients in this case,
        # as FSDP does not need gradient reduction over sharded parameters.
        # NOTE(review): `optimizer_args={}` is a mutable default; it is only unpacked here, never mutated.
        def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}):
            loss = optimizer.step(**optimizer_args)
            if barrier:
                xm.mark_step()
            return loss

        # This replaces the attribute on the `xm` module itself, not only for this trainer.
        xm.optimizer_step = patched_optimizer_step
    elif is_sagemaker_dp_enabled():
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]
        )
    elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
        if is_torch_neuroncore_available():
            return model
        # The model is not wrapped in this branch; only the DDP kwargs handler on the accelerator is set.
        kwargs = {}
        if self.args.ddp_find_unused_parameters is not None:
            kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters
        elif isinstance(model, PreTrainedModel):
            # find_unused_parameters breaks checkpointing as per
            # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021
            kwargs["find_unused_parameters"] = not model.is_gradient_checkpointing
        else:
            kwargs["find_unused_parameters"] = True

        if self.args.ddp_bucket_cap_mb is not None:
            kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb

        if self.args.ddp_broadcast_buffers is not None:
            kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers

        self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs)

    return model
def train(
    self,
    resume_from_checkpoint: Optional[Union[str, bool]] = None,
    trial: Union["optuna.Trial", dict[str, Any], None] = None,
    ignore_keys_for_eval: Optional[list[str]] = None,
    **kwargs: Any,
):
    """
    Main training entry point.

    Args:
        resume_from_checkpoint (`str` or `bool`, *optional*):
            If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
            `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
            of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
        trial (`optuna.Trial` or `dict[str, Any]`, *optional*):
            The trial run or the hyperparameter dictionary for hyperparameter search.
        ignore_keys_for_eval (`list[str]`, *optional*):
            A list of keys in the output of your model (if it is a dictionary) that should be ignored when
            gathering predictions for evaluation during the training.
        kwargs (`dict[str, Any]`, *optional*):
            Additional keyword arguments used to hide deprecated arguments. Only `model_path` is accepted (as a
            deprecated alias of `resume_from_checkpoint`).

    Returns:
        Whatever the inner training loop returns.

    Raises:
        TypeError: If `kwargs` contains anything other than `model_path`.
        ValueError: If `resume_from_checkpoint` is `True` and no checkpoint is found in `args.output_dir`.
    """
    # `False` and `None` both mean "do not resume".
    if resume_from_checkpoint is False:
        resume_from_checkpoint = None

    # memory metrics - must set up as early as possible
    self._memory_tracker.start()

    args = self.args

    self.is_in_train = True

    # If the model uses a tokenizer, it may have a new tokens for fine-tuning purposes.
    if isinstance(self.processing_class, (PreTrainedTokenizerBase, ProcessorMixin)) and hasattr(
        self.model, "config"
    ):
        self._align_special_tokens()

    # Attach NEFTune hooks if necessary
    if self.neftune_noise_alpha is not None:
        self.model = self._activate_neftune(self.model)

    # do_train is not a reliable argument, as it might not be set and .train() still called, so
    # the following is a workaround:
    if (
        (args.fp16_full_eval or args.bf16_full_eval)
        and not args.do_train
        and not self.is_model_parallel
        and self.model_init is None
    ):
        self._move_model_to_device(self.model, args.device)

    # Deprecated alias: `model_path` overrides `resume_from_checkpoint` when given.
    if "model_path" in kwargs:
        resume_from_checkpoint = kwargs.pop("model_path")
        warnings.warn(
            "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` "
            "instead.",
            FutureWarning,
        )
    if len(kwargs) > 0:
        raise TypeError(f"train() got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}.")
    # This might change the seed so needs to run first.
    self._hp_search_setup(trial)
    self._train_batch_size = self.args.train_batch_size

    # Model re-init
    model_reloaded = False
    if self.model_init is not None:
        # Seed must be set before instantiating the model when using model_init.
        enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
        self.model = self.call_model_init(trial)
        model_reloaded = True
        # Reinitializes optimizer and scheduler
        self.optimizer, self.lr_scheduler = None, None

    # Load potential model checkpoint
    if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
        resume_from_checkpoint = get_last_checkpoint(args.output_dir)
        if resume_from_checkpoint is None:
            raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")

    if resume_from_checkpoint is not None:
        # The checkpoint is not loaded here under SageMaker MP, DeepSpeed or FSDP.
        if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled:
            self._load_from_checkpoint(resume_from_checkpoint)
        # In case of repeating the find_executable_batch_size, set `self._train_batch_size` properly
        state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
        if state.train_batch_size is not None:
            self._train_batch_size = state.train_batch_size

    # If model was re-initialized, put it on the right device and update self.model_wrapped
    if model_reloaded:
        if self.place_model_on_device:
            self._move_model_to_device(self.model, args.device)
        self.model_wrapped = self.model

    inner_training_loop = find_executable_batch_size(
        self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
    )
    if args.push_to_hub:
        try:
            # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
            hf_hub_utils.disable_progress_bars()
            return inner_training_loop(
                args=args,
                resume_from_checkpoint=resume_from_checkpoint,
                trial=trial,
                ignore_keys_for_eval=ignore_keys_for_eval,
            )
        finally:
            # Progress bars are re-enabled even when training raises.
            hf_hub_utils.enable_progress_bars()
    else:
        return inner_training_loop(
            args=args,
            resume_from_checkpoint=resume_from_checkpoint,
            trial=trial,
            ignore_keys_for_eval=ignore_keys_for_eval,
        )
  2021. def get_tp_size(self) -> int:
  2022. """Get the tensor parallel size from either the model or DeepSpeed config."""
  2023. # 1. Check model.tp_size first
  2024. if (model_tp := getattr(self.model, "_tp_size", None)) is not None:
  2025. return model_tp
  2026. # 2. Fall back to DeepSpeed config if enabled
  2027. if self.is_deepspeed_enabled and (deepspeed_config := getattr(self.args, "hf_deepspeed_config", None)):
  2028. return deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1)
  2029. # 3. Default fallback
  2030. return 1
  2031. def get_total_train_batch_size(self, args) -> int:
  2032. """Calculates total batch size (micro_batch * grad_accum * dp_world_size).
  2033. Note: Only considers DP and TP (dp_world_size = world_size // tp_size)."""
  2034. dp_world_size = args.world_size // self.get_tp_size()
  2035. return self._train_batch_size * args.gradient_accumulation_steps * dp_world_size
  2036. def _inner_training_loop(
  2037. self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
  2038. ):
  2039. self.accelerator.free_memory()
  2040. self._train_batch_size = batch_size
  2041. if self.args.auto_find_batch_size:
  2042. if self.state.train_batch_size != self._train_batch_size:
  2043. from accelerate.utils import release_memory
  2044. (self.model_wrapped,) = release_memory(self.model_wrapped)
  2045. self.model_wrapped = self.model
  2046. # Check for DeepSpeed *after* the initial pass and modify the config
  2047. if self.is_deepspeed_enabled:
  2048. # Temporarily unset `self.args.train_batch_size`
  2049. original_bs = self.args.per_device_train_batch_size
  2050. self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu)
  2051. self.propagate_args_to_deepspeed(True)
  2052. self.args.per_device_train_batch_size = original_bs
  2053. self.state.train_batch_size = self._train_batch_size
  2054. logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
  2055. # Data loader and number of training steps
  2056. train_dataloader = self.get_train_dataloader()
  2057. if self.is_fsdp_xla_v2_enabled:
  2058. train_dataloader = tpu_spmd_dataloader(train_dataloader)
  2059. # Setting up training control variables:
  2060. # number of training epochs: num_train_epochs
  2061. # number of training steps per epoch: num_update_steps_per_epoch
  2062. # total number of training steps to execute: max_steps
  2063. total_train_batch_size = self.get_total_train_batch_size(args)
  2064. (
  2065. num_train_epochs,
  2066. num_update_steps_per_epoch,
  2067. num_examples,
  2068. num_train_samples,
  2069. epoch_based,
  2070. len_dataloader,
  2071. max_steps,
  2072. ) = self.set_initial_training_values(args, train_dataloader, total_train_batch_size)
  2073. num_train_tokens = None
  2074. if self.args.include_tokens_per_second:
  2075. num_train_tokens = self.num_tokens(train_dataloader, None if epoch_based else max_steps)
  2076. # If going by epochs, multiply tokens linearly
  2077. if len_dataloader is not None and epoch_based:
  2078. num_train_tokens *= args.num_train_epochs
  2079. # Otherwise since its steps, we just multiply by grad accum
  2080. else:
  2081. num_train_tokens *= args.gradient_accumulation_steps
  2082. if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
  2083. if self.args.n_gpu > 1:
  2084. # nn.DataParallel(model) replicates the model, creating new variables and module
  2085. # references registered here no longer work on other gpus, breaking the module
  2086. raise ValueError(
  2087. "Currently --debug underflow_overflow is not supported under DP. Please use DDP"
  2088. " (torchrun or torch.distributed.launch (deprecated))."
  2089. )
  2090. else:
  2091. DebugUnderflowOverflow(self.model)
  2092. delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
  2093. # Can't delay optimizer creation when using FSDP2: https://github.com/huggingface/accelerate/blob/3f636d626063ffcf9a337c7d3624d61b7d187d59/src/accelerate/accelerator.py#L1404
  2094. is_fsdp2 = self.is_fsdp_enabled and (getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2)
  2095. if is_fsdp2:
  2096. delay_optimizer_creation = False
  2097. # We need to reset the scheduler, as its parameters may be different on subsequent calls
  2098. if self._created_lr_scheduler:
  2099. self.lr_scheduler = None
  2100. self._created_lr_scheduler = False
  2101. if self.is_deepspeed_enabled:
  2102. self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
  2103. if not delay_optimizer_creation:
  2104. self.create_optimizer_and_scheduler(num_training_steps=max_steps)
  2105. self.state = TrainerState(
  2106. stateful_callbacks=[
  2107. cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
  2108. ]
  2109. )
  2110. self.state.is_hyper_param_search = trial is not None
  2111. self.state.train_batch_size = self._train_batch_size
  2112. # Compute absolute values for logging, eval, and save if given as ratio
  2113. self.state.compute_steps(args, max_steps)
  2114. # Activate gradient checkpointing if needed
  2115. if args.gradient_checkpointing:
  2116. self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs)
  2117. model = self._wrap_model(self.model_wrapped)
  2118. # as the model is wrapped, don't use `accelerator.prepare`
  2119. # this is for unhandled cases such as
  2120. # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX
  2121. use_accelerator_prepare = model is self.model
  2122. if use_accelerator_prepare and self.is_fsdp_enabled:
  2123. # In case of auto_find_batch_size=True
  2124. # Remove FSDP wrapping from sub-models.
  2125. self.model = unwrap_model(self.model, recursive=True)
  2126. if delay_optimizer_creation:
  2127. if use_accelerator_prepare:
  2128. # configure fsdp plugin for qlora if any
  2129. self._fsdp_qlora_plugin_updates()
  2130. if self.accelerator.mixed_precision != "fp8":
  2131. self.model = self.accelerator.prepare(self.model)
  2132. self.create_optimizer_and_scheduler(num_training_steps=max_steps)
  2133. # prepare using `accelerator` prepare
  2134. if use_accelerator_prepare:
  2135. self.model.train()
  2136. if hasattr(self.lr_scheduler, "step"):
  2137. if self.use_apex:
  2138. model = self.accelerator.prepare(self.model)
  2139. else:
  2140. # We should avoid accelerate preparing the model in TP case since we dont need it as it is handled by transformers from_pretrained and also it goes into DDP based preparation.
  2141. if self.is_tp_enabled:
  2142. self.optimizer = self.accelerator.prepare(self.optimizer)
  2143. else:
  2144. model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
  2145. else:
  2146. # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
  2147. model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
  2148. self.model, self.optimizer, self.lr_scheduler
  2149. )
  2150. else:
  2151. self.optimizer = self.accelerator.prepare(self.optimizer)
  2152. if self.is_fsdp_enabled:
  2153. self.model = self.model_wrapped = model
  2154. # for the rest of this function `model` is the outside model, whether it was wrapped or not
  2155. if model is not self.model:
  2156. self.model_wrapped = model
  2157. # backward compatibility
  2158. if self.is_deepspeed_enabled:
  2159. self.deepspeed = self.model_wrapped
  2160. # ckpt loading
  2161. if resume_from_checkpoint is not None:
  2162. if self.is_deepspeed_enabled:
  2163. deepspeed_load_checkpoint(
  2164. self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model)
  2165. )
  2166. elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled:
  2167. self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped)
  2168. # Check if saved optimizer or scheduler states exist
  2169. self._load_optimizer_and_scheduler(resume_from_checkpoint)
  2170. self._load_scaler(resume_from_checkpoint)
  2171. # important: at this point:
  2172. # self.model is the Transformers Model
  2173. # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model),
  2174. # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc.
  2175. # Train!
  2176. logger.info("***** Running training *****")
  2177. logger.info(f" Num examples = {num_examples:,}")
  2178. logger.info(f" Num Epochs = {num_train_epochs:,}")
  2179. logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
  2180. if self.args.per_device_train_batch_size != self._train_batch_size:
  2181. logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}")
  2182. logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
  2183. logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
  2184. logger.info(f" Total optimization steps = {max_steps:,}")
  2185. logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")
  2186. self.state.epoch = 0
  2187. start_time = time.time()
  2188. epochs_trained = 0
  2189. steps_trained_in_current_epoch = 0
  2190. # Check if continuing training from a checkpoint
  2191. if resume_from_checkpoint is not None and os.path.isfile(
  2192. os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
  2193. ):
  2194. self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
  2195. self.compare_trainer_and_checkpoint_args(self.args, self.state)
  2196. self._load_callback_state()
  2197. epochs_trained = int(self.state.global_step // num_update_steps_per_epoch)
  2198. if not args.ignore_data_skip:
  2199. steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
  2200. steps_trained_in_current_epoch *= args.gradient_accumulation_steps
  2201. else:
  2202. steps_trained_in_current_epoch = 0
  2203. logger.info(" Continuing training from checkpoint, will skip to saved global_step")
  2204. logger.info(f" Continuing training from epoch {epochs_trained}")
  2205. logger.info(f" Continuing training from global step {self.state.global_step}")
  2206. if not args.ignore_data_skip:
  2207. logger.info(
  2208. f" Will skip the first {epochs_trained} epochs then the first"
  2209. f" {steps_trained_in_current_epoch} batches in the first epoch."
  2210. )
  2211. # Update the references
  2212. for attr in ("model", "optimizer", "lr_scheduler"):
  2213. setattr(self.callback_handler, attr, getattr(self, attr))
  2214. self.callback_handler.train_dataloader = train_dataloader
  2215. self.state.init_training_references(self, max_steps, num_train_epochs, trial)
  2216. # tr_loss is a tensor to avoid synchronization of TPUs through .item()
  2217. tr_loss = torch.tensor(0.0, device=args.device)
  2218. # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses
  2219. self._total_loss_scalar = 0.0
  2220. self._globalstep_last_logged = self.state.global_step
  2221. model.zero_grad()
  2222. grad_norm: Optional[float] = None
  2223. learning_rate = None
  2224. self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
  2225. if args.eval_on_start:
  2226. self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True)
  2227. for epoch in range(epochs_trained, num_train_epochs):
  2228. epoch_dataloader = train_dataloader
  2229. if hasattr(epoch_dataloader, "set_epoch"):
  2230. epoch_dataloader.set_epoch(epoch)
  2231. # Reset the past mems state at the beginning of each epoch if necessary.
  2232. if args.past_index >= 0:
  2233. self._past = None
  2234. steps_in_epoch = (
  2235. len(epoch_dataloader)
  2236. if len_dataloader is not None
  2237. else args.max_steps * args.gradient_accumulation_steps
  2238. )
  2239. self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
  2240. step = -1
  2241. rng_to_sync = False
  2242. # Handle resumption from checkpoint
  2243. if epoch == epochs_trained and resume_from_checkpoint is not None:
  2244. if steps_trained_in_current_epoch > 0 and not args.ignore_data_skip:
  2245. epoch_dataloader = skip_first_batches(epoch_dataloader, steps_trained_in_current_epoch)
  2246. step = steps_trained_in_current_epoch - 1
  2247. rng_to_sync = True
  2248. elif steps_trained_in_current_epoch == 0:
  2249. self._load_rng_state(resume_from_checkpoint)
  2250. epoch_iterator = iter(epoch_dataloader)
  2251. # We chunkify the epoch iterator into gradient accumulation steps `n` batches
  2252. remainder = steps_in_epoch % args.gradient_accumulation_steps
  2253. if remainder == 0:
  2254. remainder = args.gradient_accumulation_steps
  2255. update_step = -1
  2256. total_updates = steps_in_epoch // args.gradient_accumulation_steps + int(
  2257. remainder < args.gradient_accumulation_steps
  2258. )
  2259. for _ in range(total_updates):
  2260. update_step += 1
  2261. num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
  2262. batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
  2263. # Store the number of batches for current gradient accumulation
  2264. # This is used to correctly scale the loss when the last accumulation step has fewer batches
  2265. self.current_gradient_accumulation_steps = len(batch_samples)
  2266. for i, inputs in enumerate(batch_samples):
  2267. step += 1
  2268. do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch
  2269. # Since we perform prefetching, we need to manually set sync_gradients
  2270. self.accelerator.gradient_state._set_sync_gradients(do_sync_step)
  2271. if self.args.include_num_input_tokens_seen not in ["no", False]:
  2272. main_input_name = getattr(self.model, "main_input_name", "input_ids")
  2273. if main_input_name not in inputs:
  2274. logger.warning(
  2275. "Tried to track the number of tokens seen, however the current model is "
  2276. "not configured properly to know what item is the input. To fix this, add "
  2277. "a `main_input_name` attribute to the model class you are using."
  2278. )
  2279. else:
  2280. if self.args.include_num_input_tokens_seen == "non_padding":
  2281. if "attention_mask" in inputs:
  2282. input_tokens = inputs["attention_mask"].sum()
  2283. elif (
  2284. self.processing_class is not None
  2285. and hasattr(self.processing_class, "pad_token_id")
  2286. and self.processing_class.pad_token_id is not None
  2287. ):
  2288. input_tokens = (
  2289. inputs[main_input_name] != self.processing_class.pad_token_id
  2290. ).sum()
  2291. else:
  2292. logger.warning(
  2293. "Could not determine method to count non-padding tokens, falling back to counting all tokens."
  2294. )
  2295. input_tokens = inputs[main_input_name].numel()
  2296. else:
  2297. input_tokens = inputs[main_input_name].numel()
  2298. input_tokens = torch.tensor(input_tokens, device=self.args.device, dtype=torch.int64)
  2299. self.state.num_input_tokens_seen += self.accelerator.gather(input_tokens).sum().item()
  2300. if rng_to_sync:
  2301. self._load_rng_state(resume_from_checkpoint)
  2302. rng_to_sync = False
  2303. if step % args.gradient_accumulation_steps == 0:
  2304. self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
  2305. # We explicitly want to avoid relying on `accelerator.accumulate` for generation training
  2306. context = (
  2307. functools.partial(self.accelerator.no_sync, model=model)
  2308. if i != len(batch_samples) - 1
  2309. and self.accelerator.distributed_type != DistributedType.DEEPSPEED
  2310. else contextlib.nullcontext
  2311. )
  2312. with context():
  2313. tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  2314. if (
  2315. args.logging_nan_inf_filter
  2316. and not is_torch_xla_available()
  2317. and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
  2318. ):
  2319. # if loss is nan or inf simply add the average of previous logged losses
  2320. tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
  2321. else:
  2322. if tr_loss.device != tr_loss_step.device:
  2323. raise ValueError(
  2324. f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}"
  2325. )
  2326. tr_loss = tr_loss + tr_loss_step
  2327. self.current_flos += float(self.floating_point_ops(inputs))
  2328. if do_sync_step:
  2329. # Since we perform prefetching, we need to manually set sync_gradients to True
  2330. self.accelerator.gradient_state._set_sync_gradients(True)
  2331. # Gradient clipping
  2332. if args.max_grad_norm is not None and args.max_grad_norm > 0:
  2333. if is_sagemaker_mp_enabled() and args.fp16:
  2334. _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm)
  2335. elif self.use_apex:
  2336. from apex import amp
  2337. # Revert to normal clipping otherwise, handling Apex or full precision
  2338. _grad_norm = nn.utils.clip_grad_norm_(
  2339. amp.master_params(self.optimizer),
  2340. args.max_grad_norm,
  2341. )
  2342. else:
  2343. grad_norm_context = contextlib.nullcontext
  2344. if self.is_tp_enabled:
  2345. from torch.distributed._tensor.experimental import implicit_replication
  2346. grad_norm_context = implicit_replication
  2347. with grad_norm_context():
  2348. _grad_norm = self.accelerator.clip_grad_norm_(
  2349. model.parameters(),
  2350. args.max_grad_norm,
  2351. )
  2352. if (
  2353. is_accelerate_available()
  2354. and self.accelerator.distributed_type == DistributedType.DEEPSPEED
  2355. ):
  2356. grad_norm = model.get_global_grad_norm()
  2357. # In some cases the grad norm may not return a float
  2358. if hasattr(grad_norm, "item"):
  2359. grad_norm = grad_norm.item()
  2360. else:
  2361. grad_norm = _grad_norm
  2362. self.control = self.callback_handler.on_pre_optimizer_step(args, self.state, self.control)
  2363. context = contextlib.nullcontext
  2364. if self.is_tp_enabled:
  2365. from torch.distributed._tensor.experimental import implicit_replication
  2366. context = implicit_replication
  2367. with context():
  2368. self.optimizer.step()
  2369. self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control)
  2370. # get leaning rate before update
  2371. learning_rate = self._get_learning_rate()
  2372. if not self.accelerator.optimizer_step_was_skipped:
  2373. # Delay optimizer scheduling until metrics are generated
  2374. if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
  2375. self.lr_scheduler.step()
  2376. model.zero_grad()
  2377. self.state.global_step += 1
  2378. self.state.epoch = epoch + (step + 1) / steps_in_epoch
  2379. self.control = self.callback_handler.on_step_end(args, self.state, self.control)
  2380. self._maybe_log_save_evaluate(
  2381. tr_loss,
  2382. grad_norm,
  2383. model,
  2384. trial,
  2385. epoch,
  2386. ignore_keys_for_eval,
  2387. start_time,
  2388. learning_rate=learning_rate,
  2389. )
  2390. else:
  2391. self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
  2392. # PyTorch/XLA relies on the data loader to insert the mark_step for
  2393. # each step. Since we are breaking the loop early, we need to manually
  2394. # insert the mark_step here.
  2395. if self.control.should_epoch_stop or self.control.should_training_stop:
  2396. if is_torch_xla_available():
  2397. xm.mark_step()
  2398. break
  2399. # We also need to break out of the nested loop
  2400. if self.control.should_epoch_stop or self.control.should_training_stop:
  2401. if is_torch_xla_available():
  2402. xm.mark_step()
  2403. break
  2404. if step < 0:
  2405. logger.warning(
  2406. "There seems not to be a single sample in your epoch_iterator, stopping training at step"
  2407. f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
  2408. f" num_steps ({max_steps}) higher than the number of available samples."
  2409. )
  2410. self.control.should_training_stop = True
  2411. self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
  2412. self._maybe_log_save_evaluate(
  2413. tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=learning_rate
  2414. )
  2415. if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
  2416. if is_torch_xla_available():
  2417. # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
  2418. xm.master_print(met.metrics_report())
  2419. else:
  2420. logger.warning(
  2421. "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
  2422. "configured. Check your training configuration if this is unexpected."
  2423. )
  2424. if self.control.should_training_stop:
  2425. break
  2426. if args.past_index and hasattr(self, "_past"):
  2427. # Clean the state at the end of training
  2428. delattr(self, "_past")
  2429. logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
  2430. if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
  2431. self._load_best_model()
  2432. # add remaining tr_loss
  2433. self._total_loss_scalar += tr_loss.item()
  2434. effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError
  2435. train_loss = self._total_loss_scalar / effective_global_step
  2436. metrics = speed_metrics(
  2437. "train",
  2438. start_time,
  2439. num_samples=num_train_samples,
  2440. num_steps=self.state.max_steps,
  2441. num_tokens=num_train_tokens,
  2442. )
  2443. self.store_flos()
  2444. metrics["total_flos"] = self.state.total_flos
  2445. metrics["train_loss"] = train_loss
  2446. self.is_in_train = False
  2447. self._memory_tracker.stop_and_update_metrics(metrics)
  2448. self.log(metrics)
  2449. run_dir = self._get_output_dir(trial)
  2450. checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
  2451. # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
  2452. if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
  2453. for checkpoint in checkpoints_sorted:
  2454. if not os.path.samefile(checkpoint, self.state.best_model_checkpoint):
  2455. logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
  2456. shutil.rmtree(checkpoint, ignore_errors=True)
  2457. self.control = self.callback_handler.on_train_end(args, self.state, self.control)
  2458. # Wait for the checkpoint to be uploaded.
  2459. self._finish_current_push()
  2460. # After training we make sure to retrieve back the original forward pass method
  2461. # for the embedding layer by removing the forward post hook.
  2462. if self.neftune_noise_alpha is not None:
  2463. self._deactivate_neftune(self.model)
  2464. return TrainOutput(self.state.global_step, train_loss, metrics)
  2465. def _get_output_dir(self, trial):
  2466. if self.hp_search_backend is not None and trial is not None:
  2467. if self.hp_search_backend == HPSearchBackend.OPTUNA:
  2468. run_id = trial.number
  2469. elif self.hp_search_backend == HPSearchBackend.RAY:
  2470. import ray.train
  2471. run_id = ray.train.get_context().get_trial_id()
  2472. elif self.hp_search_backend == HPSearchBackend.SIGOPT:
  2473. run_id = trial.id
  2474. elif self.hp_search_backend == HPSearchBackend.WANDB:
  2475. import wandb
  2476. run_id = wandb.run.id
  2477. run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}"
  2478. run_dir = os.path.join(self.args.output_dir, run_name)
  2479. else:
  2480. run_dir = self.args.output_dir
  2481. return run_dir
    def _load_from_checkpoint(self, resume_from_checkpoint, model=None) -> None:
        """
        Load the model weights found in `resume_from_checkpoint` into `model`.

        The loading strategy is chosen from what is present in the checkpoint directory:

        1. a single weights file (`WEIGHTS_NAME` / `SAFE_WEIGHTS_NAME`) or an FSDP checkpoint, loaded through
           SageMaker MP, FSDP, or a plain `load_state_dict` depending on the active backend;
        2. otherwise, if `model` is a PEFT model, adapter weights (one adapter, or one per sub-directory);
        3. otherwise, a sharded checkpoint.

        Args:
            resume_from_checkpoint: Path of the checkpoint directory to load from.
            model: Model to load the weights into. Defaults to `self.model` when `None`.

        Raises:
            ValueError: If the directory holds an FSDP checkpoint while FSDP is not enabled, or if none of the
                known weight files, index files, adapter files, adapter sub-directories or FSDP checkpoints
                is found.
        """
        if model is None:
            model = self.model

        # Every file this method may look for inside the checkpoint directory.
        config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME)
        adapter_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME)
        adapter_safe_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)
        weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME)
        weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME)
        safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME)
        safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME)
        is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and (
            # this checks the FSDP state dict when `SHARDED_STATE_DICT` is used
            any(
                FSDP_MODEL_NAME in folder_name
                for folder_name in os.listdir(resume_from_checkpoint)
                if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
            )
            # this checks the FSDP state dict when `FULL_STATE_DICT` is used
            or os.path.isfile(os.path.join(resume_from_checkpoint, f"{FSDP_MODEL_NAME}.bin"))
        )
        # if multiple adapters exist, they get saved in sub directories
        # (a sub-directory counts as an adapter directory when it holds an adapter weights file)
        adapter_subdirs = (
            [
                folder_name
                for folder_name in os.listdir(resume_from_checkpoint)
                if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
                and (
                    os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_WEIGHTS_NAME))
                    or os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_SAFE_WEIGHTS_NAME))
                )
            ]
            if os.path.isdir(resume_from_checkpoint)
            else []
        )

        if is_fsdp_ckpt and not self.is_fsdp_enabled:
            raise ValueError(f"Checkpoint found at {resume_from_checkpoint} is only supported when using PyTorch FSDP")

        # Bail out early when nothing loadable is present in any of the supported layouts.
        if not (
            any(
                os.path.isfile(f)
                for f in [
                    weights_file,
                    safe_weights_file,
                    weights_index_file,
                    safe_weights_index_file,
                    adapter_weights_file,
                    adapter_safe_weights_file,
                ]
            )
            or is_fsdp_ckpt
            or adapter_subdirs
        ):
            raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")

        logger.info(f"Loading model from {resume_from_checkpoint}.")

        # Only warn (never fail) when the checkpoint was written by a different library version.
        if os.path.isfile(config_file):
            config = PretrainedConfig.from_json_file(config_file)
            checkpoint_version = config.transformers_version
            if checkpoint_version is not None and checkpoint_version != __version__:
                logger.warning(
                    f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
                    f"Transformers but your current version is {__version__}. This is not recommended and could "
                    "yield to errors or unwanted behaviors."
                )

        if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or is_fsdp_ckpt:
            # If the model is on the GPU, it still works!
            if is_sagemaker_mp_enabled():
                if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")):
                    # If the 'user_content.pt' file exists, load with the new smp api.
                    # Checkpoint must have been saved with the new smp api.
                    smp.resume_from_checkpoint(
                        path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
                    )
                else:
                    # If the 'user_content.pt' file does NOT exist, load with the old smp api.
                    # Checkpoint must have been saved with the old smp api.
                    if hasattr(self.args, "fp16") and self.args.fp16 is True:
                        logger.warning(
                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
                        )
                    check_torch_load_is_safe()
                    state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)
                    # Required for smp to not auto-translate state_dict from hf to smp (is already smp).
                    state_dict["_smp_is_partial"] = False
                    # NOTE: `load_result` is assigned but not inspected on the SageMaker MP path.
                    load_result = model.load_state_dict(state_dict, strict=True)
                    # release memory
                    del state_dict
            elif self.is_fsdp_enabled:
                load_fsdp_model(
                    self.accelerator.state.fsdp_plugin,
                    self.accelerator,
                    model,
                    resume_from_checkpoint,
                    **_get_fsdp_ckpt_kwargs(),
                )
            else:
                # We load the model state dict on the CPU to avoid an OOM error.
                # The safetensors file is used only when `save_safetensors` is set and the file exists;
                # in every other case the `WEIGHTS_NAME` file is loaded with `torch.load`.
                if self.args.save_safetensors and os.path.isfile(safe_weights_file):
                    state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
                else:
                    check_torch_load_is_safe()
                    state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)

                # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
                # which takes *args instead of **kwargs
                # (the positional `False` is the `strict` argument)
                load_result = model.load_state_dict(state_dict, False)
                # release memory
                del state_dict
                self._issue_warnings_after_load(load_result)

        # Load adapters following PR # 24096
        elif _is_peft_model(model):
            # If train a model using PEFT & LoRA, assume that adapter have been saved properly.
            # TODO: in the future support only specific min PEFT versions
            if (hasattr(model, "active_adapter") or hasattr(model, "active_adapters")) and hasattr(
                model, "load_adapter"
            ):
                if os.path.exists(resume_from_checkpoint):
                    # For BC for older PEFT versions
                    if hasattr(model, "active_adapters"):
                        active_adapters = model.active_adapters
                        if len(active_adapters) > 1:
                            logger.warning("Multiple active adapters detected will only consider the first adapter")
                        active_adapter = active_adapters[0]
                    else:
                        active_adapter = model.active_adapter

                    if adapter_subdirs:
                        # One adapter per sub-directory, named after the sub-directory; only the one whose
                        # name matches the active adapter is loaded as trainable.
                        for subdir_name in adapter_subdirs:
                            peft_id = os.path.join(resume_from_checkpoint, subdir_name)
                            model.load_adapter(peft_id, subdir_name, is_trainable=(subdir_name == active_adapter))
                        model.set_adapter(active_adapter)
                    else:
                        model.load_adapter(resume_from_checkpoint, active_adapter, is_trainable=True)
                else:
                    logger.warning(
                        "The intermediate checkpoints of PEFT may not be saved correctly, "
                        f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
                        "Check some examples here: https://github.com/huggingface/peft/issues/96"
                    )
            else:
                logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed")
        else:
            # We load the sharded checkpoint
            load_result = load_sharded_checkpoint(
                model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors
            )
            # Key mismatches are only reported when loading was non-strict (i.e. not under SageMaker MP).
            if not is_sagemaker_mp_enabled():
                self._issue_warnings_after_load(load_result)
    def _load_best_model(self) -> None:
        """
        Reload the weights stored in `self.state.best_model_checkpoint` into the current model.

        The loading path is picked in this order: DeepSpeed, FSDP, a single weights/adapter file found in the
        checkpoint directory (through SageMaker MP, PEFT, or a plain `load_state_dict`), a sharded checkpoint
        (index file present). If none applies, a warning is logged and nothing is loaded.
        """
        logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).")
        best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
        best_safe_model_path = os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_NAME)
        best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME)
        best_safe_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)

        # Under SageMaker MP the wrapped model is the load target; otherwise the plain model is.
        model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
        if self.is_deepspeed_enabled:
            deepspeed_load_checkpoint(
                self.model_wrapped,
                self.state.best_model_checkpoint,
                load_module_strict=not _is_peft_model(self.model),
            )
        elif self.is_fsdp_enabled:
            # NOTE: `load_result` is assigned here but not inspected on the FSDP path.
            load_result = load_fsdp_model(
                self.accelerator.state.fsdp_plugin,
                self.accelerator,
                model,
                self.state.best_model_checkpoint,
                **_get_fsdp_ckpt_kwargs(),
            )
        elif (
            os.path.exists(best_model_path)
            or os.path.exists(best_safe_model_path)
            or os.path.exists(best_adapter_model_path)
            or os.path.exists(best_safe_adapter_model_path)
        ):
            # Tracks whether a load actually happened, so that `load_result` is only read when it was bound.
            has_been_loaded = True
            if is_sagemaker_mp_enabled():
                if os.path.isfile(os.path.join(self.state.best_model_checkpoint, "user_content.pt")):
                    # If the 'user_content.pt' file exists, load with the new smp api.
                    # Checkpoint must have been saved with the new smp api.
                    smp.resume_from_checkpoint(
                        path=self.state.best_model_checkpoint,
                        tag=WEIGHTS_NAME,
                        partial=False,
                        load_optimizer=False,
                    )
                else:
                    # If the 'user_content.pt' file does NOT exist, load with the old smp api.
                    # Checkpoint must have been saved with the old smp api.
                    if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
                        state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
                    else:
                        check_torch_load_is_safe()
                        state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True)

                    # Same flag as set in `_load_from_checkpoint` before loading an smp state dict.
                    state_dict["_smp_is_partial"] = False
                    load_result = model.load_state_dict(state_dict, strict=True)
            else:
                if _is_peft_model(model):
                    # If train a model using PEFT & LoRA, assume that adapter have been saved properly.
                    # TODO: in the future support only specific min PEFT versions
                    if (hasattr(model, "active_adapter") or hasattr(model, "active_adapters")) and hasattr(
                        model, "load_adapter"
                    ):
                        # For BC for older PEFT versions
                        if hasattr(model, "active_adapters"):
                            active_adapter = model.active_adapters[0]
                            if len(model.active_adapters) > 1:
                                logger.warning("Detected multiple active adapters, will only consider the first one")
                        else:
                            active_adapter = model.active_adapter

                        if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path):
                            try:
                                model.load_adapter(self.state.best_model_checkpoint, active_adapter)
                            except RuntimeError as exc:
                                # Re-raise with a more helpful message for prompt-learning adapters;
                                # any other RuntimeError propagates unchanged.
                                if model.peft_config[active_adapter].is_prompt_learning:
                                    # for context: https://github.com/huggingface/peft/issues/2256
                                    msg = (
                                        "When using prompt learning PEFT methods such as "
                                        f"{model.peft_config[active_adapter].peft_type.value}, setting "
                                        "load_best_model_at_end=True can lead to errors, it is recommended "
                                        "to set this to False and to load the model manually from the checkpoint "
                                        "directory using PeftModel.from_pretrained(base_model, <path>) after training "
                                        "has finished."
                                    )
                                    raise RuntimeError(msg) from exc
                                else:
                                    raise
                            # Load_adapter has no return value present, modify it when appropriate.
                            # An empty result is built so the warning step below has nothing to report.
                            from torch.nn.modules.module import _IncompatibleKeys

                            load_result = _IncompatibleKeys([], [])
                        else:
                            logger.warning(
                                "The intermediate checkpoints of PEFT may not be saved correctly, "
                                f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
                                "Check some examples here: https://github.com/huggingface/peft/issues/96"
                            )
                            has_been_loaded = False
                    else:
                        logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed")
                        has_been_loaded = False
                else:
                    # We load the model state dict on the CPU to avoid an OOM error.
                    if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
                        state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
                    else:
                        check_torch_load_is_safe()
                        state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True)

                    # If the model is on the GPU, it still works!
                    # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
                    # which takes *args instead of **kwargs
                    # (the positional `False` is the `strict` argument)
                    load_result = model.load_state_dict(state_dict, False)
                # Skipped when no load happened above, in which case `load_result` is unbound.
                if not is_sagemaker_mp_enabled() and has_been_loaded:
                    self._issue_warnings_after_load(load_result)
        elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists(
            os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)
        ):
            # An index file is present: the best checkpoint is sharded.
            load_result = load_sharded_checkpoint(
                model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled()
            )
            if not is_sagemaker_mp_enabled():
                self._issue_warnings_after_load(load_result)
        else:
            logger.warning(
                f"Could not locate the best model at {best_model_path}, if you are running a distributed training "
                "on multiple nodes, you should activate `--save_on_each_node`."
            )
  2744. def _issue_warnings_after_load(self, load_result):
  2745. if len(load_result.missing_keys) != 0:
  2746. if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set(
  2747. self.model._keys_to_ignore_on_save
  2748. ):
  2749. self.model.tie_weights()
  2750. else:
  2751. logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")
  2752. if len(load_result.unexpected_keys) != 0:
  2753. logger.warning(
  2754. f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}."
  2755. )
  2756. def _evaluate(self, trial, ignore_keys_for_eval, skip_scheduler=False):
  2757. metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  2758. self._report_to_hp_search(trial, self.state.global_step, metrics)
  2759. # Run delayed LR scheduler now that metrics are populated
  2760. if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and not skip_scheduler:
  2761. metric_to_check = self.args.metric_for_best_model
  2762. if not metric_to_check.startswith("eval_"):
  2763. metric_to_check = f"eval_{metric_to_check}"
  2764. try:
  2765. self.lr_scheduler.step(metrics[metric_to_check])
  2766. except KeyError as exc:
  2767. raise KeyError(
  2768. f"The `metric_for_best_model` training argument is set to '{metric_to_check}', "
  2769. f"which is not found in the evaluation metrics. "
  2770. f"The available evaluation metrics are: {list(metrics.keys())}. "
  2771. f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or "
  2772. f"consider changing the `metric_for_best_model` via the TrainingArguments."
  2773. ) from exc
  2774. return metrics
  2775. def _maybe_log_save_evaluate(
  2776. self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=None
  2777. ):
  2778. if self.control.should_log and self.state.global_step > self._globalstep_last_logged:
  2779. if is_torch_xla_available():
  2780. xm.mark_step()
  2781. logs: dict[str, float] = {}
  2782. # all_gather + mean() to get average loss over all processes
  2783. tr_loss_scalar = self._nested_gather(tr_loss).mean().item()
  2784. # reset tr_loss to zero
  2785. tr_loss -= tr_loss
  2786. logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
  2787. if grad_norm is not None:
  2788. logs["grad_norm"] = grad_norm.item() if isinstance(grad_norm, torch.Tensor) else grad_norm
  2789. if learning_rate is not None:
  2790. logs["learning_rate"] = learning_rate
  2791. else:
  2792. logs["learning_rate"] = self._get_learning_rate()
  2793. self._total_loss_scalar += tr_loss_scalar
  2794. self._globalstep_last_logged = self.state.global_step
  2795. self.store_flos()
  2796. self.log(logs, start_time)
  2797. metrics = None
  2798. if self.control.should_evaluate:
  2799. metrics = self._evaluate(trial, ignore_keys_for_eval)
  2800. is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial)
  2801. if self.args.save_strategy == SaveStrategy.BEST:
  2802. self.control.should_save = is_new_best_metric
  2803. if self.control.should_save:
  2804. self._save_checkpoint(model, trial)
  2805. self.control = self.callback_handler.on_save(self.args, self.state, self.control)
  2806. def _load_rng_state(self, checkpoint):
  2807. # Load RNG states from `checkpoint`
  2808. if checkpoint is None:
  2809. return
  2810. if self.args.world_size > 1:
  2811. process_index = self.args.process_index
  2812. rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth")
  2813. if not os.path.isfile(rng_file):
  2814. logger.info(
  2815. f"Didn't find an RNG file for process {process_index}, if you are resuming a training that "
  2816. "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
  2817. )
  2818. return
  2819. else:
  2820. rng_file = os.path.join(checkpoint, "rng_state.pth")
  2821. if not os.path.isfile(rng_file):
  2822. logger.info(
  2823. "Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
  2824. "fashion, reproducibility is not guaranteed."
  2825. )
  2826. return
  2827. with safe_globals():
  2828. checkpoint_rng_state = torch.load(rng_file)
  2829. random.setstate(checkpoint_rng_state["python"])
  2830. np.random.set_state(checkpoint_rng_state["numpy"])
  2831. torch.random.set_rng_state(checkpoint_rng_state["cpu"])
  2832. if is_torch_xla_available():
  2833. xm.set_rng_state(checkpoint_rng_state["xla"])
  2834. is_distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED
  2835. if torch.cuda.is_available():
  2836. set_rng_state_for_device("CUDA", torch.cuda, checkpoint_rng_state, is_distributed)
  2837. if is_torch_npu_available():
  2838. set_rng_state_for_device("NPU", torch.npu, checkpoint_rng_state, is_distributed)
  2839. if is_torch_hpu_available():
  2840. set_rng_state_for_device("HPU", torch.hpu, checkpoint_rng_state, is_distributed)
  2841. if is_torch_mlu_available():
  2842. set_rng_state_for_device("MLU", torch.mlu, checkpoint_rng_state, is_distributed)
  2843. if is_torch_musa_available():
  2844. set_rng_state_for_device("MUSA", torch.musa, checkpoint_rng_state, is_distributed)
  2845. def _determine_best_metric(self, metrics, trial):
  2846. """
  2847. Determine if the model should be saved based on the evaluation metrics.
  2848. Returns:
  2849. bool: True if a new best metric was found, else False
  2850. """
  2851. is_new_best_metric = False
  2852. if self.args.metric_for_best_model is not None:
  2853. metric_to_check = self.args.metric_for_best_model
  2854. if not metric_to_check.startswith("eval_"):
  2855. metric_to_check = f"eval_{metric_to_check}"
  2856. try:
  2857. metric_value = metrics[metric_to_check]
  2858. except KeyError as exc:
  2859. raise KeyError(
  2860. f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. "
  2861. f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments."
  2862. ) from exc
  2863. operator = np.greater if self.args.greater_is_better else np.less
  2864. if self.state.best_metric is None:
  2865. self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf")
  2866. if operator(metric_value, self.state.best_metric):
  2867. self.state.best_metric = metric_value
  2868. if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH]:
  2869. self.state.best_global_step = self.state.global_step
  2870. is_new_best_metric = True
  2871. return is_new_best_metric
  def _save_checkpoint(self, model, trial):
      """
      Write a checkpoint for the current global step under the run directory.

      Saves the model, then (unless `save_only_model` is set) the optimizer, scheduler, scaler and RNG states,
      then the trainer state, optionally pushes to the Hub, and finally rotates old checkpoints.

      Args:
          model: Not used in the body of this method; `self.save_model` is what writes the weights.
          trial: Passed to `self._get_output_dir` to resolve the run directory.
      """
      # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
      # want to save except FullyShardedDDP.
      checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

      if self.hp_search_backend is None and trial is None:
          self.store_flos()

      run_dir = self._get_output_dir(trial=trial)
      output_dir = os.path.join(run_dir, checkpoint_folder)
      self.save_model(output_dir, _internal_call=True)

      tracks_best_step = self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH]
      if tracks_best_step and self.state.best_global_step:
          # Synchronize all processes so the model is known to be written by process 0 before the
          # best checkpoint directory is probed.
          if is_torch_xla_available():
              xm.rendezvous("load_best_model_at_end")
          elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
              dist.barrier()
          elif is_sagemaker_mp_enabled():
              smp.barrier()

          best_checkpoint_dir = os.path.join(run_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.best_global_step}")
          if os.path.exists(best_checkpoint_dir):
              self.state.best_model_checkpoint = best_checkpoint_dir

      if not self.args.save_only_model:
          # Everything needed to resume training: optimizer/scheduler, scaler, RNG state.
          self._save_optimizer_and_scheduler(output_dir)
          self._save_scaler(output_dir)
          self._save_rng_state(output_dir)

      if self.args.should_save:
          # Bring `ExportableState` callbacks and the `TrainerControl` state up to date before serializing.
          for candidate in self.callback_handler.callbacks + [self.control]:
              if not isinstance(candidate, ExportableState):
                  continue
              cb_name = candidate.__class__.__name__
              cb_state = candidate.state()
              if isinstance(self.state.stateful_callbacks[cb_name], list):
                  self.state.stateful_callbacks[cb_name].append(cb_state)
              else:
                  self.state.stateful_callbacks[cb_name] = cb_state
          self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))

      if self.args.push_to_hub:
          self._push_from_checkpoint(output_dir)

      # Maybe delete some older checkpoints.
      if self.args.should_save:
          # mtime ordering is the default; filesystems without mtime support are detected in `_sorted_checkpoints`
          self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
  def _save_rng_state(self, output_dir):
      """
      Snapshot the Python, NumPy and torch RNG states (plus any available accelerator backends) into `output_dir`.

      The file is named `rng_state.pth` when `self.args.world_size <= 1`, otherwise
      `rng_state_{process_index}.pth`.

      Args:
          output_dir: Directory to write into; created if it does not exist.
      """
      distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED

      def snapshot(rng_module):
          # Distributed mode stores the states of all devices, otherwise only the current device's state.
          return rng_module.get_rng_state_all() if distributed else rng_module.get_rng_state()

      rng_states = {
          "python": random.getstate(),
          "numpy": np.random.get_state(),
          "cpu": torch.random.get_rng_state(),
      }
      if torch.cuda.is_available():
          rng_states["cuda"] = snapshot(torch.cuda.random)
      if is_torch_xla_available():
          rng_states["xla"] = xm.get_rng_state()
      if is_torch_npu_available():
          rng_states["npu"] = snapshot(torch.npu.random)
      if is_torch_hpu_available():
          rng_states["hpu"] = snapshot(torch.hpu.random)
      if is_torch_mlu_available():
          rng_states["mlu"] = snapshot(torch.mlu.random)
      if is_torch_musa_available():
          # MUSA exposes the RNG getters directly on the device module.
          rng_states["musa"] = snapshot(torch.musa)

      # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
      # not yet exist.
      os.makedirs(output_dir, exist_ok=True)

      if self.args.world_size <= 1:
          file_name = "rng_state.pth"
      else:
          file_name = f"rng_state_{self.args.process_index}.pth"
      torch.save(rng_states, os.path.join(output_dir, file_name))
  def _save_optimizer_and_scheduler(self, output_dir):
      """
      Persist the optimizer and learning-rate scheduler states into `output_dir`.

      The optimizer is written through whichever backend is active (XLA, SageMaker MP, DeepSpeed, FSDP, or plain
      `torch.save`). The scheduler is written by the XLA branch itself, or otherwise by the final `torch.save`.

      Args:
          output_dir: Checkpoint directory to write into.
      """
      optimizer_path = os.path.join(output_dir, OPTIMIZER_NAME)
      scheduler_path = os.path.join(output_dir, SCHEDULER_NAME)

      if is_torch_xla_available():
          xm.rendezvous("saving_optimizer_states")
          if self.is_fsdp_xla_v1_enabled:
              sharded_payload = {
                  "optimizer": self.optimizer.state_dict(),
                  "shard_metadata": self.model.get_shard_metadata(),
              }
              rank_file = f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
              xm.save(sharded_payload, os.path.join(output_dir, rank_file), master_only=False)
          else:
              xm.save(self.optimizer.state_dict(), optimizer_path)
          with warnings.catch_warnings(record=True) as caught_warnings:
              xm.save(self.lr_scheduler.state_dict(), scheduler_path)
              reissue_pt_warnings(caught_warnings)
      elif is_sagemaker_mp_enabled():
          opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
          smp.barrier()
          shard_optimizer_state = smp.state.cfg.shard_optimizer_state
          if smp.rdp_rank() == 0 or shard_optimizer_state:
              smp.save(
                  opt_state_dict,
                  optimizer_path,
                  partial=True,
                  v3=smp.state.cfg.shard_optimizer_state,
              )
      elif self.is_deepspeed_enabled:
          # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
          # config `stage3_gather_16bit_weights_on_model_save` is True
          save_checkpoint_params = inspect.signature(self.model_wrapped.save_checkpoint).parameters
          accepts_exclude_frozen = "exclude_frozen_parameters" in set(save_checkpoint_params.keys())
          if accepts_exclude_frozen and _is_peft_model(self.model):
              self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True)
          else:
              self.model_wrapped.save_checkpoint(output_dir)
      elif self.is_fsdp_enabled:
          # save fsdp specific ckpt for resuming from ckpt
          save_fsdp_model(
              self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir, **_get_fsdp_ckpt_kwargs()
          )
          save_fsdp_optimizer(
              self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir
          )
      elif self.args.should_save:
          # deepspeed.save_checkpoint above saves model/optim/sched
          torch.save(self.optimizer.state_dict(), optimizer_path)

      # Save SCHEDULER & SCALER
      is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance(
          self.lr_scheduler, DeepSpeedSchedulerWrapper
      )
      scheduler_handled_here = not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler
      if self.args.should_save and scheduler_handled_here and not is_torch_xla_available():
          with warnings.catch_warnings(record=True) as caught_warnings:
              torch.save(self.lr_scheduler.state_dict(), scheduler_path)
          reissue_pt_warnings(caught_warnings)
  3026. def _load_optimizer_and_scheduler(self, checkpoint):
  3027. """If optimizer and scheduler states exist, load them."""
  3028. if checkpoint is None:
  3029. return
  3030. if self.is_deepspeed_enabled:
  3031. # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init
  3032. if not isinstance(self.lr_scheduler, DeepSpeedSchedulerWrapper):
  3033. with warnings.catch_warnings(record=True) as caught_warnings:
  3034. check_torch_load_is_safe()
  3035. self.lr_scheduler.load_state_dict(
  3036. torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True)
  3037. )
  3038. reissue_pt_warnings(caught_warnings)
  3039. return
  3040. checkpoint_file_exists = (
  3041. glob.glob(os.path.join(checkpoint, OPTIMIZER_NAME) + "_*")
  3042. if is_sagemaker_mp_enabled()
  3043. else (
  3044. os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME))
  3045. or os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME_BIN))
  3046. or (
  3047. os.path.isdir(checkpoint)
  3048. and any(
  3049. OPTIMIZER_NAME_BIN.split(".")[0] in folder_name
  3050. for folder_name in os.listdir(checkpoint)
  3051. if os.path.isdir(os.path.join(checkpoint, folder_name))
  3052. )
  3053. )
  3054. )
  3055. )
  3056. checkpoint_file_exists = (
  3057. glob.glob(os.path.join(checkpoint, f"rank*-of-{self.args.world_size}-{OPTIMIZER_NAME}"))
  3058. if self.is_fsdp_xla_v1_enabled
  3059. else checkpoint_file_exists
  3060. )
  3061. if checkpoint_file_exists and os.path.isfile(os.path.join(checkpoint, SCHEDULER_NAME)):
  3062. # Load in optimizer and scheduler states
  3063. if is_torch_xla_available():
  3064. # On TPU we have to take some extra precautions to properly load the states on the right device.
  3065. if self.is_fsdp_xla_v1_enabled:
  3066. check_torch_load_is_safe()
  3067. optimizer_state = torch.load(
  3068. os.path.join(
  3069. checkpoint, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
  3070. ),
  3071. map_location="cpu",
  3072. weights_only=True,
  3073. )
  3074. # We only need `optimizer` when resuming from checkpoint
  3075. optimizer_state = optimizer_state["optimizer"]
  3076. else:
  3077. check_torch_load_is_safe()
  3078. optimizer_state = torch.load(
  3079. os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu", weights_only=True
  3080. )
  3081. with warnings.catch_warnings(record=True) as caught_warnings:
  3082. check_torch_load_is_safe()
  3083. lr_scheduler_state = torch.load(
  3084. os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu", weights_only=True
  3085. )
  3086. reissue_pt_warnings(caught_warnings)
  3087. xm.send_cpu_data_to_device(optimizer_state, self.args.device)
  3088. xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device)
  3089. self.optimizer.load_state_dict(optimizer_state)
  3090. self.lr_scheduler.load_state_dict(lr_scheduler_state)
  3091. else:
  3092. if is_sagemaker_mp_enabled():
  3093. if os.path.isfile(os.path.join(checkpoint, "user_content.pt")):
  3094. # Optimizer checkpoint was saved with smp >= 1.10
  3095. def opt_load_hook(mod, opt):
  3096. opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True))
  3097. else:
  3098. # Optimizer checkpoint was saved with smp < 1.10
  3099. def opt_load_hook(mod, opt):
  3100. if IS_SAGEMAKER_MP_POST_1_10:
  3101. opt.load_state_dict(
  3102. smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True, back_compat=True)
  3103. )
  3104. else:
  3105. opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True))
  3106. self.model_wrapped.register_post_step_hook(opt_load_hook)
  3107. else:
  3108. # We use the CPU when training on one GPU to avoid OOM for GPU RAM when training big models.
  3109. # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more
  3110. # likely to get OOM on CPU (since we load num_gpu times the optimizer state
  3111. map_location = self.args.device if self.args.world_size > 1 else "cpu"
  3112. if self.is_fsdp_enabled:
  3113. load_fsdp_optimizer(
  3114. self.accelerator.state.fsdp_plugin,
  3115. self.accelerator,
  3116. self.optimizer,
  3117. self.model,
  3118. checkpoint,
  3119. **_get_fsdp_ckpt_kwargs(),
  3120. )
  3121. else:
  3122. check_torch_load_is_safe()
  3123. self.optimizer.load_state_dict(
  3124. torch.load(
  3125. os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location, weights_only=True
  3126. )
  3127. )
  3128. with warnings.catch_warnings(record=True) as caught_warnings:
  3129. check_torch_load_is_safe()
  3130. self.lr_scheduler.load_state_dict(
  3131. torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True)
  3132. )
  3133. reissue_pt_warnings(caught_warnings)
  3134. def _save_scaler(self, output_dir):
  3135. # See if there is a scaler attribute
  3136. try:
  3137. scaler = self.accelerator.scaler
  3138. except AttributeError:
  3139. return
  3140. if scaler is None:
  3141. return
  3142. if is_torch_xla_available():
  3143. xm.rendezvous("saving_scaler_state")
  3144. with warnings.catch_warnings(record=True) as caught_warnings:
  3145. xm.save(self.accelerator.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
  3146. reissue_pt_warnings(caught_warnings)
  3147. # Save SCALER
  3148. if self.args.should_save and not is_torch_xla_available():
  3149. with warnings.catch_warnings(record=True) as caught_warnings:
  3150. torch.save(self.accelerator.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
  3151. reissue_pt_warnings(caught_warnings)
  3152. def _load_scaler(self, checkpoint):
  3153. """If scaler state exists, load it."""
  3154. if checkpoint is None:
  3155. return
  3156. checkpoint_file_exists = os.path.isfile(os.path.join(checkpoint, SCALER_NAME))
  3157. if checkpoint_file_exists:
  3158. # On TPU we have to take some extra precautions to properly load the states on the right device.
  3159. # Load in scaler states
  3160. if is_torch_xla_available():
  3161. with warnings.catch_warnings(record=True) as caught_warnings:
  3162. check_torch_load_is_safe()
  3163. scaler_state = torch.load(
  3164. os.path.join(checkpoint, SCALER_NAME), map_location="cpu", weights_only=True
  3165. )
  3166. reissue_pt_warnings(caught_warnings)
  3167. xm.send_cpu_data_to_device(scaler_state, self.args.device)
  3168. self.accelerator.scaler.load_state_dict(scaler_state)
  3169. else:
  3170. with warnings.catch_warnings(record=True) as caught_warnings:
  3171. check_torch_load_is_safe()
  3172. self.accelerator.scaler.load_state_dict(
  3173. torch.load(os.path.join(checkpoint, SCALER_NAME), weights_only=True)
  3174. )
  3175. reissue_pt_warnings(caught_warnings)
  3176. def _load_callback_state(self):
  3177. """If callback states exist and were passed in, restore their states if enabled"""
  3178. if not self.args.restore_callback_states_from_checkpoint:
  3179. return
  3180. # Callback states are stored in stateful_callbacks
  3181. not_found = []
  3182. new_callbacks = []
  3183. original_callbacks = self.callback_handler.callbacks + [self.control]
  3184. for stored_callback, data in self.state.stateful_callbacks.items():
  3185. if not isinstance(data, list):
  3186. data = [data]
  3187. if any(callback.__class__.__name__ == stored_callback for callback in original_callbacks):
  3188. # We can load/restore from multiple callbacks of the same type.
  3189. duplicates = [
  3190. callback for callback in original_callbacks if callback.__class__.__name__ == stored_callback
  3191. ]
  3192. for callback, callback_data in zip(duplicates, data):
  3193. args = callback_data.get("args", {})
  3194. attributes = callback_data.get("attributes", {})
  3195. new_callback = type(callback)(**args)
  3196. for attribute, value in attributes.items():
  3197. setattr(new_callback, attribute, value)
  3198. if isinstance(callback, TrainerControl):
  3199. # Specifically for restoring the `control` state
  3200. self.control = new_callback
  3201. else:
  3202. new_callbacks.append(new_callback)
  3203. # We remove the existing callback and add it to the list of new callbacks
  3204. self.callback_handler.remove_callback(type(new_callback))
  3205. logger.info("Continuing training from checkpoint, restoring any callbacks that were passed in")
  3206. else:
  3207. not_found.append(stored_callback)
  3208. if len(not_found) > 0:
  3209. logger.warning(
  3210. f"Checkpoint included callbacks not included in current configuration. Ignoring. ({', '.join(not_found)})"
  3211. )
  3212. for callback in new_callbacks:
  3213. self.callback_handler.add_callback(callback)
  def hyperparameter_search(
      self,
      hp_space: Optional[Callable[["optuna.Trial"], dict[str, float]]] = None,
      compute_objective: Optional[Callable[[dict[str, float]], float]] = None,
      n_trials: int = 20,
      direction: Union[str, list[str]] = "minimize",
      backend: Optional[Union["str", HPSearchBackend]] = None,
      hp_name: Optional[Callable[["optuna.Trial"], str]] = None,
      **kwargs,
  ) -> Union[BestRun, list[BestRun]]:
      """
      Launch a hyperparameter search using `optuna` or `Ray Tune` or `SigOpt`. The optimized quantity is determined
      by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided,
      the sum of all metrics otherwise.

      <Tip warning={true}>

      To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
      reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
      subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
      optimizer/scheduler.

      </Tip>

      Args:
          hp_space (`Callable[["optuna.Trial"], dict[str, float]]`, *optional*):
              A function that defines the hyperparameter search space. Will default to
              [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or
              [`~trainer_utils.default_hp_space_sigopt`] depending on your backend.
          compute_objective (`Callable[[dict[str, float]], float]`, *optional*):
              A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
              method. Will default to [`~trainer_utils.default_compute_objective`].
          n_trials (`int`, *optional*, defaults to 20):
              The number of trial runs to test.
          direction (`str` or `list[str]`, *optional*, defaults to `"minimize"`):
              If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you
              should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or
              several metrics. If it's multi objectives optimization, direction is `list[str]`, can be List of
              `"minimize"` and `"maximize"`, you should pick `"minimize"` when optimizing the validation loss,
              `"maximize"` when optimizing one or several metrics.
          backend (`str` or [`~trainer_utils.HPSearchBackend`], *optional*):
              The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending
              on which one is installed. If all are installed, will default to optuna.
          hp_name (`Callable[["optuna.Trial"], str]]`, *optional*):
              A function that defines the trial/run name. Will default to None.
          kwargs (`dict[str, Any]`, *optional*):
              Additional keyword arguments for each backend:

              - `optuna`: parameters from
                [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
                and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from
                [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize)
              - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run).
                If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if available).
                If `progress_reporter` is not set in the `kwargs`,
                [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used.
              - `sigopt`: the parameter `proxies` from
                [sigopt.Connection.set_proxies](https://docs.sigopt.com/support/faq#how-do-i-use-sigopt-with-a-proxy).

      Returns:
          [`trainer_utils.BestRun` or `list[trainer_utils.BestRun]`]: All the information about the best run or best
          runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray
          backend.

      Raises:
          RuntimeError: If no `model_init` was provided to the trainer.
      """
      if backend is None:
          backend = default_hp_search_backend()
      backend = HPSearchBackend(backend)
      backend_obj = ALL_HYPERPARAMETER_SEARCH_BACKENDS[backend]()
      backend_obj.ensure_available()

      # Validate before touching trainer state so a failed precondition leaves `hp_search_backend` untouched.
      if self.model_init is None:
          raise RuntimeError(
              "To use hyperparameter search, you need to pass your model through a model_init function."
          )

      self.hp_search_backend = backend
      self.hp_space = backend_obj.default_hp_space if hp_space is None else hp_space
      self.hp_name = hp_name
      self.compute_objective = default_compute_objective if compute_objective is None else compute_objective

      try:
          best_run = backend_obj.run(self, n_trials, direction, **kwargs)
      finally:
          # Always leave search mode, even when the backend run raises.
          self.hp_search_backend = None
      return best_run
  def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
      """
      Log `logs` on the various objects watching training.

      Subclass and override this method to inject custom behavior.

      `logs` is updated in place with the epoch (when known) and, if enabled, the token count and speed metrics.
      A copy with the current global step added is appended to `self.state.log_history`.

      Args:
          logs (`dict[str, float]`):
              The values to log.
          start_time (`Optional[float]`):
              The start of training.
      """
      current_epoch = self.state.epoch
      if current_epoch is not None:
          logs["epoch"] = current_epoch

      if self.args.include_num_input_tokens_seen != "no":
          logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
          if start_time is not None:
              logs.update(speed_metrics("train", start_time, num_tokens=self.state.num_input_tokens_seen))

      history_entry = dict(logs)
      history_entry["step"] = self.state.global_step
      self.state.log_history.append(history_entry)

      self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
  3307. def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]:
  3308. """
  3309. Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
  3310. """
  3311. if isinstance(data, Mapping):
  3312. return type(data)({k: self._prepare_input(v) for k, v in data.items()})
  3313. elif isinstance(data, (tuple, list)):
  3314. return type(data)(self._prepare_input(v) for v in data)
  3315. elif isinstance(data, torch.Tensor):
  3316. kwargs = {"device": self.args.device}
  3317. if self.is_deepspeed_enabled and (torch.is_floating_point(data) or torch.is_complex(data)):
  3318. # NLP models inputs are int/uint and those get adjusted to the right dtype of the
  3319. # embedding. Other models such as wav2vec2's inputs are already float and thus
  3320. # may need special handling to match the dtypes of the model
  3321. kwargs.update({"dtype": self.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
  3322. return data.to(**kwargs)
  3323. return data
  3324. def _prepare_inputs(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[str, Union[torch.Tensor, Any]]:
  3325. """
  3326. Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
  3327. handling potential state.
  3328. """
  3329. inputs = self._prepare_input(inputs)
  3330. if len(inputs) == 0:
  3331. raise ValueError(
  3332. "The batch received was empty, your model won't be able to train on it. Double-check that your "
  3333. f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}."
  3334. )
  3335. if self.args.past_index >= 0 and self._past is not None:
  3336. inputs["mems"] = self._past
  3337. return inputs
  3338. def _is_attention_mask_causal(self, attention_mask):
  3339. """
  3340. Check if an attention mask is causal (compatible with causal attention).
  3341. Context parallelism only supports causal attention patterns. This function
  3342. checks if the provided attention mask is compatible.
  3343. Args:
  3344. attention_mask (torch.Tensor): The attention mask to check
  3345. Returns:
  3346. bool: True if the mask is causal or compatible with causal attention
  3347. """
  3348. if attention_mask is None:
  3349. return True # No mask is considered causal (model uses default causal masking)
  3350. # Handle different mask dimensions
  3351. if attention_mask.dim() == 2:
  3352. # (batch_size, seq_len) - standard padding mask, compatible with causal attention
  3353. return True
  3354. elif attention_mask.dim() in [3, 4]:
  3355. # (batch_size, seq_len, seq_len) or (batch_size, num_heads, seq_len, seq_len)
  3356. # Check if it's lower triangular (causal)
  3357. seq_len = attention_mask.shape[-1]
  3358. if seq_len <= 1:
  3359. return True # Single token or empty is always causal
  3360. # Take first batch and head (if 4D) for checking pattern
  3361. if attention_mask.dim() == 4:
  3362. mask = attention_mask[0, 0] # First batch, first head
  3363. else:
  3364. mask = attention_mask[0] # First batch
  3365. # Check if upper triangular part is masked (should be 0 or very negative for causal)
  3366. upper_triangular = torch.triu(mask, diagonal=1)
  3367. # For causal masks, upper triangular should be 0 or very negative (like -inf)
  3368. # Use a reasonable threshold to handle float precision issues
  3369. is_causal = torch.all(upper_triangular <= 1e-6) or torch.all(upper_triangular < -1e4)
  3370. return is_causal.item() if isinstance(is_causal, torch.Tensor) else is_causal
  3371. # For unknown dimensions, be conservative and reject
  3372. return False
  3373. def _prepare_context_parallel_inputs(self, model, inputs: dict[str, Union[torch.Tensor, Any]]):
  3374. """
  3375. Prepare inputs for context parallelism by setting up buffers and validation.
  3376. Args:
  3377. model: The model being trained
  3378. inputs: Input tensors to prepare
  3379. Returns:
  3380. tuple: (context_manager, prepared_inputs) where context_manager is either
  3381. the context parallelism wrapper or a no-op context
  3382. """
  3383. if (
  3384. getattr(self.accelerator, "parallelism_config", None) is not None
  3385. and self.accelerator.parallelism_config.cp_enabled
  3386. ):
  3387. if hasattr(model, "config"):
  3388. if model.config._attn_implementation != "sdpa":
  3389. raise ValueError(
  3390. f"Context parallelism is supported only with SDPA attention, you are using {model.config._attn_implementation}."
  3391. )
  3392. if "position_ids" not in inputs:
  3393. logger.warning_once("Position IDs not found in the inputs, generating manually")
  3394. inputs["position_ids"] = torch.arange(
  3395. inputs["input_ids"].size(1), device=inputs["input_ids"].device
  3396. ).expand(inputs["input_ids"].size(0), -1)
  3397. if "shift_labels" not in inputs:
  3398. logger.warning_once("Shift labels not found in the inputs, shifting manually")
  3399. if "labels" in inputs:
  3400. _ignore_index = -100
  3401. labels = nn.functional.pad(inputs["labels"], (0, 1), value=_ignore_index)
  3402. inputs["shift_labels"] = labels[:, 1:].contiguous()
  3403. buffers = []
  3404. buffer_seq_dims = []
  3405. if "input_ids" in inputs:
  3406. buffers.append(inputs["input_ids"])
  3407. buffer_seq_dims.append(1) # Sequence dimension
  3408. if "labels" in inputs:
  3409. buffers.append(inputs["labels"])
  3410. buffer_seq_dims.append(1)
  3411. if "shift_labels" in inputs:
  3412. buffers.append(inputs["shift_labels"])
  3413. buffer_seq_dims.append(1)
  3414. # Add attention_mask to buffers for context parallel splitting (only if causal)
  3415. if "attention_mask" in inputs:
  3416. # Only validate causal mask once for performance
  3417. if not getattr(self, "_attn_mask_causal_checked", False):
  3418. # Context parallel currently doesn't support other masks than causal
  3419. # Accelerate applies hooks to replace mask with is_causal arg in SDPA
  3420. # Check if the mask is really causal and if not throw an error
  3421. attention_mask = inputs["attention_mask"]
  3422. if not self._is_attention_mask_causal(attention_mask):
  3423. raise ValueError(
  3424. "Context parallelism only supports causal attention masks. "
  3425. "The provided attention_mask is not causal. "
  3426. "Please ensure your data uses causal masking (lower triangular) "
  3427. "or remove the attention_mask to use the model's default causal masking."
  3428. )
  3429. self._attn_mask_causal_checked = True
  3430. if self._attn_mask_causal_checked:
  3431. # Add to buffers only after validation (or if validation already passed)
  3432. attention_mask = inputs["attention_mask"]
  3433. if attention_mask.dim() == 2:
  3434. buffers.append(attention_mask)
  3435. buffer_seq_dims.append(1)
  3436. else:
  3437. # Other dimensionality; keep as-is without sharding to avoid incorrect splits
  3438. pass
  3439. # Include position_ids in context parallelism splitting
  3440. if "position_ids" in inputs and inputs["position_ids"] is not None:
  3441. buffers.append(inputs["position_ids"])
  3442. buffer_seq_dims.append(1)
  3443. return partial(
  3444. self.accelerator.maybe_context_parallel,
  3445. buffers=buffers,
  3446. buffer_seq_dims=buffer_seq_dims,
  3447. no_restore_buffers=set(buffers),
  3448. ), inputs
  3449. return contextlib.nullcontext, inputs
  def compute_loss_context_manager(self):
      """
      Bundle the context managers used around the loss computation into one `contextlib.ExitStack`.

      The autocast context is entered on the stack unless it is a `contextlib.nullcontext`.
      """
      stack = contextlib.ExitStack()
      amp_context = self.autocast_smart_context_manager()
      if isinstance(amp_context, contextlib.nullcontext):
          # Nothing to enter: return the empty stack.
          return stack
      stack.enter_context(amp_context)
      return stack
  def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
      """
      Build the context manager to use for `autocast`, feeding it the desired arguments.

      Returns a CPU `torch.autocast` configured with `self.amp_dtype` when `self.use_cpu_amp` is set, and a
      `contextlib.nullcontext()` otherwise.
      """
      if not self.use_cpu_amp:
          return contextlib.nullcontext()
      return torch.autocast(device_type="cpu", cache_enabled=cache_enabled, dtype=self.amp_dtype)
  def training_step(
      self,
      model: nn.Module,
      inputs: dict[str, Union[torch.Tensor, Any]],
      num_items_in_batch: Optional[torch.Tensor] = None,
  ) -> torch.Tensor:
      """
      Run one training step (forward and backward pass) on a batch of inputs.

      Subclass and override to inject custom behavior.

      Args:
          model (`nn.Module`):
              The model to train.
          inputs (`dict[str, Union[torch.Tensor, Any]]`):
              The inputs and targets of the model.

              The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
              argument `labels`. Check your model's documentation for all accepted arguments.
          num_items_in_batch (`torch.Tensor`, *optional*):
              Forwarded to `compute_loss`; also decides whether the loss is divided by the number of gradient
              accumulation steps here.

      Return:
          `torch.Tensor`: The tensor with training loss on this batch.
      """
      # Prepare buffers for context parallelism; the returned factory is a no-op when CP isn't enabled.
      cp_context, inputs = self._prepare_context_parallel_inputs(model, inputs)

      with cp_context():
          model.train()
          # Some optimizers expose their own `train()` switch.
          if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
              self.optimizer.train()

          inputs = self._prepare_inputs(inputs)

          if is_sagemaker_mp_enabled():
              # SageMaker model parallelism runs forward and backward itself.
              loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
              return loss_mb.reduce_mean().detach().to(self.args.device)

          with self.compute_loss_context_manager():
              loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)

          # Drop the reference to the batch before the (optional) cache flush and the backward pass.
          del inputs

          empty_cache_steps = self.args.torch_empty_cache_steps
          if empty_cache_steps is not None and self.state.global_step % empty_cache_steps == 0:
              # Flush the allocator cache of whichever accelerator backend is available.
              if is_torch_xpu_available():
                  torch.xpu.empty_cache()
              elif is_torch_mlu_available():
                  torch.mlu.empty_cache()
              elif is_torch_musa_available():
                  torch.musa.empty_cache()
              elif is_torch_npu_available():
                  torch.npu.empty_cache()
              elif is_torch_mps_available():
                  torch.mps.empty_cache()
              elif is_torch_hpu_available():
                  logger.warning(
                      "`torch_empty_cache_steps` is set but HPU device/backend does not support empty_cache()."
                  )
              else:
                  torch.cuda.empty_cache()

          backward_kwargs = {}

          # For LOMO optimizers you need to explicitly use the learning rate
          if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
              backward_kwargs["learning_rate"] = self._get_learning_rate()

          if self.args.n_gpu > 1:
              # mean() to average on multi-gpu parallel training
              loss = loss.mean()

          if self.use_apex:
              from apex import amp

              with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                  scaled_loss.backward()
              return loss.detach()

          # Normalize the loss for reporting when the gradient-accumulation scaling was not already
          # handled while computing the loss: i.e. the model does not accept loss kwargs (or no item count
          # was given) and no custom loss function is in use.
          loss_kwargs_unused = not self.model_accepts_loss_kwargs or num_items_in_batch is None
          if loss_kwargs_unused and self.compute_loss_func is None:
              loss = loss / self.current_gradient_accumulation_steps

          # Turning off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled
          # https://github.com/huggingface/transformers/pull/35808
          if self.accelerator.distributed_type == DistributedType.DEEPSPEED:
              backward_kwargs["scale_wrt_gas"] = False

          self.accelerator.backward(loss, **backward_kwargs)

          return loss.detach()
  def compute_loss(
      self,
      model: nn.Module,
      inputs: dict[str, Union[torch.Tensor, Any]],
      return_outputs: bool = False,
      num_items_in_batch: Optional[torch.Tensor] = None,
  ):
      """
      How the loss is computed by Trainer. By default, all models return the loss in the first element.

      Args:
          model (`nn.Module`):
              The model to compute the loss for.
          inputs (`dict[str, Union[torch.Tensor, Any]]`):
              The input data for the model. `labels` is popped from it when a label smoother or a custom loss
              function is configured.
          return_outputs (`bool`, *optional*, defaults to `False`):
              Whether to return the model outputs along with the loss.
          num_items_in_batch (Optional[torch.Tensor], *optional*):
              The number of items in the batch. When given, it is forwarded to the model (if
              `self.model_accepts_loss_kwargs` is set) and to `self.compute_loss_func` (if defined).

      Returns:
          The loss of the model along with its output if return_outputs was set to True

      Raises:
          ValueError: If the model returns a dict without a `"loss"` entry and no other loss source applies.

      Subclass and override for custom behavior. If you are not using `num_items_in_batch` when computing your loss,
      make sure to overwrite `self.model_accepts_loss_kwargs` to `False`. Otherwise, the loss calculating might be slightly inaccurate when performing gradient accumulation.
      """
      # Labels are only pulled out of the inputs when the Trainer itself computes the loss.
      trainer_computes_loss = self.label_smoother is not None or self.compute_loss_func is not None
      labels = inputs.pop("labels") if trainer_computes_loss and "labels" in inputs else None

      if self.model_accepts_loss_kwargs:
          extra_kwargs = {} if num_items_in_batch is None else {"num_items_in_batch": num_items_in_batch}
          inputs = {**inputs, **extra_kwargs}

      outputs = model(**inputs)

      # Save past state if it exists
      # TODO: this needs to be fixed and made cleaner later.
      if self.args.past_index >= 0:
          self._past = outputs[self.args.past_index]

      if self.compute_loss_func is not None:
          # User-defined compute_loss function
          if labels is None:
              logger.warning(
                  "Trainer: `compute_loss_func` is defined but `labels=None`. "
                  "Your custom loss function will still be called with labels=None. "
              )
          loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)
      elif labels is not None:
          # Default HF loss handling (label smoothing) if no custom loss function
          unwrapped_model = self.accelerator.unwrap_model(model)
          if _is_peft_model(unwrapped_model):
              model_name = unwrapped_model.base_model.model._get_name()
          else:
              model_name = unwrapped_model._get_name()

          # Causal LMs need their labels shifted inside the smoother.
          if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
              loss = self.label_smoother(outputs, labels, shift_labels=True)
          else:
              loss = self.label_smoother(outputs, labels)
      else:
          outputs_is_dict = isinstance(outputs, dict)
          if outputs_is_dict and "loss" not in outputs:
              raise ValueError(
                  "The model did not return a loss from the inputs, only the following keys: "
                  f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
              )
          # We don't use .loss here since the model may return tuples instead of ModelOutput.
          loss = outputs["loss"] if outputs_is_dict else outputs[0]

      rescale_for_token_average = (
          self.args.average_tokens_across_devices
          and (self.model_accepts_loss_kwargs or self.compute_loss_func)
          and num_items_in_batch is not None
      )
      if rescale_for_token_average:
          loss *= self.accelerator.num_processes if self.args.n_gpu <= 1 else self.args.n_gpu

      if return_outputs:
          return (loss, outputs)
      return loss
  def is_local_process_zero(self) -> bool:
      """
      Tell whether this process is the local main process, i.e. the main process on its own machine when
      training is distributed across several machines.
      """
      local_index = self.args.local_process_index
      return local_index == 0
  def is_world_process_zero(self) -> bool:
      """
      Tell whether this process is the global main process (when training in a distributed fashion on several
      machines, this is only going to be `True` for one process).
      """
      # Special case for SageMaker ModelParallel since there process_index is dp_process_index, not the global
      # process index.
      if is_sagemaker_mp_enabled():
          return smp.rank() == 0
      return self.args.process_index == 0
  def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False) -> None:
      """
      Will save the model, so you can reload it using `from_pretrained()`.

      Will only save from the main process.

      The saving strategy depends on the active backend; the first matching branch wins: Torch XLA, SageMaker
      model parallelism, N-D parallelism (`parallelism_config` set on the accelerator), tensor parallelism
      (`_tp_size > 1` on the model), FSDP, DeepSpeed, and finally the plain case.

      Args:
          output_dir (`str`, *optional*):
              Destination directory. Defaults to `self.args.output_dir`.
          _internal_call (`bool`, *optional*, defaults to `False`):
              When `True`, the push to the Hub at the end is skipped.
      """
      if output_dir is None:
          output_dir = self.args.output_dir

      if is_torch_xla_available():
          self._save_tpu(output_dir)
      elif is_sagemaker_mp_enabled():
          # Calling the state_dict needs to be done on the wrapped model and on all processes.
          os.makedirs(output_dir, exist_ok=True)
          state_dict = self.model_wrapped.state_dict()
          if self.args.should_save:
              self._save(output_dir, state_dict=state_dict)
          if IS_SAGEMAKER_MP_POST_1_10:
              # 'user_content.pt' indicates model state_dict saved with smp >= 1.10
              Path(os.path.join(output_dir, "user_content.pt")).touch()
      # We are in N-D parallelism if we have parallelism_config set, so we check accelerate if we're on a to_save rank
      elif getattr(self.accelerator, "parallelism_config", None) is not None:
          if self.accelerator.should_save_model:
              self._save(output_dir)
      # If we drop to here, we're in 1D parallelism, so all ranks need to go to `save_pretrained`
      elif (tp_size := getattr(self.model, "_tp_size", 0)) is not None and tp_size > 1:
          self._save(output_dir)
      elif self.is_fsdp_enabled:
          # Only the FULL_STATE_DICT mode is saved here; other state-dict types write nothing in this method.
          if "FULL_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type):
              # The state dict is requested on every process; only the saving process writes it.
              state_dict = self.accelerator.get_state_dict(self.model)
              if self.args.should_save:
                  self._save(output_dir, state_dict=state_dict)
      elif self.is_deepspeed_enabled:
          try:
              state_dict = self.accelerator.get_state_dict(self.deepspeed)
              if self.args.should_save:
                  self._save(output_dir, state_dict=state_dict)
          except ValueError:
              # `get_state_dict` raised: fall back to a full DeepSpeed checkpoint (see the warning below).
              logger.warning(
                  " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use"
                  " zero_to_fp32.py to recover weights"
              )
              if self.args.should_save:
                  # An empty state dict still lets `_save` write everything else it saves.
                  self._save(output_dir, state_dict={})
              # remove the dummy state_dict
              remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
              self.model_wrapped.save_checkpoint(output_dir)
      elif self.args.should_save:
          self._save(output_dir)

      # Push to the Hub when `save_model` is called by the user.
      if self.args.push_to_hub and not _internal_call:
          self.push_to_hub(commit_message="Model save", revision=self.args.hub_revision)
  def _save_tpu(self, output_dir: Optional[str] = None) -> None:
      """
      Save the model (and the processing class, if any) to `output_dir` through the `xm` (Torch XLA) helpers.

      Args:
          output_dir (`str`, *optional*):
              Destination directory. Defaults to `self.args.output_dir`.
      """
      output_dir = output_dir if output_dir is not None else self.args.output_dir

      logger.info(f"Saving model checkpoint to {output_dir}")
      model = self.model
      xm.mark_step()

      # Only the global master ordinal creates the directory and stores the training arguments.
      if xm.is_master_ordinal(local=False):
          os.makedirs(output_dir, exist_ok=True)
          torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

      # Save a trained model and configuration using `save_pretrained()`.
      # They can then be reloaded using `from_pretrained()`
      supported_classes = (PushToHubMixin,)
      xm.rendezvous("saving_checkpoint")
      if self.is_fsdp_xla_v1_enabled:
          ckpt = {
              "model": model.state_dict(),
              "shard_metadata": model.get_shard_metadata(),
          }
          ckpt_path = os.path.join(
              output_dir, f"rank{self.args.process_index}-of-{self.args.world_size}-{WEIGHTS_NAME}"
          )
          # All ranks save sharded checkpoint
          xm.save(ckpt, ckpt_path, master_only=False)
          # Make sure all ranks have saved checkpoints
          xm.rendezvous("save_full_checkpoints")
          # Master save full checkpoint
          if self.args.should_save:
              from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints

              # Merge the per-rank shard files written above into one state dict (nothing is written here).
              full_state_dict, _ = consolidate_sharded_model_checkpoints(
                  ckpt_prefix=os.path.join(output_dir, ""),
                  ckpt_suffix=f"rank*-of-*-{WEIGHTS_NAME}",
                  save_model=False,
              )
              # NOTE(review): presumably strips two wrapper layers added by the XLA FSDP setup - confirm.
              model = model.module.module
              unwrapped_model = self.accelerator.unwrap_model(model)
              if isinstance(unwrapped_model, supported_classes):
                  unwrapped_model.save_pretrained(
                      output_dir,
                      state_dict=full_state_dict,
                      save_function=xm.save,
                      safe_serialization=self.args.save_safetensors,
                  )
              else:
                  logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                  xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
      elif not isinstance(model, supported_classes):
          # The model itself is not a supported class: try its unwrapped version before falling back to
          # saving the raw state dict.
          if isinstance(self.accelerator.unwrap_model(model), supported_classes):
              self.accelerator.unwrap_model(model).save_pretrained(
                  output_dir,
                  is_main_process=self.args.should_save,
                  state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
                  save_function=xm.save,
                  safe_serialization=self.args.save_safetensors,
              )
          else:
              logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
              state_dict = xm._maybe_convert_to_cpu(model.state_dict())
              xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
      else:
          model.save_pretrained(
              output_dir,
              is_main_process=self.args.should_save,
              save_function=xm.save,
              safe_serialization=self.args.save_safetensors,
              state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
          )
      # The processing class is written by the saving process only.
      if self.processing_class is not None and self.args.should_save:
          self.processing_class.save_pretrained(output_dir)
  def _save(self, output_dir: Optional[str] = None, state_dict=None):
      """
      Write the model, the processing class (or the data collator's tokenizer) and the training arguments to
      `output_dir`.

      No process-rank check happens here; callers decide which process runs this method.

      Args:
          output_dir (`str`, *optional*):
              Destination directory. Defaults to `self.args.output_dir`.
          state_dict (*optional*):
              State dict to save. When `None` and the model is not a supported class, `self.model.state_dict()`
              is used.
      """
      target_dir = self.args.output_dir if output_dir is None else output_dir
      os.makedirs(target_dir, exist_ok=True)
      logger.info(f"Saving model checkpoint to {target_dir}")

      if is_peft_available():
          supported_classes = (PreTrainedModel, PeftModel)
      else:
          supported_classes = (PreTrainedModel,)

      # Save a trained model and configuration using `save_pretrained()`.
      # They can then be reloaded using `from_pretrained()`
      if isinstance(self.model, supported_classes):
          self.model.save_pretrained(
              target_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
          )
      else:
          if state_dict is None:
              state_dict = self.model.state_dict()

          unwrapped_model = self.accelerator.unwrap_model(self.model, keep_torch_compile=False)
          if isinstance(unwrapped_model, supported_classes):
              unwrapped_model.save_pretrained(
                  target_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
              )
          else:
              logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
              if self.args.save_safetensors:
                  safetensors.torch.save_file(
                      state_dict, os.path.join(target_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}
                  )
              else:
                  torch.save(state_dict, os.path.join(target_dir, WEIGHTS_NAME))

      if self.processing_class is not None:
          self.processing_class.save_pretrained(target_dir)
      elif getattr(self.data_collator, "tokenizer", None) is not None:
          # No processing class: fall back to the tokenizer carried by the data collator, if there is one.
          logger.info("Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`")
          self.data_collator.tokenizer.save_pretrained(target_dir)

      # Good practice: save your training arguments together with the trained model
      torch.save(self.args, os.path.join(target_dir, TRAINING_ARGS_NAME))
  def store_flos(self):
      """
      Add the floating-point operations counted since the last call to `self.state.total_flos` and reset the
      running counter `self.current_flos`.

      In distributed mode the added value is the sum of the counters returned by
      `distributed_broadcast_scalars`.
      """
      pending_flos = self.current_flos
      if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
          pending_flos = distributed_broadcast_scalars([pending_flos], device=self.args.device).sum().item()
      self.state.total_flos += pending_flos
      self.current_flos = 0
  def _sorted_checkpoints(
      self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False
  ) -> list[str]:
      """
      List the checkpoint directories found in `output_dir`, ordered from oldest to newest.

      Args:
          output_dir (`str`, *optional*):
              Directory to scan. Defaults to `self.args.output_dir`.
          checkpoint_prefix (`str`, *optional*):
              Directory-name prefix; entries matching `{checkpoint_prefix}-*` are considered.
          use_mtime (`bool`, *optional*, defaults to `False`):
              Order by modification time instead of by the number following the prefix.

      Returns:
          `list[str]`: The checkpoint paths. When `self.state.best_model_checkpoint` is among them, it is moved
          towards the end (second-to-last slot when enough checkpoints exist) so that rotation does not
          delete it.
      """
      # Same fallback as the saving methods: without it `Path(None)` raises a TypeError.
      if output_dir is None:
          output_dir = self.args.output_dir

      ordering_and_checkpoint_path = []

      glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]

      # The prefix is literal text, not a pattern: escape it before embedding it in the regex.
      step_pattern = re.compile(f".*{re.escape(checkpoint_prefix)}-([0-9]+)")

      for path in glob_checkpoints:
          if use_mtime:
              ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
          else:
              regex_match = step_pattern.match(path)
              if regex_match is not None:
                  ordering_and_checkpoint_path.append((int(regex_match.group(1)), path))

      checkpoints_sorted = sorted(ordering_and_checkpoint_path)

      # mtime is not reliable on all filesystems, especially on some fuse fs in cloud environments
      # so we check if the mtime is fake and fallback to numerical ordering if needed
      if use_mtime and len(ordering_and_checkpoint_path) > 1:
          mtime_diff = checkpoints_sorted[-1][0] - checkpoints_sorted[0][0]
          if mtime_diff < 1.0:  # less than 1 second, which is almost impossible when mtime works fine
              warnings.warn("mtime may not be reliable on this filesystem, falling back to numerical ordering")
              return self._sorted_checkpoints(
                  use_mtime=False, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
              )
      checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]

      # Make sure we don't delete the best model.
      if self.state.best_model_checkpoint is not None:
          # Globbed paths went through `Path`, so normalize the stored path the same way before comparing.
          best_model_checkpoint = str(Path(self.state.best_model_checkpoint))
          if best_model_checkpoint in checkpoints_sorted:
              best_model_index = checkpoints_sorted.index(best_model_checkpoint)
              # Bubble the best checkpoint up to the second-to-last position.
              for i in range(best_model_index, len(checkpoints_sorted) - 2):
                  checkpoints_sorted[i], checkpoints_sorted[i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i]
      return checkpoints_sorted
  3835. def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
  3836. if self.args.save_total_limit is None or self.args.save_total_limit <= 0:
  3837. return
  3838. # Check if we should delete older checkpoint(s)
  3839. checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir)
  3840. if len(checkpoints_sorted) <= self.args.save_total_limit:
  3841. return
  3842. # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which
  3843. # we don't do to allow resuming.
  3844. save_total_limit = self.args.save_total_limit
  3845. if (
  3846. self.state.best_model_checkpoint is not None
  3847. and self.args.save_total_limit == 1
  3848. and checkpoints_sorted[-1] != self.state.best_model_checkpoint
  3849. ):
  3850. save_total_limit = 2
  3851. number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
  3852. checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
  3853. for checkpoint in checkpoints_to_be_deleted:
  3854. logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
  3855. shutil.rmtree(checkpoint, ignore_errors=True)
  def evaluate(
      self,
      eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
      ignore_keys: Optional[list[str]] = None,
      metric_key_prefix: str = "eval",
  ) -> dict[str, float]:
      """
      Run evaluation and returns metrics.

      The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
      (pass it to the init `compute_metrics` argument).

      You can also subclass and override this method to inject custom behavior.

      Args:
          eval_dataset (Union[`Dataset`, dict[str, `Dataset`]], *optional*):
              Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
              not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will
              evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the
              `__len__` method.

              <Tip>

              If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run
              separate evaluations on each dataset. This can be useful to monitor how training affects other
              datasets or simply to get a more fine-grained evaluation.
              When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one
              of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets
              `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the
              loss on `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`.

              </Tip>

          ignore_keys (`list[str]`, *optional*):
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions.
          metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
              An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
              "eval_bleu" if the prefix is "eval" (default)

      Returns:
          A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
          dictionary also contains the epoch number which comes from the training state.
      """
      # handle multiple eval datasets
      override = eval_dataset is not None
      if not override:
          eval_dataset = self.eval_dataset

      if isinstance(eval_dataset, dict):
          metrics = {}
          for dataset_name, dataset in eval_dataset.items():
              # Without an override, only the name is passed down to the nested call.
              metrics.update(
                  self.evaluate(
                      eval_dataset=dataset if override else dataset_name,
                      ignore_keys=ignore_keys,
                      metric_key_prefix=f"{metric_key_prefix}_{dataset_name}",
                  )
              )
          return metrics

      # memory metrics - must set up as early as possible
      self._memory_tracker.start()

      eval_dataloader = self.get_eval_dataloader(eval_dataset)
      if self.is_fsdp_xla_v2_enabled:
          eval_dataloader = tpu_spmd_dataloader(eval_dataloader)

      start_time = time.time()

      if self.args.use_legacy_prediction_loop:
          eval_loop = self.prediction_loop
      else:
          eval_loop = self.evaluation_loop

      # No point gathering the predictions if there are no metrics, otherwise we defer to
      # self.args.prediction_loss_only
      loss_only = True if self.compute_metrics is None else None
      output = eval_loop(
          eval_dataloader,
          description="Evaluation",
          prediction_loss_only=loss_only,
          ignore_keys=ignore_keys,
          metric_key_prefix=metric_key_prefix,
      )

      total_batch_size = self.args.eval_batch_size * self.args.world_size

      # Push the start time forward by the reported one-off overheads so they don't count in the speed metrics.
      for overhead in ("jit_compilation_time", "model_preparation_time"):
          overhead_key = f"{metric_key_prefix}_{overhead}"
          if overhead_key in output.metrics:
              start_time += output.metrics[overhead_key]

      output.metrics.update(
          speed_metrics(
              metric_key_prefix,
              start_time,
              num_samples=output.num_samples,
              num_steps=math.ceil(output.num_samples / total_batch_size),
          )
      )

      self.log(output.metrics)

      if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
          # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
          xm.master_print(met.metrics_report())

      self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)

      self._memory_tracker.stop_and_update_metrics(output.metrics)

      return output.metrics
  def predict(
      self, test_dataset: Dataset, ignore_keys: Optional[list[str]] = None, metric_key_prefix: str = "test"
  ) -> PredictionOutput:
      """
      Run prediction and returns predictions and potential metrics.

      Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
      will also return metrics, like in `evaluate()`.

      Args:
          test_dataset (`Dataset`):
              Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the
              `model.forward()` method are automatically removed. Has to implement the method `__len__`
          ignore_keys (`list[str]`, *optional*):
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions.
          metric_key_prefix (`str`, *optional*, defaults to `"test"`):
              An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
              "test_bleu" if the prefix is "test" (default)

      <Tip>

      If your predictions or labels have different sequence length (for instance because you're doing dynamic padding
      in a token classification task) the predictions will be padded (on the right) to allow for concatenation into
      one array. The padding index is -100.

      </Tip>

      Returns: *NamedTuple* A namedtuple with the following keys:

          - predictions (`np.ndarray`): The predictions on `test_dataset`.
          - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
          - metrics (`dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
            labels).
      """
      # memory metrics - must set up as early as possible
      self._memory_tracker.start()

      test_dataloader = self.get_test_dataloader(test_dataset)
      start_time = time.time()

      if self.args.use_legacy_prediction_loop:
          eval_loop = self.prediction_loop
      else:
          eval_loop = self.evaluation_loop

      output = eval_loop(
          test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
      )

      total_batch_size = self.args.eval_batch_size * self.args.world_size

      # Push the start time forward by the reported one-off overheads so they don't count in the speed metrics.
      for overhead in ("jit_compilation_time", "model_preparation_time"):
          overhead_key = f"{metric_key_prefix}_{overhead}"
          if overhead_key in output.metrics:
              start_time += output.metrics[overhead_key]

      output.metrics.update(
          speed_metrics(
              metric_key_prefix,
              start_time,
              num_samples=output.num_samples,
              num_steps=math.ceil(output.num_samples / total_batch_size),
          )
      )

      self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics)
      self._memory_tracker.stop_and_update_metrics(output.metrics)

      return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
  def evaluation_loop(
      self,
      dataloader: DataLoader,
      description: str,
      prediction_loss_only: Optional[bool] = None,
      ignore_keys: Optional[list[str]] = None,
      metric_key_prefix: str = "eval",
  ) -> EvalLoopOutput:
      """
      Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.

      Works both with or without labels.

      Args:
          dataloader (`DataLoader`):
              The dataloader to iterate over.
          description (`str`):
              Name of the run, used in the log banner. When it equals `"Prediction"`, gathered predictions,
              labels and inputs are kept even if `args.batch_eval_metrics` is set.
          prediction_loss_only (`bool`, *optional*):
              Forwarded to `prediction_step`. Falls back to `args.prediction_loss_only` when `None`.
          ignore_keys (`list[str]`, *optional*):
              Forwarded to `prediction_step`.
          metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
              Prefix applied to every key of the returned metrics dictionary.

      Returns:
          `EvalLoopOutput`: The accumulated predictions and label ids, the metrics dictionary and the number of
          samples.
      """
      args = self.args

      prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only

      # if eval is called w/o train, handle model prep here
      if self.is_deepspeed_enabled and self.deepspeed is None:
          _, _ = deepspeed_init(self, num_training_steps=0, inference=True)

      model = self._wrap_model(self.model, training=False, dataloader=dataloader)

      if len(self.accelerator._models) == 0 and model is self.model:
          start_time = time.time()
          model = (
              self.accelerator.prepare(model)
              if self.is_deepspeed_enabled
              or (self.is_fsdp_enabled and self.accelerator.mixed_precision != "fp8" and not self.args.torch_compile)
              else self.accelerator.prepare_model(model, evaluation_mode=True)
          )
          # Recorded so that it can be reported in the metrics and excluded from speed measurements.
          self.model_preparation_time = round(time.time() - start_time, 4)

          if self.is_fsdp_enabled:
              self.model = model

          # for the rest of this function `model` is the outside model, whether it was wrapped or not
          if model is not self.model:
              self.model_wrapped = model

          # backward compatibility
          if self.is_deepspeed_enabled:
              self.deepspeed = self.model_wrapped

      # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
      # while ``train`` is running, cast it to the right dtype first and then put on device
      if not self.is_in_train:
          if args.fp16_full_eval:
              model = model.to(dtype=torch.float16, device=args.device)
          elif args.bf16_full_eval:
              model = model.to(dtype=torch.bfloat16, device=args.device)

      batch_size = self.args.eval_batch_size

      logger.info(f"\n***** Running {description} *****")
      if has_length(dataloader):
          logger.info(f" Num examples = {self.num_examples(dataloader)}")
      else:
          logger.info(" Num examples: Unknown")
      logger.info(f" Batch size = {batch_size}")

      if hasattr(model, "eval") and callable(model.eval):
          model.eval()
      if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval):
          self.optimizer.eval()

      self.callback_handler.eval_dataloader = dataloader
      # Do this before wrapping.
      eval_dataset = getattr(dataloader, "dataset", None)

      if args.past_index >= 0:
          self._past = None

      # Initialize containers
      all_losses = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
      all_preds = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
      all_labels = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)
      all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100)

      metrics = None
      eval_set_kwargs = {}

      # Will be useful when we have an iterable dataset so don't know its length.
      observed_num_examples = 0

      # Main evaluation loop
      for step, inputs in enumerate(dataloader):
          # Update the observed num examples
          observed_batch_size = find_batch_size(inputs)
          if observed_batch_size is not None:
              observed_num_examples += observed_batch_size
              # For batch samplers, batch_size is not known by the dataloader in advance.
              if batch_size is None:
                  batch_size = observed_batch_size

          # Prediction step
          losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
          main_input_name = getattr(self.model, "main_input_name", "input_ids")
          inputs_decode = (
              self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None
          )

          if is_torch_xla_available():
              xm.mark_step()

          # Update containers
          if losses is not None:
              losses = self.gather_function(losses.repeat(batch_size))
              all_losses.add(losses)
          if inputs_decode is not None:
              inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100)
              inputs_decode = self.gather_function(inputs_decode)
              if not self.args.batch_eval_metrics or description == "Prediction":
                  all_inputs.add(inputs_decode)
          if labels is not None:
              # Pad labels here, preparing for preprocess_logits_for_metrics in next logits block.
              labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
          if logits is not None:
              logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100)
              if self.preprocess_logits_for_metrics is not None:
                  logits = self.preprocess_logits_for_metrics(logits, labels)
              logits = self.gather_function(logits)
              if not self.args.batch_eval_metrics or description == "Prediction":
                  all_preds.add(logits)
          if labels is not None:
              labels = self.gather_function(labels)
              if not self.args.batch_eval_metrics or description == "Prediction":
                  all_labels.add(labels)

          self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)

          if self.args.batch_eval_metrics:
              if self.compute_metrics is not None and logits is not None and labels is not None:
                  is_last_step = self.accelerator.gradient_state.end_of_dataloader
                  batch_kwargs = {}
                  batch_kwargs["losses"] = losses if "loss" in args.include_for_metrics else None
                  batch_kwargs["inputs"] = inputs if "inputs" in args.include_for_metrics else None
                  metrics = self.compute_metrics(
                      EvalPrediction(predictions=logits, label_ids=labels, **batch_kwargs),
                      compute_result=is_last_step,
                  )

              del losses, logits, labels, inputs
              torch.cuda.empty_cache()

          # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
          elif args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
              all_losses.to_cpu_and_numpy()
              all_preds.to_cpu_and_numpy()
              all_labels.to_cpu_and_numpy()
              all_inputs.to_cpu_and_numpy()

              del losses, logits, labels, inputs
              torch.cuda.empty_cache()

      # After all calls to `.gather_function`, reset to `gather_for_metrics`:
      self.gather_function = self.accelerator.gather_for_metrics
      # Mirror the `past_index >= 0` check used when `_past` was initialised above. A plain truthiness test
      # would skip the cleanup for `past_index == 0` and leave stale state on the trainer.
      if args.past_index >= 0 and hasattr(self, "_past"):
          # Clean the state at the end of the evaluation loop
          delattr(self, "_past")

      # Gather all remaining tensors and put them back on the CPU
      all_losses = all_losses.get_arrays()
      all_preds = all_preds.get_arrays()
      all_labels = all_labels.get_arrays()
      all_inputs = all_inputs.get_arrays()

      # Number of samples
      if has_length(eval_dataset):
          num_samples = len(eval_dataset)
      # The instance check is weird and does not actually check for the type, but whether the dataset has the right
      # methods. Therefore we need to make sure it also has the attribute.
      elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0:
          num_samples = eval_dataset.num_examples
      else:
          if has_length(dataloader):
              num_samples = self.num_examples(dataloader)
          else:  # both len(dataloader.dataset) and len(dataloader) fail
              num_samples = observed_num_examples
      if num_samples == 0 and observed_num_examples > 0:
          num_samples = observed_num_examples

      # Metrics!
      if (
          self.compute_metrics is not None
          and all_preds is not None
          and all_labels is not None
          and not self.args.batch_eval_metrics
      ):
          eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None
          eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None
          metrics = self.compute_metrics(
              EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs)
          )
      elif metrics is None:
          metrics = {}

      # To be JSON-serializable, we need to remove numpy types or zero-d tensors
      metrics = denumpify_detensorize(metrics)

      if isinstance(all_losses, list) and all_losses:
          metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item()
      elif isinstance(all_losses, np.ndarray):
          metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
      if hasattr(self, "jit_compilation_time"):
          metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time
      if hasattr(self, "model_preparation_time"):
          metrics[f"{metric_key_prefix}_model_preparation_time"] = self.model_preparation_time

      # Prefix all keys with metric_key_prefix + '_'
      for key in list(metrics.keys()):
          if not key.startswith(f"{metric_key_prefix}_"):
              metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

      return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
  4174. def _nested_gather(self, tensors, name=None):
  4175. """
  4176. Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before
  4177. concatenating them to `gathered`
  4178. """
  4179. if tensors is None:
  4180. return
  4181. if is_torch_xla_available():
  4182. if name is None:
  4183. name = "nested_gather"
  4184. tensors = nested_xla_mesh_reduce(tensors, name)
  4185. elif is_sagemaker_mp_enabled():
  4186. tensors = smp_gather(tensors)
  4187. elif (self.args.distributed_state is not None and self.args.distributed_state.distributed_type != "NO") or (
  4188. self.args.distributed_state is None and self.args.local_rank != -1
  4189. ):
  4190. tensors = distributed_concat(tensors)
  4191. return tensors
  def prediction_step(
      self,
      model: nn.Module,
      inputs: dict[str, Union[torch.Tensor, Any]],
      prediction_loss_only: bool,
      ignore_keys: Optional[list[str]] = None,
  ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
      """
      Run a single evaluation step of `model` on `inputs` without tracking gradients.

      Subclass and override to inject custom behavior.

      Args:
          model (`nn.Module`):
              The model to evaluate.
          inputs (`dict[str, Union[torch.Tensor, Any]]`):
              The inputs and targets of the model.

              The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
              argument `labels`. Check your model's documentation for all accepted arguments.
          prediction_loss_only (`bool`):
              Whether or not to return the loss only.
          ignore_keys (`list[str]`, *optional*):
              A list of keys in the output of your model (if it is a dictionary) that should be ignored when
              gathering predictions.

      Return:
          tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
          logits and labels (each being optional).
      """
      no_label_names = len(self.label_names) == 0
      if no_label_names:
          has_labels = False
      else:
          has_labels = all(inputs.get(k) is not None for k in self.label_names)

      # For CLIP-like models capable of returning loss values.
      # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss`
      # is `True` in `model.forward`.
      return_loss = inputs.get("return_loss")
      if return_loss is None:
          return_loss = self.can_return_loss
      loss_without_labels = no_label_names and return_loss
      # A loss is computed either from real labels or because the model can produce one on its own.
      wants_loss = has_labels or loss_without_labels

      inputs = self._prepare_inputs(inputs)
      if ignore_keys is None:
          if hasattr(self.model, "config"):
              ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", ["past_key_values"])
          else:
              ignore_keys = []

      # labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
      labels = None
      if wants_loss:
          labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
          if len(labels) == 1:
              labels = labels[0]

      with torch.no_grad():
          if is_sagemaker_mp_enabled():
              raw_outputs = smp_forward_only(model, inputs)
              if wants_loss:
                  if isinstance(raw_outputs, dict):
                      loss_mb = raw_outputs["loss"]
                      logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"])
                  else:
                      loss_mb = raw_outputs[0]
                      logits_mb = raw_outputs[1:]

                  loss = loss_mb.reduce_mean().detach().cpu()
              else:
                  loss = None
                  if isinstance(raw_outputs, dict):
                      logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys)
                  else:
                      logits_mb = raw_outputs
              logits = smp_nested_concat(logits_mb)
          elif wants_loss:
              with self.compute_loss_context_manager():
                  num_items_in_batch = self._get_num_items_in_batch([inputs], self.args.device)
                  loss, outputs = self.compute_loss(
                      model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
                  )
              loss = loss.detach().mean()

              if isinstance(outputs, dict):
                  logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
              else:
                  logits = outputs[1:]
          else:
              loss = None
              with self.compute_loss_context_manager():
                  outputs = model(**inputs)
              if isinstance(outputs, dict):
                  logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
              else:
                  logits = outputs
              # TODO: this needs to be fixed and made cleaner later.
              if self.args.past_index >= 0:
                  self._past = outputs[self.args.past_index - 1]

      if prediction_loss_only:
          return (loss, None, None)

      logits = nested_detach(logits)
      if len(logits) == 1:
          logits = logits[0]

      return (loss, logits, labels)
  def floating_point_ops(self, inputs: dict[str, Union[torch.Tensor, Any]]):
      """
      Return the number of floating-point operations reported by the model for `inputs`.

      For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point
      operations for every backward + forward pass. If using another model, either implement such a method in the
      model or subclass and override this method.

      Args:
          inputs (`dict[str, Union[torch.Tensor, Any]]`):
              The inputs and targets of the model.

      Returns:
          `int`: The number of floating-point operations, or `0` when the model has no `floating_point_ops`
          attribute.
      """
      if not hasattr(self.model, "floating_point_ops"):
          return 0
      return self.model.floating_point_ops(inputs)
  def init_hf_repo(self, token: Optional[str] = None):
      """
      Create the Hub repository for this run (or reuse it if it already exists) and record its id.

      The repository is named after `self.args.hub_model_id`, or after the last component of the absolute
      `self.args.output_dir` when no model id is configured. Does nothing on processes other than world
      process zero.
      """
      # Only on process zero
      if not self.is_world_process_zero():
          return

      configured_id = self.args.hub_model_id
      if configured_id is not None:
          repo_name = configured_id
      else:
          repo_name = Path(self.args.output_dir).absolute().name

      if token is None:
          token = self.args.hub_token

      created_repo = create_repo(repo_name, token=token, private=self.args.hub_private_repo, exist_ok=True)
      self.hub_model_id = created_repo.repo_id
      self.push_in_progress = None
  def create_model_card(
      self,
      language: Optional[str] = None,
      license: Optional[str] = None,
      tags: Union[str, list[str], None] = None,
      model_name: Optional[str] = None,
      finetuned_from: Optional[str] = None,
      tasks: Union[str, list[str], None] = None,
      dataset_tags: Union[str, list[str], None] = None,
      dataset: Union[str, list[str], None] = None,
      dataset_args: Union[str, list[str], None] = None,
  ):
      """
      Creates a draft of a model card using the information available to the `Trainer`.

      The card is written to `README.md` inside `self.args.output_dir`. If a card already exists there and `tags`
      is given, the existing card's tags are merged into `tags` (the caller's list is left untouched). Does
      nothing on processes other than world process zero.

      Args:
          language (`str`, *optional*):
              The language of the model (if applicable)
          license (`str`, *optional*):
              The license of the model. Will default to the license of the pretrained model used, if the original
              model given to the `Trainer` comes from a repo on the Hub.
          tags (`str` or `list[str]`, *optional*):
              Some tags to be included in the metadata of the model card.
          model_name (`str`, *optional*):
              The name of the model.
          finetuned_from (`str`, *optional*):
              The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
              of the original model given to the `Trainer` (if it comes from the Hub).
          tasks (`str` or `list[str]`, *optional*):
              One or several task identifiers, to be included in the metadata of the model card.
          dataset_tags (`str` or `list[str]`, *optional*):
              One or several dataset tags, to be included in the metadata of the model card.
          dataset (`str` or `list[str]`, *optional*):
              One or several dataset identifiers, to be included in the metadata of the model card.
          dataset_args (`str` or `list[str]`, *optional*):
              One or several dataset arguments, to be included in the metadata of the model card.
      """
      if not self.is_world_process_zero():
          return

      model_card_filepath = os.path.join(self.args.output_dir, "README.md")
      is_peft_library = False
      if os.path.exists(model_card_filepath):
          # Parse the existing card a single time and reuse it for both lookups below.
          existing_card_data = ModelCard.load(model_card_filepath).data
          is_peft_library = existing_card_data.get("library_name") == "peft"

          # Append existing tags in `tags`
          existing_tags = existing_card_data.tags
          if tags is not None and existing_tags is not None:
              # Work on a copy so the list object owned by the caller is never modified.
              tags = [tags] if isinstance(tags, str) else list(tags)
              for tag in existing_tags:
                  if tag not in tags:
                      tags.append(tag)

      training_summary = TrainingSummary.from_trainer(
          self,
          language=language,
          license=license,
          tags=tags,
          model_name=model_name,
          finetuned_from=finetuned_from,
          tasks=tasks,
          dataset_tags=dataset_tags,
          dataset=dataset,
          dataset_args=dataset_args,
      )
      model_card = training_summary.to_model_card()
      # Write UTF-8 explicitly so that non-ASCII card content does not depend on the platform's default encoding.
      with open(model_card_filepath, "w", encoding="utf-8") as f:
          f.write(model_card)

      if is_peft_library:
          self.accelerator.unwrap_model(self.model).create_or_update_model_card(self.args.output_dir)
  4385. def _push_from_checkpoint(self, checkpoint_folder):
  4386. # Only push from one node.
  4387. if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
  4388. return
  4389. # If we haven't finished the last push, we don't do this one unless args.hub_always_push=True.
  4390. if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
  4391. return
  4392. output_dir = self.args.output_dir
  4393. # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder
  4394. modeling_files = [CONFIG_NAME, GENERATION_CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
  4395. # Add sharded checkpoints if we have an index
  4396. for index_file in [WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME]:
  4397. index_path = os.path.join(checkpoint_folder, index_file)
  4398. if os.path.isfile(index_path):
  4399. modeling_files.append(index_file)
  4400. with open(index_path) as f:
  4401. index = json.loads(f.read())
  4402. shard_files = list(set(index["weight_map"].values()))
  4403. modeling_files.extend(shard_files)
  4404. if is_peft_available():
  4405. modeling_files.extend([ADAPTER_CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME])
  4406. for modeling_file in modeling_files:
  4407. if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)):
  4408. shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file))
  4409. # Saving the processing class is fast and we don't know how many files it may have spawned, so we resave it to be sure.
  4410. if self.processing_class is not None:
  4411. self.processing_class.save_pretrained(output_dir)
  4412. # Same for the training arguments
  4413. torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
  4414. if self.args.save_strategy == SaveStrategy.STEPS:
  4415. commit_message = f"Training in progress, step {self.state.global_step}"
  4416. else:
  4417. commit_message = f"Training in progress, epoch {int(self.state.epoch)}"
  4418. model_push_job = upload_folder(
  4419. repo_id=self.hub_model_id,
  4420. folder_path=output_dir,
  4421. commit_message=commit_message,
  4422. token=self.args.hub_token,
  4423. run_as_future=True,
  4424. ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
  4425. revision=self.args.hub_revision,
  4426. )
  4427. push_jobs = [model_push_job]
  4428. if self.args.hub_strategy in [HubStrategy.CHECKPOINT, HubStrategy.ALL_CHECKPOINTS]:
  4429. path_in_repo = (
  4430. "last-checkpoint" if self.args.hub_strategy == HubStrategy.CHECKPOINT else Path(checkpoint_folder).name
  4431. )
  4432. checkpoint_push = upload_folder(
  4433. repo_id=self.hub_model_id,
  4434. folder_path=checkpoint_folder,
  4435. path_in_repo=path_in_repo,
  4436. commit_message=commit_message + ", checkpoint",
  4437. token=self.args.hub_token,
  4438. run_as_future=True,
  4439. revision=self.args.hub_revision,
  4440. )
  4441. push_jobs.append(checkpoint_push)
  4442. if self.push_in_progress is None or self.push_in_progress.is_done():
  4443. self.push_in_progress = PushInProgress(push_jobs)
  4444. else:
  4445. self.push_in_progress.jobs.extend(push_jobs)
  4446. def _finish_current_push(self):
  4447. if not hasattr(self, "push_in_progress"):
  4448. return
  4449. if self.push_in_progress is not None and not self.push_in_progress.is_done():
  4450. logger.info("Waiting for the current checkpoint push to be finished, this might take a couple of minutes.")
  4451. self.push_in_progress.wait_until_done()
  def push_to_hub(
      self,
      commit_message: Optional[str] = "End of training",
      blocking: bool = True,
      token: Optional[str] = None,
      revision: Optional[str] = None,
      **kwargs,
  ) -> str:
      """
      Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`.

      Parameters:
          commit_message (`str`, *optional*, defaults to `"End of training"`):
              Message to commit while pushing.
          blocking (`bool`, *optional*, defaults to `True`):
              Whether the function should return only when the upload has finished.
          token (`str`, *optional*, defaults to `None`):
              Token with write permission to overwrite Trainer's original args.
          revision (`str`, *optional*):
              The git revision to commit from. Defaults to `self.args.hub_revision`.
          kwargs (`dict[str, Any]`, *optional*):
              Additional keyword arguments passed along to [`~Trainer.create_model_card`]. A `tags` entry is
              merged with the model's own `model_tags` without modifying the object that was passed in.

      Returns:
          The result of `upload_folder` for the finished commit if `blocking=True`, or a `Future` object tracking
          the progress of the commit if `blocking=False`. Returns `None` on processes other than world process
          zero.
      """
      model_name = kwargs.pop("model_name", None)
      if model_name is None and self.args.should_save:
          if self.args.hub_model_id is None:
              model_name = Path(self.args.output_dir).name
          else:
              model_name = self.args.hub_model_id.split("/")[-1]
      token = token if token is not None else self.args.hub_token

      # In case the user calls this method with args.push_to_hub = False
      if self.hub_model_id is None:
          self.init_hf_repo(token=token)

      # Needs to be executed on all processes for TPU training, but will only save on the processed determined by
      # self.args.should_save.
      self.save_model(_internal_call=True)

      # Only push from one node.
      if not self.is_world_process_zero():
          return

      # Add additional tags in the case the model has already some tags and users pass
      # "tags" argument to `push_to_hub` so that trainer automatically handles internal tags
      # from all models since Trainer does not call `model.push_to_hub`.
      model_tags = getattr(self.model, "model_tags", None)
      if model_tags is not None:
          user_tags = kwargs.get("tags")
          if user_tags is None:
              # Covers both a missing `tags` entry and an explicit `tags=None`.
              merged_tags = []
          elif isinstance(user_tags, str):
              # If it is a string, convert it to a list
              merged_tags = [user_tags]
          else:
              # Copy so that the list owned by the caller is never mutated.
              merged_tags = list(user_tags)
          for model_tag in model_tags:
              if model_tag not in merged_tags:
                  merged_tags.append(model_tag)
          kwargs["tags"] = merged_tags

      self.create_model_card(model_name=model_name, **kwargs)

      if revision is None:
          revision = self.args.hub_revision

      # Wait for the current upload to be finished.
      self._finish_current_push()
      return upload_folder(
          repo_id=self.hub_model_id,
          folder_path=self.args.output_dir,
          commit_message=commit_message,
          token=token,
          # `run_as_future` is the inverse of `blocking`: a future is returned only for non-blocking pushes.
          run_as_future=not blocking,
          ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
          revision=revision,
      )
  4519. #
  4520. # Deprecated code
  4521. #
  4522. def prediction_loop(
  4523. self,
  4524. dataloader: DataLoader,
  4525. description: str,
  4526. prediction_loss_only: Optional[bool] = None,
  4527. ignore_keys: Optional[list[str]] = None,
  4528. metric_key_prefix: str = "eval",
  4529. ) -> EvalLoopOutput:
  4530. """
  4531. Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
  4532. Works both with or without labels.
  4533. """
  4534. args = self.args
  4535. if not has_length(dataloader):
  4536. raise ValueError("dataloader must implement a working __len__")
  4537. prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only
  4538. # if eval is called w/o train, handle model prep here
  4539. if self.is_deepspeed_enabled and self.deepspeed is None:
  4540. _, _ = deepspeed_init(self, num_training_steps=0, inference=True)
  4541. model = self._wrap_model(self.model, training=False, dataloader=dataloader)
  4542. if len(self.accelerator._models) == 0 and model is self.model:
  4543. model = (
  4544. self.accelerator.prepare(model)
  4545. if self.is_deepspeed_enabled or self.is_fsdp_enabled
  4546. else self.accelerator.prepare_model(model, evaluation_mode=True)
  4547. )
  4548. if self.is_fsdp_enabled:
  4549. self.model = model
  4550. # for the rest of this function `model` is the outside model, whether it was wrapped or not
  4551. if model is not self.model:
  4552. self.model_wrapped = model
  4553. # backward compatibility
  4554. if self.is_deepspeed_enabled:
  4555. self.deepspeed = self.model_wrapped
  4556. # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called
  4557. # while ``train`` is running, cast it to the right dtype first and then put on device
  4558. if not self.is_in_train:
  4559. if args.fp16_full_eval:
  4560. model = model.to(dtype=torch.float16, device=args.device)
  4561. elif args.bf16_full_eval:
  4562. model = model.to(dtype=torch.bfloat16, device=args.device)
  4563. batch_size = (
  4564. dataloader.total_batch_size
  4565. if getattr(dataloader, "_is_accelerate_prepared", False)
  4566. else dataloader.batch_size
  4567. )
  4568. if batch_size is None:
  4569. raise ValueError(
  4570. "Batch size cannot be None. Ensure the dataloader has a valid batch_size or total_batch_size."
  4571. )
  4572. num_examples = self.num_examples(dataloader)
  4573. logger.info(f"\n***** Running {description} *****")
  4574. logger.info(f" Num examples = {num_examples}")
  4575. logger.info(f" Batch size = {batch_size}")
  4576. losses_host: Optional[torch.Tensor] = None
  4577. preds_host: Union[torch.Tensor, list[torch.Tensor], None] = None
  4578. labels_host: Union[torch.Tensor, list[torch.Tensor], None] = None
  4579. inputs_host: Union[torch.Tensor, list[torch.Tensor], None] = None
  4580. metrics: Optional[dict] = None
  4581. eval_set_kwargs: dict = {}
  4582. world_size = max(1, args.world_size)
  4583. eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size)
  4584. if not prediction_loss_only:
  4585. # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass
  4586. # a batch size to the sampler)
  4587. make_multiple_of = None
  4588. if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler):
  4589. make_multiple_of = dataloader.sampler.batch_size
  4590. preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
  4591. labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
  4592. inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
  4593. model.eval()
  4594. if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval):
  4595. self.optimizer.eval()
  4596. if args.past_index >= 0:
  4597. self._past = None
  4598. self.callback_handler.eval_dataloader = dataloader
  4599. for step, inputs in enumerate(dataloader):
  4600. loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
  4601. main_input_name = getattr(self.model, "main_input_name", "input_ids")
  4602. inputs_decode = (
  4603. self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None
  4604. )
  4605. if loss is not None:
  4606. losses = loss.repeat(batch_size)
  4607. losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
  4608. if logits is not None:
  4609. preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
  4610. if labels is not None:
  4611. labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
  4612. if inputs_decode is not None:
  4613. inputs_host = (
  4614. inputs_decode
  4615. if inputs_host is None
  4616. else nested_concat(inputs_host, inputs_decode, padding_index=-100)
  4617. )
  4618. self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
  4619. if self.args.batch_eval_metrics:
  4620. if self.compute_metrics is not None and preds_host is not None and labels_host is not None:
  4621. is_last_step = self.accelerator.gradient_state.end_of_dataloader
  4622. batch_kwargs = {}
  4623. batch_kwargs["losses"] = losses_host if "loss" in args.include_for_metrics else None
  4624. batch_kwargs["inputs"] = inputs_host if "inputs" in args.include_for_metrics else None
  4625. metrics = self.compute_metrics(
  4626. EvalPrediction(predictions=preds_host, label_ids=labels_host, **batch_kwargs),
  4627. compute_result=is_last_step,
  4628. )
  4629. if self.args.batch_eval_metrics or (
  4630. args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0
  4631. ):
  4632. # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
  4633. eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
  4634. if not prediction_loss_only:
  4635. preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
  4636. labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
  4637. inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))
  4638. # Set back to None to begin a new accumulation
  4639. del losses_host, preds_host, labels_host, inputs_host
  4640. torch.cuda.empty_cache()
  4641. losses_host, preds_host, labels_host, inputs_host = None, None, None, None
  4642. if args.past_index and hasattr(self, "_past"):
  4643. # Clean the state at the end of the evaluation loop
  4644. delattr(self, "_past")
  4645. # Gather all remaining tensors and put them back on the CPU
  4646. eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
  4647. if not prediction_loss_only:
  4648. preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
  4649. labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
  4650. inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids"))
  4651. eval_loss = eval_losses_gatherer.finalize()
  4652. preds = preds_gatherer.finalize() if not prediction_loss_only else None
  4653. label_ids = labels_gatherer.finalize() if not prediction_loss_only else None
  4654. inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None
  4655. if (
  4656. self.compute_metrics is not None
  4657. and preds is not None
  4658. and label_ids is not None
  4659. and not self.args.batch_eval_metrics
  4660. ):
  4661. eval_set_kwargs["losses"] = eval_loss if "loss" in args.include_for_metrics else None
  4662. eval_set_kwargs["inputs"] = inputs_ids if "inputs" in args.include_for_metrics else None
  4663. metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids, **eval_set_kwargs))
  4664. elif metrics is None:
  4665. metrics = {}
  4666. # To be JSON-serializable, we need to remove numpy types or zero-d tensors
  4667. metrics = denumpify_detensorize(metrics)
  4668. if eval_loss is not None:
  4669. metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()
  4670. # Prefix all keys with metric_key_prefix + '_'
  4671. for key in list(metrics.keys()):
  4672. if not key.startswith(f"{metric_key_prefix}_"):
  4673. metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
  4674. return EvalLoopOutput(predictions=preds, label_ids=label_ids, metrics=metrics, num_samples=num_examples)
  def _gather_and_numpify(self, tensors, name):
      """
      Run the backend-specific gather on `tensors` (a tensor or a list/tuple of nested tensors) and convert the
      result to numpy.

      Args:
          tensors: The value to gather. `None` is passed through and nothing is gathered.
          name (`str`): Tag forwarded to `nested_xla_mesh_reduce`; the other branches do not use it.

      Returns:
          The output of `nested_numpify` on the (possibly gathered) tensors, or `None` when `tensors` is `None`.
      """
      # Nothing has been accumulated, so there is nothing to gather or convert.
      if tensors is None:
          return None

      # At most one gather helper runs; the branches are tried in this order.
      if is_torch_xla_available():
          gathered = nested_xla_mesh_reduce(tensors, name)
      elif is_sagemaker_mp_enabled():
          gathered = smp_gather(tensors)
      elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
          gathered = distributed_concat(tensors)
      else:
          # No gather applies: convert the local tensors as they are.
          gathered = tensors

      return nested_numpify(gathered)
  def _add_sm_patterns_to_gitignore(self) -> None:
      """
      Ensure the SageMaker checkpointing patterns are listed in the `.gitignore` of `self.repo`, stage the file and,
      if the repo is not clean afterwards, commit and push.
      """
      # Make sure we only do this on the main process
      if not self.is_world_process_zero():
          return

      gitignore_path = os.path.join(self.repo.local_dir, ".gitignore")

      # Read what is already there; a missing file is treated as empty content.
      if os.path.exists(gitignore_path):
          with open(gitignore_path) as reader:
              existing = reader.read()
      else:
          existing = ""

      # Append every pattern that is not present yet, adding a newline separator only when the text does not
      # already end with one.
      updated = existing
      for entry in ("*.sagemaker-uploading", "*.sagemaker-uploaded"):
          if entry in updated:
              continue
          separator = "" if updated.endswith("\n") else "\n"
          updated += f"{separator}{entry}"

      # Only rewrite the file when a pattern was actually added.
      if updated != existing:
          with open(gitignore_path, "w") as writer:
              logger.debug(f"Writing .gitignore file. Content: {updated}")
              writer.write(updated)

      self.repo.git_add(".gitignore")

      # avoid race condition with git status
      time.sleep(0.5)

      if self.repo.is_repo_clean():
          return
      self.repo.git_commit("Add *.sagemaker patterns to .gitignore.")
      self.repo.git_push()
  def create_accelerator_and_postprocess(self):
      """
      Create `self.accelerator` from the training arguments and set the attributes that depend on it.

      Sets `self.accelerator`, `self.gather_function`, `self.is_deepspeed_enabled`, `self.is_fsdp_enabled` and
      `self.is_tp_enabled`. May also overwrite `self.args.gradient_accumulation_steps` with the `num_steps` entry of
      the accelerator config's `gradient_accumulation_kwargs`, and update attributes of the FSDP plugin from
      `self.args.fsdp_config`.

      Raises:
          ValueError: If `num_steps` is set in the accelerator config while `gradient_accumulation_steps` is greater
              than 1, if tensor parallelism is requested without accelerate>1.3.0, or if one of the unsupported
              option combinations checked after the accelerator is created is detected.
          ImportError: If `non_blocking` or `parallelism_config` is requested with an accelerate version that is
              older than the one required for it.
      """
      # We explicitly don't rely on the `Accelerator` to do gradient accumulation
      grad_acc_kwargs = {}
      if is_accelerate_available("0.28.0") and self.args.accelerator_config.gradient_accumulation_kwargs is not None:
          grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs

      # check if num_steps is attempted to be passed in gradient_accumulation_kwargs
      if "num_steps" in grad_acc_kwargs:
          if self.args.gradient_accumulation_steps > 1:
              # raise because we do not know which setting is intended.
              raise ValueError(
                  "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`. "
                  "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`."
              )
          self.args.gradient_accumulation_steps = grad_acc_kwargs["num_steps"]

      accelerator_config = self.args.accelerator_config.to_dict()

      if is_accelerate_available("0.28.0"):
          # Extract dataloader config params from accelerator config
          dataloader_params = ["split_batches", "dispatch_batches", "even_batches", "use_seedable_sampler"]
          dataloader_config = DataLoaderConfiguration(
              **{param: accelerator_config.pop(param) for param in dataloader_params}
          )
          if is_accelerate_available("1.1.0"):
              dataloader_config.data_seed = self.args.data_seed

      non_blocking = accelerator_config.pop("non_blocking")
      if not is_accelerate_available("0.30.0"):
          if non_blocking:
              raise ImportError(
                  "`non_blocking` is only supported in accelerate v0.30.0 and above. Please upgrade accelerate to use this feature."
              )
      else:
          if non_blocking and not self.args.dataloader_pin_memory:
              logger.warning(
                  "`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both."
              )
          dataloader_config.non_blocking = non_blocking

      # this would have been updated above, no need for it anymore
      accelerator_config.pop("gradient_accumulation_kwargs")

      accelerator_kwargs = {
          "deepspeed_plugin": self.args.deepspeed_plugin,
      }

      # We defer compatibility checks to accelerator
      if self.args.parallelism_config is not None:
          if not is_accelerate_available("1.10.1"):
              raise ImportError(
                  "ParallelismConfig requires accelerate v1.10.1 and above. Please upgrade accelerate to use this feature."
              )
          accelerator_kwargs["parallelism_config"] = self.args.parallelism_config

      # With accelerate>=0.28.0 the dataloader options travel in `dataloader_config`; otherwise the remaining
      # accelerator config entries are passed as plain keyword arguments.
      if is_accelerate_available("0.28.0"):
          accelerator_kwargs["dataloader_config"] = dataloader_config
      else:
          accelerator_kwargs.update(accelerator_config)

      # tp is initialized at Accelerator init phase so
      # args should be prepared here
      if hasattr(self.model, "tp_size") and self.model.tp_size is not None and self.model.tp_size > 1:
          # Provisional value: it is reassigned from the accelerator state once the accelerator exists.
          self.is_tp_enabled = True
          if version.parse(accelerate_version) > version.parse("1.3.0"):
              accelerator_kwargs["torch_tp_plugin"] = TorchTensorParallelPlugin(tp_size=self.model.tp_size)
          else:
              raise ValueError("Requires accelerate>1.3.0 to use Tensor Parallelism.")

      # create accelerator object
      self.accelerator = Accelerator(**accelerator_kwargs)

      # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag
      self.gather_function = self.accelerator.gather_for_metrics

      # Bind `use_gather_object` only when the gather function accepts that parameter.
      if "use_gather_object" in inspect.signature(self.gather_function).parameters:
          self.gather_function = functools.partial(
              self.gather_function, use_gather_object=self.args.eval_use_gather_object
          )

      # deepspeed and accelerate flags covering both trainer args and accelerate launcher
      self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None
      self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None
      self.is_tp_enabled = getattr(self.accelerator.state, "torch_tp_plugin", None) is not None

      # post accelerator creation setup
      if self.is_fsdp_enabled:
          fsdp_plugin = self.accelerator.state.fsdp_plugin
          # Values from `fsdp_config` win; the plugin's current value is the fallback.
          for param in ["limit_all_gathers", "activation_checkpointing"]:
              setattr(fsdp_plugin, param, self.args.fsdp_config.get(param, getattr(fsdp_plugin, param)))
          if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing:
              raise ValueError(
                  "The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg "
                  "can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic "
                  "when using FSDP."
              )

      if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None:
          self.propagate_args_to_deepspeed()

      # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end`
      if (
          self.args.save_only_model
          and (self.is_deepspeed_enabled or self.is_fsdp_enabled)
          and self.args.load_best_model_at_end
      ):
          wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP"
          raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.")

      # `auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3
      if (
          self.is_deepspeed_enabled
          and self.accelerator.state.deepspeed_plugin.zero_stage == 3
          and self.args.auto_find_batch_size
      ):
          raise ValueError(
              "`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP"
          )

      # `save_only_model` is rejected when the FSDP state dict type is sharded.
      if (
          self.args.save_only_model
          and self.is_fsdp_enabled
          and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)
      ):
          raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'")
  def propagate_args_to_deepspeed(self, auto_find_batch_size=False):
      """
      Rebuild the DeepSpeed plugin's config as an `HfTrainerDeepSpeedConfig` and let it process the Trainer args.

      Args:
          auto_find_batch_size (`bool`, *optional*, defaults to `False`):
              Forwarded unchanged to `trainer_config_process`.
      """
      from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

      plugin = self.accelerator.state.deepspeed_plugin

      # Wrap the plugin's current config dict, then point both plugin attributes at the new wrapper.
      trainer_ds_config = HfTrainerDeepSpeedConfig(plugin.hf_ds_config.config)
      plugin.hf_ds_config = trainer_ds_config
      plugin.deepspeed_config = trainer_ds_config.config

      trainer_ds_config.trainer_config_process(self.args, auto_find_batch_size)
  def _fsdp_qlora_plugin_updates(self):
      """
      Update the FSDP plugin's auto-wrap policy and mixed precision when FSDP is enabled and the model is a PEFT
      model. Does nothing otherwise.
      """
      if not (self.is_fsdp_enabled and _is_peft_model(self.model)):
          return

      # Imported lazily so `peft` is only needed on this code path.
      from peft import PeftConfig
      from peft.utils.other import fsdp_auto_wrap_policy

      if isinstance(self.model.active_peft_config, PeftConfig):
          self.accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(self.model)

      # The quantizer is only inspected once the model is known to be bitsandbytes-quantized.
      if getattr(self.model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES:
          return

      quant_storage = self.model.hf_quantizer.quantization_config.bnb_4bit_quant_storage
      if quant_storage.is_floating_point and version.parse(accelerate_version) > version.parse("0.27.0"):
          self.accelerator.state.fsdp_plugin.set_mixed_precision(quant_storage, override=True)
  4851. def _get_num_items_in_batch(self, batch_samples: list, device: torch.device) -> Optional[Union[torch.Tensor, int]]:
  4852. """
  4853. Counts the number of items in the batches to properly scale the loss.
  4854. Args:
  4855. batch_samples (`list`): List of batches
  4856. device (`torch.device`): The device on which the number of items in the batch should be.
  4857. Returns:
  4858. None if the number of items in the batch doesn't need to be computed else the number of items in the batch
  4859. """
  4860. num_items_in_batch = None
  4861. count_num_items_in_batch = (
  4862. len(batch_samples) > 0
  4863. and "labels" in batch_samples[0]
  4864. and (
  4865. # num_items_in_batch is passed to model forward
  4866. # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3757
  4867. self.model_accepts_loss_kwargs
  4868. # num_items_in_batch is passed to compute_loss_func
  4869. # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3773
  4870. or self.compute_loss_func is not None
  4871. # num_items_in_batch is also verified if (self.model_accepts_loss_kwargs or self.compute_loss_func)
  4872. # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3790
  4873. )
  4874. )
  4875. if count_num_items_in_batch:
  4876. # For now we don't support object detection
  4877. try:
  4878. num_items_in_batch = sum((batch["labels"].ne(-100)).sum() for batch in batch_samples)
  4879. except (TypeError, AttributeError):
  4880. pass
  4881. if num_items_in_batch is not None:
  4882. if self.args.average_tokens_across_devices and self.args.world_size >= 1:
  4883. num_items_in_batch = self.accelerator.gather(num_items_in_batch.to(device)).sum()
  4884. elif self.args.n_gpu >= 1:
  4885. # In DP case, if we don't average, we need to divide by the number of gpu. This is the simplest approximation.
  4886. # Otherwise, we would have to scatter labels and calculate num_items_in_batch for each gpu.
  4887. num_items_in_batch = num_items_in_batch // self.args.n_gpu
  4888. if torch.is_tensor(num_items_in_batch):
  4889. num_items_in_batch = num_items_in_batch.to(device)
  4890. if self.args.n_gpu > 1 and num_items_in_batch.dim() == 0:
  4891. # In the DataParallel case, convert the scalar tensor into a 2-dim tensor with the same value repeated
  4892. num_items_in_batch = num_items_in_batch.unsqueeze(0).expand(self.args.n_gpu, -1)
  4893. # Divide by number of devices with the same batch
  4894. if pc := getattr(self.accelerator, "parallelism_config", None):
  4895. num_items_in_batch = num_items_in_batch // pc.non_data_parallel_size
  4896. return num_items_in_batch
  def get_batch_samples(
      self, epoch_iterator: Iterator, num_batches: int, device: torch.device
  ) -> tuple[list, Optional[Union[torch.Tensor, int]]]:
      """
      Pull up to `num_batches` batches from `epoch_iterator` and pair them with their item count.

      Args:
          epoch_iterator (`Iterator`): Iterator the batches are taken from; it is advanced in place.
          num_batches (`int`): Maximum number of batches to take.
          device (`torch.device`): Forwarded to `_get_num_items_in_batch`.

      Returns:
          A tuple `(batch_samples, num_items_in_batch)`. `batch_samples` holds fewer than `num_batches` entries
          when the iterator runs out first; `num_items_in_batch` is whatever `_get_num_items_in_batch` returns
          for those batches.
      """
      # Private marker that the iterator cannot yield, used to detect exhaustion.
      exhausted = object()

      collected = []
      for _ in range(num_batches):
          batch = next(epoch_iterator, exhausted)
          if batch is exhausted:
              break
          collected.append(batch)

      return collected, self._get_num_items_in_batch(collected, device)
  def set_initial_training_values(
      self, args: TrainingArguments, dataloader: DataLoader, total_train_batch_size: int
  ):
      """
      Work out the step, epoch and sample counts for a training run.

      Returns a tuple holding, in this order:
          - `num_train_epochs`
          - `num_update_steps_per_epoch`
          - `num_examples`
          - `num_train_samples`
          - `epoch_based`
          - `len_dataloader`
          - `max_steps`

      Raises:
          ValueError: If the dataloader has no usable length and `args.max_steps` is not positive.
      """
      # `args.max_steps` comes first; a negative value means the total number of steps is derived from the number
      # of epochs below.
      max_steps = args.max_steps
      epoch_based = max_steps < 0

      len_dataloader = len(dataloader) if has_length(dataloader) else None

      if len_dataloader is not None:
          # Batches per epoch divided by the accumulation steps, rounded up, and never less than one.
          full_updates, leftover = divmod(len_dataloader, args.gradient_accumulation_steps)
          num_update_steps_per_epoch = max(full_updates + int(leftover > 0), 1)
          # With a known length and an epoch-driven run, the total number of steps can be extrapolated.
          if epoch_based:
              max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)

      steps_requested = args.max_steps > 0
      if not len_dataloader and not steps_requested:
          raise ValueError(
              "args.max_steps must be set to a positive value if dataloader does not have a length, was"
              f" {args.max_steps}"
          )

      # Now we figure out `num_examples`, `num_train_epochs`, and `train_samples`
      if len_dataloader:
          num_examples = self.num_examples(dataloader)
          if steps_requested:
              full_epochs, partial_epoch = divmod(max_steps, num_update_steps_per_epoch)
              num_train_epochs = full_epochs + int(partial_epoch > 0)
              # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
              # the best we can do.
              num_train_samples = max_steps * total_train_batch_size
          else:
              num_train_epochs = math.ceil(args.num_train_epochs)
              num_train_samples = self.num_examples(dataloader) * args.num_train_epochs
      else:
          # Rely on max_steps when dataloader does not have a working size.
          # Setting a very large number of epochs so we go as many times as necessary over the iterator.
          num_train_epochs = sys.maxsize
          num_update_steps_per_epoch = max_steps
          num_examples = total_train_batch_size * args.max_steps
          num_train_samples = args.max_steps * total_train_batch_size

      return (
          num_train_epochs,
          num_update_steps_per_epoch,
          num_examples,
          num_train_samples,
          epoch_based,
          len_dataloader,
          max_steps,
      )