api.py 140 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. # yapf: disable
  3. import datetime
  4. import fnmatch
  5. import functools
  6. import io
  7. import os
  8. import pickle
  9. import platform
  10. import re
  11. import shutil
  12. import tempfile
  13. import time
  14. import uuid
  15. import warnings
  16. from collections import defaultdict
  17. from http import HTTPStatus
  18. from http.cookiejar import CookieJar
  19. from os.path import expanduser
  20. from pathlib import Path
  21. from typing import (Any, BinaryIO, Dict, Iterable, List, Literal, Optional,
  22. Tuple, Union)
  23. from urllib.parse import urlencode
  24. import json
  25. import requests
  26. from requests import Session
  27. from requests.adapters import HTTPAdapter, Retry
  28. from requests.exceptions import HTTPError
  29. from tqdm.auto import tqdm
  30. from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES,
  31. API_HTTP_CLIENT_TIMEOUT,
  32. API_RESPONSE_FIELD_DATA,
  33. API_RESPONSE_FIELD_EMAIL,
  34. API_RESPONSE_FIELD_GIT_ACCESS_TOKEN,
  35. API_RESPONSE_FIELD_MESSAGE,
  36. API_RESPONSE_FIELD_USERNAME,
  37. DEFAULT_MAX_WORKERS,
  38. DEFAULT_MODELSCOPE_INTL_DOMAIN,
  39. MODELSCOPE_CLOUD_ENVIRONMENT,
  40. MODELSCOPE_CLOUD_USERNAME,
  41. MODELSCOPE_CREDENTIALS_PATH,
  42. MODELSCOPE_DOMAIN,
  43. MODELSCOPE_PREFER_AI_SITE,
  44. MODELSCOPE_REQUEST_ID,
  45. MODELSCOPE_URL_SCHEME, ONE_YEAR_SECONDS,
  46. REQUESTS_API_HTTP_METHOD,
  47. TEMPORARY_FOLDER_NAME,
  48. UPLOAD_BLOB_TQDM_DISABLE_THRESHOLD,
  49. UPLOAD_COMMIT_BATCH_SIZE,
  50. UPLOAD_MAX_FILE_COUNT,
  51. UPLOAD_MAX_FILE_COUNT_IN_DIR,
  52. UPLOAD_MAX_FILE_SIZE,
  53. UPLOAD_NORMAL_FILE_SIZE_TOTAL_LIMIT,
  54. UPLOAD_SIZE_THRESHOLD_TO_ENFORCE_LFS,
  55. VALID_SORT_KEYS, DatasetVisibility,
  56. Licenses, ModelVisibility, Visibility,
  57. VisibilityMap)
  58. from modelscope.hub.errors import (InvalidParameter, NotExistError,
  59. NotLoginException, RequestError,
  60. datahub_raise_on_error,
  61. handle_http_post_error,
  62. handle_http_response, is_ok,
  63. raise_for_http_status, raise_on_error)
  64. from modelscope.hub.git import GitCommandWrapper
  65. from modelscope.hub.info import DatasetInfo, ModelInfo
  66. from modelscope.hub.repository import Repository
  67. from modelscope.hub.utils.aigc import AigcModel
  68. from modelscope.hub.utils.utils import (add_content_to_file, get_domain,
  69. get_endpoint, get_readable_folder_size,
  70. get_release_datetime, is_env_true,
  71. model_id_to_group_owner_name)
  72. from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
  73. DEFAULT_MODEL_REVISION,
  74. DEFAULT_REPOSITORY_REVISION,
  75. MASTER_MODEL_BRANCH, META_FILES_FORMAT,
  76. REPO_TYPE_DATASET, REPO_TYPE_MODEL,
  77. REPO_TYPE_SUPPORT, ConfigFields,
  78. DatasetFormations, DatasetMetaFormats,
  79. DownloadChannel, DownloadMode,
  80. Frameworks, ModelFile, Tasks,
  81. VirgoDatasetConfig)
  82. from modelscope.utils.file_utils import (get_file_hash, get_file_size,
  83. is_relative_path)
  84. from modelscope.utils.logger import get_logger
  85. from modelscope.utils.repo_utils import (DATASET_LFS_SUFFIX,
  86. DEFAULT_IGNORE_PATTERNS,
  87. MODEL_LFS_SUFFIX,
  88. CommitHistoryResponse, CommitInfo,
  89. CommitOperation, CommitOperationAdd,
  90. RepoUtils)
  91. from modelscope.utils.thread_utils import thread_executor
  92. logger = get_logger()
  93. class HubApi:
  94. """Model hub api interface.
  95. """
  96. def __init__(self,
  97. endpoint: Optional[str] = None,
  98. timeout=API_HTTP_CLIENT_TIMEOUT,
  99. max_retries=API_HTTP_CLIENT_MAX_RETRIES,
  100. token: Optional[str] = None):
  101. """The ModelScope HubApi。
  102. Args:
  103. endpoint (str, optional): The modelscope server http|https address. Defaults to None.
  104. """
  105. self.endpoint = endpoint if endpoint is not None else get_endpoint()
  106. self.token = token
  107. self.headers = {'user-agent': ModelScopeConfig.get_user_agent()}
  108. self.session = Session()
  109. retry = Retry(
  110. total=max_retries,
  111. read=2,
  112. connect=2,
  113. backoff_factor=1,
  114. status_forcelist=(500, 502, 503, 504),
  115. respect_retry_after_header=False,
  116. )
  117. adapter = HTTPAdapter(max_retries=retry)
  118. self.session.mount('http://', adapter)
  119. self.session.mount('https://', adapter)
  120. # set http timeout
  121. for method in REQUESTS_API_HTTP_METHOD:
  122. setattr(
  123. self.session, method,
  124. functools.partial(
  125. getattr(self.session, method),
  126. timeout=timeout))
  127. self.upload_checker = UploadingCheck()
  128. def _get_cookies(self, access_token: str):
  129. """
  130. Get jar cookies for authentication from access_token.
  131. Args:
  132. access_token (str): user access token on ModelScope.
  133. Returns:
  134. jar (CookieJar): cookies for authentication.
  135. """
  136. from requests.cookies import RequestsCookieJar
  137. from urllib.parse import urlparse
  138. domain: str = urlparse(self.endpoint).netloc if self.endpoint else get_domain()
  139. jar = RequestsCookieJar()
  140. jar.set('m_session_id',
  141. access_token,
  142. domain=domain,
  143. path='/')
  144. return jar
  145. def get_cookies(self, access_token: Optional[str] = None, cookies_required: Optional[bool] = False):
  146. """
  147. Get cookies for authentication from local cache or access_token.
  148. Args:
  149. access_token (Optional[str]): user access token on ModelScope. If not provided, try to get from local cache.
  150. cookies_required (bool): whether to raise error if no cookies found, defaults to `False`.
  151. Returns:
  152. cookies (CookieJar): cookies for authentication.
  153. Raises:
  154. ValueError: If no credentials found and cookies_required is True.
  155. """
  156. token = access_token or self.token or os.environ.get('MODELSCOPE_API_TOKEN')
  157. if token:
  158. cookies = self._get_cookies(access_token=token)
  159. else:
  160. cookies = ModelScopeConfig.get_cookies()
  161. if cookies is None and cookies_required:
  162. raise ValueError(
  163. 'No credentials found.'
  164. 'You can pass the `--token` argument, '
  165. 'or use HubApi().login(access_token=`your_sdk_token`). '
  166. 'Your token is available at https://modelscope.cn/my/myaccesstoken'
  167. )
  168. return cookies
  169. def login(
  170. self,
  171. access_token: Optional[str] = None,
  172. endpoint: Optional[str] = None
  173. ):
  174. """Login with your SDK access token, which can be obtained from
  175. https://www.modelscope.cn user center.
  176. Args:
  177. access_token (str): user access token on modelscope, set this argument or set `MODELSCOPE_API_TOKEN`.
  178. If neither of the tokens exist, login will directly return.
  179. endpoint: the endpoint to use, default to None to use endpoint specified in the class
  180. Returns:
  181. cookies: to authenticate yourself to ModelScope open-api
  182. git_token: token to access your git repository.
  183. Note:
  184. You only have to login once within 30 days.
  185. """
  186. access_token = access_token or self.token or os.environ.get('MODELSCOPE_API_TOKEN')
  187. if not access_token:
  188. return None, None
  189. if not endpoint:
  190. endpoint = self.endpoint
  191. path = f'{endpoint}/api/v1/login'
  192. r = self.session.post(
  193. path,
  194. json={'AccessToken': access_token},
  195. headers=self.builder_headers(self.headers))
  196. raise_for_http_status(r)
  197. d = r.json()
  198. raise_on_error(d)
  199. token = d[API_RESPONSE_FIELD_DATA][API_RESPONSE_FIELD_GIT_ACCESS_TOKEN]
  200. cookies = r.cookies
  201. # save token and cookie
  202. ModelScopeConfig.save_token(token)
  203. ModelScopeConfig.save_cookies(cookies)
  204. ModelScopeConfig.save_user_info(
  205. d[API_RESPONSE_FIELD_DATA][API_RESPONSE_FIELD_USERNAME],
  206. d[API_RESPONSE_FIELD_DATA][API_RESPONSE_FIELD_EMAIL])
  207. return d[API_RESPONSE_FIELD_DATA][
  208. API_RESPONSE_FIELD_GIT_ACCESS_TOKEN], cookies
  209. def create_model(self,
  210. model_id: str,
  211. visibility: Optional[int] = ModelVisibility.PUBLIC,
  212. license: Optional[str] = Licenses.APACHE_V2,
  213. chinese_name: Optional[str] = None,
  214. original_model_id: Optional[str] = '',
  215. endpoint: Optional[str] = None,
  216. token: Optional[str] = None,
  217. aigc_model: Optional['AigcModel'] = None) -> str:
  218. """Create model repo at ModelScope Hub.
  219. Args:
  220. model_id (str): The model id in format {owner}/{name}
  221. visibility (int, optional): visibility of the model(1-private, 5-public), default 5.
  222. license (str, optional): license of the model, default apache-2.0.
  223. chinese_name (str, optional): chinese name of the model.
  224. original_model_id (str, optional): the base model id which this model is trained from
  225. endpoint: the endpoint to use, default to None to use endpoint specified in the class
  226. token (str, optional): access token for authentication
  227. aigc_model (AigcModel, optional): AigcModel instance for AIGC model creation.
  228. If provided, will create an AIGC model with automatic file upload.
  229. Refer to modelscope.hub.utils.aigc.AigcModel for details.
  230. Returns:
  231. str: URL of the created model repository
  232. Raises:
  233. InvalidParameter: If model_id is invalid or required AIGC parameters are missing.
  234. ValueError: If not login.
  235. Note:
  236. model_id = {owner}/{name}
  237. """
  238. if model_id is None:
  239. raise InvalidParameter('model_id is required!')
  240. # Get cookies for authentication.
  241. cookies = self.get_cookies(access_token=token, cookies_required=True)
  242. if not endpoint:
  243. endpoint = self.endpoint
  244. owner_or_group, name = model_id_to_group_owner_name(model_id)
  245. # Base body configuration
  246. body = {
  247. 'Path': owner_or_group,
  248. 'Name': name,
  249. 'ChineseName': chinese_name,
  250. 'Visibility': visibility,
  251. 'License': license,
  252. 'OriginalModelId': original_model_id,
  253. 'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', '')
  254. }
  255. # Set path based on model type
  256. if aigc_model is not None:
  257. # Use AIGC model endpoint
  258. path = f'{endpoint}/api/v1/models/aigc'
  259. # Best-effort pre-upload weights so server recognizes sha256 (use existing cookies)
  260. aigc_model.preupload_weights(cookies=cookies, headers=self.builder_headers(self.headers), endpoint=endpoint)
  261. # Add AIGC-specific fields to body
  262. body.update({
  263. 'TagShowName': aigc_model.tag,
  264. 'CoverImages': aigc_model.cover_images,
  265. 'AigcType': aigc_model.aigc_type,
  266. 'TagDescription': aigc_model.description,
  267. 'VisionFoundation': aigc_model.base_model_type,
  268. 'BaseModel': aigc_model.base_model_id or original_model_id,
  269. 'WeightsName': aigc_model.weight_filename,
  270. 'WeightsSha256': aigc_model.weight_sha256,
  271. 'WeightsSize': aigc_model.weight_size,
  272. 'ModelPath': aigc_model.model_path,
  273. 'TriggerWords': aigc_model.trigger_words,
  274. 'ModelSource': aigc_model.model_source,
  275. 'SubVisionFoundation': aigc_model.base_model_sub_type,
  276. })
  277. if aigc_model.official_tags:
  278. body['OfficialTags'] = aigc_model.official_tags
  279. else:
  280. # Use regular model endpoint
  281. path = f'{endpoint}/api/v1/models'
  282. headers = self.builder_headers(self.headers)
  283. intl_end = DEFAULT_MODELSCOPE_INTL_DOMAIN.split('.')[-1]
  284. if endpoint.rstrip('/').endswith(f'.{intl_end}'):
  285. headers['X-Modelscope-Accept-Language'] = 'en_US'
  286. r = self.session.post(
  287. path,
  288. json=body,
  289. cookies=cookies,
  290. headers=headers)
  291. raise_for_http_status(r)
  292. d = r.json()
  293. raise_on_error(d)
  294. model_repo_url = f'{endpoint}/models/{model_id}'
  295. # Upload model files for AIGC models
  296. if aigc_model is not None:
  297. aigc_model.upload_to_repo(self, model_id, token)
  298. return model_repo_url
  299. def create_model_tag(self,
  300. model_id: str,
  301. tag_name: str,
  302. endpoint: Optional[str] = None,
  303. token: Optional[str] = None,
  304. aigc_model: Optional['AigcModel'] = None) -> str:
  305. """Create a tag for a model at ModelScope Hub.
  306. Args:
  307. model_id (str): The model id in format {owner}/{name}
  308. tag_name (str): The tag name (e.g., "v1.0.0")
  309. endpoint: the endpoint to use, default to None to use endpoint specified in the class
  310. token (str, optional): access token for authentication
  311. aigc_model (AigcModel, optional): AigcModel instance for AIGC model tag creation.
  312. If provided, will create an AIGC model tag with automatic parameters.
  313. Refer to modelscope.hub.utils.aigc.AigcModel for details.
  314. Returns:
  315. str: URL of the created tag
  316. Raises:
  317. InvalidParameter: If model_id, tag_name, ref, or description is invalid.
  318. ValueError: If not login.
  319. Note:
  320. model_id = {owner}/{name}
  321. """
  322. if model_id is None:
  323. raise InvalidParameter('model_id is required!')
  324. if tag_name is None:
  325. raise InvalidParameter('tag_name is required!')
  326. if tag_name.lower() in ['main', 'master']:
  327. raise InvalidParameter(
  328. f'tag_name "{tag_name}" is not allowed. '
  329. f'Please use a different tag name (e.g., "v1.0", "v1.1", "latest"). '
  330. f'Reserved names: main, master'
  331. )
  332. # Get cookies for authentication.
  333. cookies = self.get_cookies(access_token=token, cookies_required=True)
  334. if not endpoint:
  335. endpoint = self.endpoint
  336. owner_or_group, name = model_id_to_group_owner_name(model_id)
  337. # Set path and body based on model type
  338. if aigc_model is not None:
  339. # Use AIGC model tag endpoint
  340. path = f'{endpoint}/api/v1/models/aigc/repo/tag'
  341. aigc_model.preupload_weights(cookies=cookies, headers=self.builder_headers(self.headers), endpoint=endpoint)
  342. # Base body for AIGC model tag
  343. body = {
  344. 'CoverImages': aigc_model.cover_images,
  345. 'Name': name,
  346. 'Path': owner_or_group,
  347. 'TagShowName': tag_name,
  348. 'WeightsName': aigc_model.weight_filename,
  349. 'WeightsSha256': aigc_model.weight_sha256,
  350. 'WeightsSize': aigc_model.weight_size,
  351. 'TriggerWords': aigc_model.trigger_words,
  352. 'AigcType': aigc_model.aigc_type,
  353. 'VisionFoundation': aigc_model.base_model_type
  354. }
  355. else:
  356. # Use regular model tag endpoint
  357. path = f'{endpoint}/api/v1/models/{model_id}/repo/tag'
  358. revision = 'master'
  359. body = {
  360. 'TagName': tag_name,
  361. 'Ref': revision
  362. }
  363. r = self.session.post(
  364. path,
  365. json=body,
  366. cookies=cookies,
  367. headers=self.builder_headers(self.headers))
  368. raise_for_http_status(r)
  369. d = r.json()
  370. raise_on_error(d)
  371. tag_url = f'{endpoint}/models/{model_id}/tags/{tag_name}'
  372. return tag_url
  373. def delete_model(self, model_id: str, endpoint: Optional[str] = None, token: Optional[str] = None):
  374. """
  375. @deprecated
  376. Delete model_id from ModelScope.
  377. Args:
  378. model_id (str): The model id.
  379. endpoint: the endpoint to use, default to None to use endpoint specified in the class
  380. token (str, optional): access token for authentication
  381. Raises:
  382. ValueError: If not login.
  383. Note:
  384. model_id = {owner}/{name}
  385. """
  386. warnings.warn(
  387. 'This function is deprecated due to security reasons, '
  388. 'and will be recovered in future versions with proper token authentication. ',
  389. DeprecationWarning,
  390. stacklevel=2
  391. )
  392. cookies = self.get_cookies(access_token=token, cookies_required=True)
  393. if not endpoint:
  394. endpoint = self.endpoint
  395. if cookies is None:
  396. raise ValueError('Token does not exist, please login first.')
  397. path = f'{endpoint}/api/v1/models/{model_id}'
  398. r = self.session.delete(path,
  399. cookies=cookies,
  400. headers=self.builder_headers(self.headers))
  401. raise_for_http_status(r)
  402. raise_on_error(r.json())
  403. def get_model_url(self, model_id: str, endpoint: Optional[str] = None):
  404. if not endpoint:
  405. endpoint = self.endpoint
  406. return f'{endpoint}/api/v1/models/{model_id}.git'
  407. def get_model(
  408. self,
  409. model_id: str,
  410. revision: Optional[str] = DEFAULT_MODEL_REVISION,
  411. endpoint: Optional[str] = None,
  412. token: Optional[str] = None,
  413. ) -> dict:
  414. """Get model information at ModelScope
  415. Args:
  416. model_id (str): The model id.
  417. revision (str optional): revision of model.
  418. endpoint: the endpoint to use, default to None to use endpoint specified in the class
  419. token (str, optional): access token for authentication
  420. Returns:
  421. The model detail information.
  422. Raises:
  423. NotExistError: If the model is not exist, will throw NotExistError
  424. Note:
  425. model_id = {owner}/{name}
  426. """
  427. cookies = self.get_cookies(access_token=token, cookies_required=False)
  428. owner_or_group, name = model_id_to_group_owner_name(model_id)
  429. if not endpoint:
  430. endpoint = self.endpoint
  431. if revision:
  432. path = f'{endpoint}/api/v1/models/{owner_or_group}/{name}?Revision={revision}'
  433. else:
  434. path = f'{endpoint}/api/v1/models/{owner_or_group}/{name}'
  435. r = self.session.get(path, cookies=cookies,
  436. headers=self.builder_headers(self.headers))
  437. handle_http_response(r, logger, cookies, model_id)
  438. if r.status_code == HTTPStatus.OK:
  439. if is_ok(r.json()):
  440. return r.json()[API_RESPONSE_FIELD_DATA]
  441. else:
  442. raise NotExistError(r.json()[API_RESPONSE_FIELD_MESSAGE])
  443. else:
  444. raise_for_http_status(r)
  def get_endpoint_for_read(
          self,
          repo_id: str,
          *,
          repo_type: Optional[str] = None,
          token: Optional[str] = None
  ) -> str:
      """Pick the endpoint to use for read operations (download, list, ...).

      Resolution order:

      1. If the ``MODELSCOPE_DOMAIN`` environment variable holds a non-blank
         value, the endpoint is ``MODELSCOPE_URL_SCHEME`` + that value. The
         repo is checked there with ``re_raise=True``; any error is logged
         and re-raised, otherwise that endpoint is returned.
      2. Otherwise the preferred site is checked first and the other site
         second. The cn-site is preferred unless ``MODELSCOPE_PREFER_AI_SITE``
         is true, in which case the ai-site (intl version) is preferred.

      Args:
          repo_id (str): Repository id in ``namespace/name`` form.
          repo_type (str, optional): Repository type forwarded to
              :meth:`repo_exists`.
          token (str, optional): Access token forwarded to :meth:`repo_exists`.

      Returns:
          str: The endpoint on which ``repo_id`` was found.

      Raises:
          Exception: Whatever :meth:`repo_exists` raises when the repo cannot
              be found on the user-specified domain, or on either site.
      """
      domain = os.environ.get(MODELSCOPE_DOMAIN)
      if domain is not None and domain.strip() != '':
          # User pinned a domain: it is the only candidate.
          endpoint = MODELSCOPE_URL_SCHEME + domain
          try:
              self.repo_exists(repo_id=repo_id, repo_type=repo_type, endpoint=endpoint, re_raise=True, token=token)
          except Exception:
              logger.error(f'Repo {repo_id} does not exist on {endpoint}.')
              raise
          return endpoint

      cn_first = not is_env_true(MODELSCOPE_PREFER_AI_SITE)
      prefer_endpoint = get_endpoint(cn_site=cn_first)
      if self.repo_exists(
              repo_id, repo_type=repo_type, endpoint=prefer_endpoint, token=token):
          return prefer_endpoint

      # Preferred site does not have the repo: fall back to the other site.
      alternative_endpoint = get_endpoint(cn_site=(not cn_first))
      logger.warning(f'Repo {repo_id} not exists on {prefer_endpoint}, '
                     f'will try on alternative endpoint {alternative_endpoint}.')
      try:
          self.repo_exists(
              repo_id, repo_type=repo_type, endpoint=alternative_endpoint, re_raise=True, token=token)
      except Exception:
          logger.error(f'Repo {repo_id} not exists on either {prefer_endpoint} or {alternative_endpoint}')
          raise
      return alternative_endpoint
  def model_info(
          self,
          repo_id: str,
          *,
          revision: Optional[str] = DEFAULT_MODEL_REVISION,
          endpoint: Optional[str] = None
  ) -> ModelInfo:
      """Collect model metadata, commit history and file list into a ``ModelInfo``.

      Args:
          repo_id (str): The model id in the format of ``namespace/model_name``.
          revision (str, optional): Revision passed to every underlying query.
              Defaults to ``DEFAULT_MODEL_REVISION``.
          endpoint (str, optional): Hub endpoint passed to every underlying
              query. The queries apply their own fallback when it is ``None``.

      Returns:
          ModelInfo: Built from the :meth:`get_model` payload plus ``commits``,
          ``author`` (the namespace part of ``repo_id``) and ``siblings``
          (the recursive file listing).
      """
      author, _ = model_id_to_group_owner_name(repo_id)
      # Arguments shared by all three hub queries below.
      shared = {'revision': revision, 'endpoint': endpoint}

      details = self.get_model(model_id=repo_id, **shared)
      history = self.list_repo_commits(
          repo_id=repo_id, repo_type=REPO_TYPE_MODEL, **shared)
      file_entries = self.get_model_files(
          model_id=repo_id, recursive=True, **shared)

      return ModelInfo(**details, commits=history, author=author, siblings=file_entries)
  def dataset_info(
          self,
          repo_id: str,
          *,
          revision: Optional[str] = None,
          endpoint: Optional[str] = None
  ) -> DatasetInfo:
      """Collect dataset metadata, commit history and file list into a ``DatasetInfo``.

      Args:
          repo_id (str): The dataset id in the format of
              ``namespace/dataset_name``.
          revision (str, optional): Revision passed to the metadata and commit
              queries as given. The file listing uses
              ``DEFAULT_DATASET_REVISION`` when this is falsy.
          endpoint (str, optional): Hub endpoint passed to every underlying
              query.

      Returns:
          DatasetInfo: Built from the :meth:`get_dataset` payload plus
          ``commits``, ``author`` (the namespace part of ``repo_id``) and
          ``siblings`` (the recursive file listing).
      """
      author, _ = model_id_to_group_owner_name(repo_id)

      details = self.get_dataset(
          dataset_id=repo_id, revision=revision, endpoint=endpoint)
      history = self.list_repo_commits(
          repo_id=repo_id, repo_type=REPO_TYPE_DATASET, revision=revision, endpoint=endpoint)
      # The file listing needs a concrete revision, so fall back to the default.
      listing_revision = revision or DEFAULT_DATASET_REVISION
      file_entries = self.get_dataset_files(
          repo_id=repo_id, revision=listing_revision, recursive=True, endpoint=endpoint)

      return DatasetInfo(**details, commits=history, author=author, siblings=file_entries)
  def repo_info(
          self,
          repo_id: str,
          *,
          repo_type: Optional[str] = REPO_TYPE_MODEL,
          revision: Optional[str] = DEFAULT_MODEL_REVISION,
          endpoint: Optional[str] = None
  ) -> Union[ModelInfo, DatasetInfo]:
      """Dispatch to :meth:`model_info` or :meth:`dataset_info` by repo type.

      Args:
          repo_id (str): The repository id in the format of
              ``namespace/repo_name``.
          repo_type (str, optional): ``REPO_TYPE_MODEL`` or
              ``REPO_TYPE_DATASET``. ``None`` is treated as a model.
          revision (str, optional): Revision forwarded unchanged to the
              selected info method. Defaults to ``DEFAULT_MODEL_REVISION``.
          endpoint (str, optional): Hub endpoint forwarded unchanged to the
              selected info method.

      Returns:
          Union[ModelInfo, DatasetInfo]: The info object of the repository.

      Raises:
          InvalidParameter: If ``repo_type`` is neither a model nor a dataset.
      """
      if repo_type is None or repo_type == REPO_TYPE_MODEL:
          fetch_info = self.model_info
      elif repo_type == REPO_TYPE_DATASET:
          fetch_info = self.dataset_info
      else:
          raise InvalidParameter(
              f'Arg repo_type {repo_type} not supported. Please choose from {REPO_TYPE_SUPPORT}.')
      return fetch_info(repo_id=repo_id, revision=revision, endpoint=endpoint)
  def repo_exists(
          self,
          repo_id: str,
          *,
          repo_type: Optional[str] = None,
          endpoint: Optional[str] = None,
          re_raise: Optional[bool] = False,
          token: Optional[str] = None
  ) -> bool:
      """Check whether a repository exists on ModelScope.

      Args:
          repo_id (`str`):
              A namespace (user or an organization) and a repo name separated
              by a `/`.
          repo_type (`str`, *optional*):
              Must be one of ``REPO_TYPE_SUPPORT`` (case-insensitive) when
              given. The dataset type queries the datasets API; ``None`` and
              every other supported value query the models API.
          endpoint (`str`, *optional*):
              Endpoint to query. When ``None``, ``self.endpoint`` is used.
          re_raise (`bool`, *optional*):
              When true, a 404 response raises ``HTTPError`` instead of
              returning ``False``.
          token (`str`, *optional*):
              Access token used for checking existence.

      Returns:
          bool: ``True`` on HTTP 200, ``False`` on HTTP 404 (when ``re_raise``
          is false).

      Raises:
          HTTPError: On HTTP 404 when ``re_raise`` is true.
          Exception: On an unsupported ``repo_type``, a malformed ``repo_id``,
              or any status code other than 200/404.
      """
      if endpoint is None:
          endpoint = self.endpoint
      if (repo_type is not None) and repo_type.lower() not in REPO_TYPE_SUPPORT:
          raise Exception('Not support repo-type: %s' % repo_type)
      if (repo_id is None) or repo_id.count('/') != 1:
          # Report the offending repo_id (the message used to show repo_type).
          raise Exception(f'Invalid repo_id: {repo_id}, must be of format namespace/name')

      cookies = self.get_cookies(access_token=token, cookies_required=False)
      owner_or_group, name = model_id_to_group_owner_name(repo_id)
      if (repo_type is not None) and repo_type.lower() == REPO_TYPE_DATASET:
          path = f'{endpoint}/api/v1/datasets/{owner_or_group}/{name}'
      else:
          path = f'{endpoint}/api/v1/models/{owner_or_group}/{name}'

      r = self.session.get(path, cookies=cookies,
                           headers=self.builder_headers(self.headers))
      # NOTE(review): the trailing False presumably stops the helper from
      # raising so the status code can be inspected here; confirm against
      # handle_http_response.
      code = handle_http_response(r, logger, cookies, repo_id, False)
      if code == 200:
          return True
      if code == 404:
          if re_raise:
              raise HTTPError(r)
          return False

      logger.warning(f'Check repo_exists return status code {code}.')
      raise Exception(
          f'Failed to check existence of repo: {repo_id}, make sure you have access authorization.')
  def delete_repo(
          self,
          repo_id: str,
          repo_type: str,
          endpoint: Optional[str] = None,
          token: Optional[str] = None
  ):
      """Delete a model or dataset repository (deprecated).

      Always emits a ``DeprecationWarning`` and then delegates to
      :meth:`delete_dataset` or :meth:`delete_model`.

      Args:
          repo_id (`str`):
              A namespace (user or an organization) and a repo name separated
              by a `/`.
          repo_type (`str`):
              ``REPO_TYPE_DATASET`` or ``REPO_TYPE_MODEL``.
          endpoint (`str`, *optional*):
              Endpoint to use. Falls back to ``self.endpoint`` when falsy.
          token (`str`, *optional*):
              Access token forwarded to the delete call.

      Raises:
          Exception: If ``repo_type`` is neither a dataset nor a model.
      """
      warnings.warn(
          'This function is deprecated due to security reasons, '
          'and will be recovered in future versions with proper token authentication. ',
          DeprecationWarning,
          stacklevel=2
      )

      target_endpoint = endpoint or self.endpoint

      if repo_type == REPO_TYPE_DATASET:
          self.delete_dataset(dataset_id=repo_id, endpoint=target_endpoint, token=token)
      elif repo_type == REPO_TYPE_MODEL:
          self.delete_model(model_id=repo_id, endpoint=target_endpoint, token=token)
      else:
          raise Exception(f'Arg repo_type {repo_type} not supported.')

      logger.info(f'Repo {repo_id} deleted successfully.')
  @staticmethod
  def _create_default_config(model_dir):
      """Write a minimal default configuration file into ``model_dir``.

      The file is named ``ModelFile.CONFIGURATION`` and holds a JSON object
      with the framework set to ``Frameworks.torch`` and the task set to
      ``Tasks.other``. An existing file of that name is overwritten.

      Args:
          model_dir: Directory in which to create the configuration file.
      """
      default_cfg = {
          ConfigFields.framework: Frameworks.torch,
          ConfigFields.task: Tasks.other,
      }
      target_path = os.path.join(model_dir, ModelFile.CONFIGURATION)
      with open(target_path, 'w') as fp:
          json.dump(default_cfg, fp)
  def push_model(
          self,
          model_id: str,
          model_dir: str,
          visibility: Optional[int] = ModelVisibility.PUBLIC,
          license: Optional[str] = Licenses.APACHE_V2,
          chinese_name: Optional[str] = None,
          commit_message: Optional[str] = 'upload model',
          tag: Optional[str] = None,
          revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
          original_model_id: Optional[str] = None,
          ignore_file_pattern: Optional[Union[List[str], str]] = None,
          lfs_suffix: Optional[Union[str, List[str]]] = None,
          token: Optional[str] = None
  ):
      """Upload a local model directory to a model repository (deprecated).

      Deprecated: use git directly or ``HubApi().upload_folder`` instead.

      The remote repository is cloned into a temporary folder inside
      ``model_dir``. Its non-hidden content is replaced by the non-hidden
      top-level entries of ``model_dir``, and the result is committed and
      pushed. If the repository does not exist it is created first with the
      given visibility, license and chinese_name. If ``revision`` is not a
      remote branch, a new branch is created. If ``model_dir`` has no
      ``ModelFile.CONFIGURATION`` file, a default one is written.

      Login cookies or a valid ``token`` are required. If any error occurs,
      upload via git commands instead.

      Args:
          model_id (str):
              The model id to be uploaded, caller must have write permission for it.
          model_dir (str):
              Path of the existing directory to upload.
          visibility (int, optional):
              Visibility of a newly created model (1-private, 5-public).
              Defaults to ``ModelVisibility.PUBLIC``. Must not be ``None``.
          license (str, optional):
              License of a newly created model. Defaults to
              ``Licenses.APACHE_V2``. Must not be ``None``.
          chinese_name (str, optional):
              Chinese name of a newly created model.
          commit_message (str, optional):
              Commit message. Defaults to ``'upload model'``. When empty, a
              timestamped message is generated.
          tag (str, optional):
              Tag to create and push for this commit.
          revision (str, optional):
              Branch to push to. Defaults to ``DEFAULT_REPOSITORY_REVISION``.
          original_model_id (str, optional):
              The base model id which this model is trained from.
          ignore_file_pattern (Union[List[str], str], optional):
              Regular expression(s). Top-level entries whose name matches any
              of them (``re.search``) are not uploaded.
          lfs_suffix (Union[str, List[str]], optional):
              File types to manage with LFS, e.g. ``'*.safetensors'``.
          token (str, optional):
              Access token for authentication.

      Raises:
          InvalidParameter: ``model_id``/``model_dir`` is ``None``,
              ``model_dir`` is not a directory, or ``visibility``/``license``
              is ``None``.
          NotLoginException: No login cookies are available.
          Exception: Any failure while creating, cloning or pushing.
      """
      warnings.warn(
          'This function is deprecated and will be removed in future versions. '
          'Please use git command directly or use HubApi().upload_folder instead',
          DeprecationWarning,
          stacklevel=2
      )
      if model_id is None:
          raise InvalidParameter('model_id cannot be empty!')
      if model_dir is None:
          raise InvalidParameter('model_dir cannot be empty!')
      if not os.path.exists(model_dir) or os.path.isfile(model_dir):
          raise InvalidParameter('model_dir must be a valid directory.')
      cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
      if not os.path.exists(cfg_file):
          logger.warning(
              f'No {ModelFile.CONFIGURATION} file found in {model_dir}, creating a default one.')
          HubApi._create_default_config(model_dir)

      cookies = self.get_cookies(access_token=token, cookies_required=True)
      if cookies is None:
          raise NotLoginException('Must login before upload!')

      # Snapshot the directory content before the temporary clone folder is
      # created inside model_dir, so the clone itself is never uploaded.
      files_to_save = os.listdir(model_dir)
      folder_size = get_readable_folder_size(model_dir)
      if ignore_file_pattern is None:
          ignore_file_pattern = []
      if isinstance(ignore_file_pattern, str):
          ignore_file_pattern = [ignore_file_pattern]

      if visibility is None or license is None:
          raise InvalidParameter('Visibility and License cannot be empty for new model.')
      if not self.repo_exists(model_id, token=token):
          logger.info('Creating new model [%s]' % model_id)
          self.create_model(
              model_id=model_id,
              visibility=visibility,
              license=license,
              chinese_name=chinese_name,
              original_model_id=original_model_id,
              token=token)

      tmp_dir = os.path.join(model_dir, TEMPORARY_FOLDER_NAME)  # make temporary folder
      git_wrapper = GitCommandWrapper()
      logger.info(f'Pushing folder {model_dir} as model {model_id}.')
      logger.info(f'Total folder size {folder_size}, this may take a while depending on actual pushing size...')
      try:
          repo = Repository(model_dir=tmp_dir, clone_from=model_id, auth_token=token)
          branches = git_wrapper.get_remote_branches(tmp_dir)
          if revision not in branches:
              logger.info('Creating new branch %s' % revision)
              git_wrapper.new_branch(tmp_dir, revision)
          git_wrapper.checkout(tmp_dir, revision)

          # Clear the checked-out tree, keeping dot-entries such as .git.
          for f in os.listdir(tmp_dir):
              if f.startswith('.'):
                  continue
              src = os.path.join(tmp_dir, f)
              if os.path.isfile(src):
                  os.remove(src)
              else:
                  shutil.rmtree(src, ignore_errors=True)

          # Copy the non-hidden, non-ignored top-level entries into the clone.
          for f in files_to_save:
              if f.startswith('.'):
                  continue
              if any(re.search(pattern, f) is not None for pattern in ignore_file_pattern):
                  continue
              src = os.path.join(model_dir, f)
              if os.path.isdir(src):
                  shutil.copytree(src, os.path.join(tmp_dir, f))
              else:
                  shutil.copy(src, tmp_dir)

          if not commit_message:
              date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
              commit_message = '[automsg] push model %s to hub at %s' % (
                  model_id, date)
          if lfs_suffix is not None:
              lfs_suffix_list = [lfs_suffix] if isinstance(lfs_suffix, str) else lfs_suffix
              for suffix in lfs_suffix_list:
                  repo.add_lfs_type(suffix)
          repo.push(
              commit_message=commit_message,
              local_branch=revision,
              remote_branch=revision)
          if tag is not None:
              repo.tag_and_push(tag, tag)
          logger.info(f'Successfully push folder {model_dir} to remote repo [{model_id}].')
      finally:
          # Always remove the temporary clone, whether the push succeeded or not.
          shutil.rmtree(tmp_dir, ignore_errors=True)
  def list_models(
          self,
          owner_or_group: str,
          page_number: Optional[int] = 1,
          page_size: Optional[int] = 10,
          endpoint: Optional[str] = None,
          token: Optional[str] = None
  ) -> dict:
      """List models in owner or group.

      Args:
          owner_or_group (str): Owner or group.
          page_number (int, optional): The page number, default: 1.
          page_size (int, optional): The page size, default: 10.
          endpoint (str, optional): Endpoint to use. Falls back to
              ``self.endpoint`` when falsy.
          token (str, optional): Access token for authentication.

      Raises:
          RequestError: The response has status 200 but its payload is not OK.

      Returns:
          dict: {"models": "list of models", "TotalCount": total_number_of_models_in_owner_or_group}
          ``None`` is returned if the status is not 200 and
          ``raise_for_http_status`` does not raise.
      """
      cookies = self.get_cookies(access_token=token, cookies_required=False)
      if not endpoint:
          endpoint = self.endpoint
      path = f'{endpoint}/api/v1/models/'
      # Serialize with json.dumps so quotes, backslashes and non-ASCII
      # characters in owner_or_group cannot produce an invalid request body.
      payload = json.dumps({
          'Path': owner_or_group,
          'PageNumber': page_number,
          'PageSize': page_size,
      })
      r = self.session.put(
          path,
          data=payload,
          cookies=cookies,
          headers=self.builder_headers(self.headers))
      handle_http_response(r, logger, cookies, owner_or_group)
      if r.status_code == HTTPStatus.OK:
          body = r.json()
          if is_ok(body):
              return body[API_RESPONSE_FIELD_DATA]
          raise RequestError(body[API_RESPONSE_FIELD_MESSAGE])
      raise_for_http_status(r)
      return None
  def list_datasets(
          self,
          owner_or_group: str,
          *,
          page_number: Optional[int] = 1,
          page_size: Optional[int] = 10,
          sort: Optional[str] = None,
          search: Optional[str] = None,
          endpoint: Optional[str] = None,
          token: Optional[str] = None
  ) -> dict:
      """Query the OpenAPI dataset listing with pagination, filtering and sorting.

      Args:
          owner_or_group (str): Sent as the ``author`` filter when truthy.
          page_number (int, optional): The page number. Defaults to 1.
          page_size (int, optional): The page size. Defaults to 10.
          sort (str, optional): Sort key, validated against
              ``VALID_SORT_KEYS``. Omitted from the request when falsy.
          search (str, optional): Search keyword. Omitted from the request
              when falsy.
          endpoint (str, optional): Hub endpoint to use. Falls back to
              ``self.endpoint`` when falsy.
          token (str, optional): Access token for authentication.

      Returns:
          dict: The ``data`` member of the response, e.g.
              {
                  "datasets": [...],
                  "total_count": int,
                  "page_number": int,
                  "page_size": int
              }

      Raises:
          InvalidParameter: ``sort`` is not one of ``VALID_SORT_KEYS``.
          RequestError: The response is not ``success`` or has no ``data``.
      """
      base = endpoint if endpoint else self.endpoint
      path = f'{base}/openapi/v1/datasets'

      # Assemble query parameters; optional filters are added only when set.
      query: Dict[str, Any] = {
          'page_number': page_number,
          'page_size': page_size,
      }
      if sort:
          if sort not in VALID_SORT_KEYS:
              raise InvalidParameter(
                  f'Invalid sort key: {sort}. Supported sort keys: {list(VALID_SORT_KEYS)}')
          query['sort'] = sort
      if search:
          query['search'] = search
      if owner_or_group:
          query['author'] = owner_or_group

      cookies = self.get_cookies(access_token=token, cookies_required=False)
      request_headers = self.builder_headers(self.headers)
      r = self.session.get(
          path,
          params=query,
          cookies=cookies,
          headers=request_headers
      )
      raise_for_http_status(r)

      resp = r.json()
      # OpenAPI success schema: {"success": true, "data": {...}}
      if resp.get('success') is True and 'data' in resp:
          return resp['data']

      # Any other shape is reported as a request failure.
      raise RequestError(resp.get('message') or 'Failed to list datasets')
  915. def _check_cookie(self, use_cookies: Union[bool, CookieJar] = False) -> CookieJar: # noqa
  916. cookies = None
  917. if isinstance(use_cookies, CookieJar):
  918. cookies = use_cookies
  919. elif isinstance(use_cookies, bool):
  920. cookies = self.get_cookies(cookies_required=use_cookies)
  921. return cookies
  def list_model_revisions(
          self,
          model_id: str,
          cutoff_timestamp: Optional[int] = None,
          use_cookies: Union[bool, CookieJar] = False) -> List[str]:
      """Return the tag names of a model.

      Thin wrapper around :meth:`list_model_revisions_detail` that keeps only
      the ``'Revision'`` value of each tag entry.

      Args:
          model_id (str): The model id
          cutoff_timestamp (int): Tags created before the cutoff will be included.
              The timestamp is represented by the seconds elapsed from the epoch time.
          use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
              will load cookie from local. Defaults to False.

      Returns:
          List[str]: Tag names, in the order returned by the detail query.
          Empty when the detail query returns nothing.
      """
      details = self.list_model_revisions_detail(model_id=model_id,
                                                 cutoff_timestamp=cutoff_timestamp,
                                                 use_cookies=use_cookies)
      if not details:
          return []
      return [entry['Revision'] for entry in details]
  def list_model_revisions_detail(
          self,
          model_id: str,
          cutoff_timestamp: Optional[int] = None,
          use_cookies: Union[bool, CookieJar] = False,
          endpoint: Optional[str] = None) -> List[dict]:
      """Fetch the tag entries of a model created up to a cutoff time.

      Args:
          model_id (str): The model id
          cutoff_timestamp (int): Sent to the backend as ``EndTime``, in
              seconds since the epoch. Defaults to ``get_release_datetime()``.
          use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
              will load cookie from local. Defaults to False.
          endpoint: the endpoint to use, default to None to use endpoint specified in the class

      Returns:
          List[dict]: The ``Tags`` entries of the backend's ``RevisionMap``.
          Callers in this class read a ``'Revision'`` key from each entry.
      """
      cookies = self._check_cookie(use_cookies)
      if cutoff_timestamp is None:
          cutoff_timestamp = get_release_datetime()
      if not endpoint:
          endpoint = self.endpoint
      # A single f-string: mixing f-string and %-formatting would treat a
      # literal '%' in endpoint or model_id as a format directive.
      path = f'{endpoint}/api/v1/models/{model_id}/revisions?EndTime={cutoff_timestamp}'
      r = self.session.get(path, cookies=cookies,
                           headers=self.builder_headers(self.headers))
      handle_http_response(r, logger, cookies, model_id)
      d = r.json()
      raise_on_error(d)
      info = d[API_RESPONSE_FIELD_DATA]
      # tags returned from backend are guaranteed to be ordered by create-time
      return info['RevisionMap']['Tags']
  def get_branch_tag_detail(self, details, name):
      """Return the first entry of ``details`` whose ``'Revision'`` equals ``name``.

      Args:
          details: Iterable of mappings that each have a ``'Revision'`` key.
          name: Revision name to look for.

      Returns:
          The matching entry, or ``None`` when there is no match.
      """
      return next((entry for entry in details if entry['Revision'] == name), None)
  def get_valid_revision_detail(self,
                                model_id: str,
                                revision=None,
                                cookies: Optional[CookieJar] = None,
                                endpoint: Optional[str] = None):
      """Resolve ``revision`` to the branch/tag detail entry that should be used.

      Two modes exist, selected by comparing ``get_release_datetime()`` with
      the current time:

      * Development mode (release time more than ``ONE_YEAR_SECONDS`` in the
        future): a missing revision defaults to ``MASTER_MODEL_BRANCH``. The
        revision must be an existing branch or tag. A tag of that name takes
        precedence over a branch.
      * Release mode: an existing branch name is returned directly. With no
        tags, only the master branch is accepted. With tags and no revision,
        the first tag created no later than the release time is used (master
        if there is none). With tags and a revision, it must be a tag or the
        master branch.

      Args:
          model_id (str): The model id.
          revision (str, optional): Requested branch or tag name.
          cookies (CookieJar, optional): Cookies for the revision query.
          endpoint (str, optional): Endpoint to use. Falls back to
              ``self.endpoint`` when falsy.

      Returns:
          The detail entry of the selected revision, as returned by
          :meth:`get_branch_tag_detail` (``None`` if no entry matches).

      Raises:
          NotExistError: The requested revision is not valid for the model.
      """
      if not endpoint:
          endpoint = self.endpoint
      release_timestamp = get_release_datetime()
      current_timestamp = int(round(datetime.datetime.now().timestamp()))
      # for active development in library codes (non-release-branches), release_timestamp
      # is set to be a far-away-time-in-the-future, to ensure that we shall
      # get the master-HEAD version from model repo by default (when no revision is provided)
      all_branches_detail, all_tags_detail = self.get_model_branches_and_tags_details(
          model_id, use_cookies=False if cookies is None else cookies, endpoint=endpoint)
      # Normalize a missing (None) list once, so the len()/iteration calls
      # below cannot fail on it.
      all_branches_detail = all_branches_detail or []
      all_tags_detail = all_tags_detail or []
      all_branches = [x['Revision'] for x in all_branches_detail]
      all_tags = [x['Revision'] for x in all_tags_detail]

      if release_timestamp > current_timestamp + ONE_YEAR_SECONDS:
          if revision is None:
              revision = MASTER_MODEL_BRANCH
              logger.info(
                  'Model revision not specified, using default [%s] version.'
                  % revision)
          if revision not in all_branches and revision not in all_tags:
              raise NotExistError('The model: %s has no revision : %s .' % (model_id, revision))

          # A tag of that name wins over a branch of the same name.
          revision_detail = self.get_branch_tag_detail(all_tags_detail, revision)
          if revision_detail is None:
              revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
          logger.debug('Development mode use revision: %s' % revision)
      else:
          if revision is not None and revision in all_branches:
              revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
              return revision_detail

          if not all_tags_detail:  # use no revision use master as default.
              if revision is None or revision == MASTER_MODEL_BRANCH:
                  revision = MASTER_MODEL_BRANCH
              else:
                  raise NotExistError('The model: %s has no revision: %s !' % (model_id, revision))
              revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
          else:
              if revision is None:  # user not specified revision, use latest revision before release time
                  revisions_detail = [x for x in all_tags_detail
                                      if x['CreatedAt'] <= release_timestamp]
                  if len(revisions_detail) > 0:
                      revision = revisions_detail[0]['Revision']  # use latest revision before release time.
                      revision_detail = revisions_detail[0]
                  else:
                      revision = MASTER_MODEL_BRANCH
                      revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
                  vl = '[%s]' % ','.join(all_tags)
                  logger.warning('Model revision should be specified from revisions: %s' % (vl))
                  logger.warning('Model revision not specified, use revision: %s' % revision)
              else:
                  # use user-specified revision
                  if revision not in all_tags:
                      if revision == MASTER_MODEL_BRANCH:
                          logger.warning('Using the master branch is fragile, please use it with caution!')
                          revision_detail = self.get_branch_tag_detail(all_branches_detail, revision)
                      else:
                          vl = '[%s]' % ','.join(all_tags)
                          raise NotExistError('The model: %s has no revision: %s valid are: %s!' %
                                              (model_id, revision, vl))
                  else:
                      revision_detail = self.get_branch_tag_detail(all_tags_detail, revision)
                  logger.info('Use user-specified model revision: %s' % revision)
      return revision_detail
  def get_valid_revision(self,
                         model_id: str,
                         revision=None,
                         cookies: Optional[CookieJar] = None,
                         endpoint: Optional[str] = None):
      """Return the name of the revision selected by :meth:`get_valid_revision_detail`.

      Args:
          model_id (str): The model id.
          revision (str, optional): Requested branch or tag name.
          cookies (CookieJar, optional): Cookies for the revision query.
          endpoint (str, optional): Endpoint to use.

      Returns:
          The ``'Revision'`` value of the resolved detail entry.
      """
      detail = self.get_valid_revision_detail(model_id=model_id,
                                              revision=revision,
                                              cookies=cookies,
                                              endpoint=endpoint)
      return detail['Revision']
  def get_model_branches_and_tags_details(
          self,
          model_id: str,
          use_cookies: Union[bool, CookieJar] = False,
          endpoint: Optional[str] = None
  ) -> Tuple[List[str], List[str]]:
      """Fetch the branch and tag entries of a model from the hub.

      Args:
          model_id (str): The model id
          use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
              will load cookie from local. Defaults to False.
          endpoint: the endpoint to use, default to None to use endpoint specified in the class

      Returns:
          A ``(branches, tags)`` pair taken from the backend's
          ``RevisionMap``. Callers in this class read a ``'Revision'`` key
          from each entry.
      """
      cookies = self._check_cookie(use_cookies)
      base = endpoint if endpoint else self.endpoint
      path = f'{base}/api/v1/models/{model_id}/revisions'

      r = self.session.get(path, cookies=cookies,
                           headers=self.builder_headers(self.headers))
      handle_http_response(r, logger, cookies, model_id)

      payload = r.json()
      raise_on_error(payload)
      data = payload[API_RESPONSE_FIELD_DATA]
      return data['RevisionMap']['Branches'], data['RevisionMap']['Tags']
  def get_model_branches_and_tags(
          self,
          model_id: str,
          use_cookies: Union[bool, CookieJar] = False,
  ) -> Tuple[List[str], List[str]]:
      """Return the branch names and tag names of a model.

      Thin wrapper around :meth:`get_model_branches_and_tags_details` that
      keeps only the ``'Revision'`` value of each entry.

      Args:
          model_id (str): The model id
          use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
              will load cookie from local. Defaults to False.

      Returns:
          Tuple[List[str], List[str]]: ``(branch_names, tag_names)``. Either
          list is empty when the corresponding detail list is empty or missing.
      """
      branch_entries, tag_entries = self.get_model_branches_and_tags_details(
          model_id=model_id, use_cookies=use_cookies)

      branch_names = []
      if branch_entries:
          branch_names = [entry['Revision'] for entry in branch_entries]

      tag_names = []
      if tag_entries:
          tag_names = [entry['Revision'] for entry in tag_entries]

      return branch_names, tag_names
  def get_model_files(self,
                      model_id: str,
                      revision: Optional[str] = DEFAULT_MODEL_REVISION,
                      root: Optional[str] = None,
                      recursive: Optional[bool] = False,
                      use_cookies: Union[bool, CookieJar] = False,
                      headers: Optional[dict] = {},
                      endpoint: Optional[str] = None) -> List[dict]:
      """List the files of a model repository.

      Args:
          model_id (str): The model id
          revision (Optional[str], optional): The branch or tag name. Omitted
              from the query when falsy.
          root (Optional[str], optional): The root path. Defaults to None.
          recursive (Optional[bool], optional): Is recursive list files. Defaults to False.
          use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True,
              will load cookie from local. Defaults to False.
          headers (dict, optional): Request headers. ``None`` means use
              ``self.headers``. The default is an empty dict. The dict is
              copied, so the caller's object is never modified.
          endpoint: the endpoint to use, default to None to use endpoint specified in the class

      Returns:
          List[dict]: Model file list, excluding ``.gitignore`` and
          ``.gitattributes``. Empty when the backend reports no files.
      """
      if not endpoint:
          endpoint = self.endpoint
      if revision:
          path = '%s/api/v1/models/%s/repo/files?Revision=%s&Recursive=%s' % (
              endpoint, model_id, revision, recursive)
      else:
          path = '%s/api/v1/models/%s/repo/files?Recursive=%s' % (
              endpoint, model_id, recursive)
      cookies = self._check_cookie(use_cookies)
      if root is not None:
          path = path + f'&Root={root}'

      # Work on a copy: writing the request id into the original would
      # mutate the shared default dict, the caller's dict, or self.headers.
      request_headers = dict(self.headers if headers is None else headers)
      request_headers['X-Request-ID'] = str(uuid.uuid4().hex)

      r = self.session.get(
          path, cookies=cookies, headers=request_headers)
      handle_http_response(r, logger, cookies, model_id)
      d = r.json()
      raise_on_error(d)

      listed = d[API_RESPONSE_FIELD_DATA]['Files']
      if not listed:
          logger.warning(f'No files found in model {model_id} at revision {revision}.')
          return []
      # Hide git housekeeping files from the listing.
      return [file for file in listed
              if file['Name'] not in ('.gitignore', '.gitattributes')]
  def file_exists(
          self,
          repo_id: str,
          filename: str,
          *,
          revision: Optional[str] = None,
          token: Optional[str] = None,
  ):
      """Tell whether ``filename`` is present in a model repository.

      The repository is listed recursively through :meth:`get_model_files`
      and ``filename`` is compared with each entry's ``'Path'``.

      Args:
          repo_id (`str`): The repo id to use
          filename (`str`): The queried filename, if the file exists in a sub folder,
              please pass <sub-folder-name>/<file-name>
          revision (`Optional[str]`): The repo revision
          token (`Optional[str]`): The access token

      Returns:
          bool: ``True`` if some listed path equals ``filename``.
      """
      jar = self.get_cookies(access_token=token)
      listing = self.get_model_files(
          repo_id,
          recursive=True,
          revision=revision,
          use_cookies=jar if jar is not None else False,
      )
      known_paths = [entry['Path'] for entry in listing]
      return filename in known_paths
  def create_dataset(
          self,
          dataset_name: str,
          namespace: str,
          chinese_name: Optional[str] = '',
          license: Optional[str] = Licenses.APACHE_V2,
          visibility: Optional[int] = DatasetVisibility.PUBLIC,
          description: Optional[str] = '',
          endpoint: Optional[str] = None,
          token: Optional[str] = None
  ) -> str:
      """Create a dataset repository on ModelScope.

      The dataset attributes are posted as multipart form fields to
      ``{endpoint}/api/v1/datasets``.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The namespace (user or organization) for the dataset.
          chinese_name (str, optional): The Chinese name of the dataset. Defaults to ''.
          license (str, optional): The license of the dataset. Defaults to Licenses.APACHE_V2.
          visibility (int, optional): The visibility of the dataset. Defaults to DatasetVisibility.PUBLIC.
          description (str, optional): The description of the dataset. Defaults to ''.
          endpoint (str, optional): The endpoint to use. Falls back to
              ``self.endpoint`` when falsy.
          token (str, optional): The access token for authentication.

      Returns:
          str: The URL of the created dataset repository.

      Raises:
          InvalidParameter: ``dataset_name`` or ``namespace`` is ``None``.
      """
      if dataset_name is None or namespace is None:
          raise InvalidParameter('dataset_name and namespace are required!')

      cookies = self.get_cookies(access_token=token, cookies_required=True)
      endpoint = endpoint or self.endpoint
      path = f'{endpoint}/api/v1/datasets'

      # (None, value) tuples make each entry a plain form field, not a file.
      form_fields = {
          'Name': (None, dataset_name),
          'ChineseName': (None, chinese_name),
          'Owner': (None, namespace),
          'License': (None, license),
          'Visibility': (None, visibility),
          'Description': (None, description)
      }
      r = self.session.post(
          path,
          files=form_fields,
          cookies=cookies,
          headers=self.builder_headers(self.headers),
      )
      handle_http_post_error(r, path, form_fields)
      raise_on_error(r.json())

      dataset_repo_url = f'{endpoint}/datasets/{namespace}/{dataset_name}'
      logger.info(f'Create dataset success: {dataset_repo_url}')
      return dataset_repo_url
  def delete_dataset(self,
                     dataset_id: str,
                     endpoint: Optional[str] = None,
                     token: Optional[str] = None):
      """
      @deprecated
      Remove a dataset from ModelScope.

      A `DeprecationWarning` is emitted on every call; the request is still sent.

      Args:
          dataset_id (str): The dataset id to delete.
          endpoint (str, optional): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (str, optional): The access token for authentication.

      Returns:
          None

      Raises:
          ValueError: If no cookies could be obtained.
      """
      warnings.warn(
          'This function is deprecated due to security reasons, '
          'and will be recovered in future versions with proper token authentication. ',
          DeprecationWarning,
          stacklevel=2
      )

      auth_cookies = self.get_cookies(access_token=token, cookies_required=True)
      base_url = endpoint or self.endpoint
      if auth_cookies is None:
          raise ValueError('Token does not exist, please login first.')

      delete_url = f'{base_url}/api/v1/datasets/{dataset_id}'
      response = self.session.delete(
          delete_url,
          cookies=auth_cookies,
          headers=self.builder_headers(self.headers))
      raise_for_http_status(response)
      raise_on_error(response.json())
  def get_dataset_id_and_type(self,
                              dataset_name: str,
                              namespace: str,
                              endpoint: Optional[str] = None,
                              token: Optional[str] = None):
      """
      Look up the hub id and type of a dataset.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          tuple: `(dataset_id, dataset_type)` read from the `Data` field of the response.
      """
      base_url = endpoint or self.endpoint
      info_url = f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}'

      response = self.session.get(info_url, cookies=self.get_cookies(access_token=token))
      body = response.json()
      datahub_raise_on_error(info_url, body, response)

      return body['Data']['Id'], body['Data']['Type']
  def list_repo_tree(self,
                     dataset_name: str,
                     namespace: str,
                     revision: str,
                     root_path: str,
                     recursive: bool = True,
                     page_number: int = 1,
                     page_size: int = 100,
                     endpoint: Optional[str] = None,
                     token: Optional[str] = None):
      """
      @deprecated: Use `get_dataset_files` instead.

      List one page of a dataset repository tree.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (str): The branch or tag name; 'master' is used when empty.
          root_path (str): The directory to list; '/' is used when empty.
          recursive (bool): Whether to list recursively. Defaults to True.
          page_number (int): The page number for pagination. Defaults to 1.
          page_size (int): The number of items per page. Defaults to 100.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          dict: The full decoded JSON response (not only its `Data` part).
      """
      # stacklevel=2 attributes the warning to the caller's line rather than
      # to this library function.
      warnings.warn('The function `list_repo_tree` is deprecated, use `get_dataset_files` instead.',
                    DeprecationWarning,
                    stacklevel=2)

      # Only the hub id is needed here; the dataset type is discarded.
      dataset_hub_id, _ = self.get_dataset_id_and_type(
          dataset_name=dataset_name, namespace=namespace, endpoint=endpoint, token=token)

      if not endpoint:
          endpoint = self.endpoint
      datahub_url = f'{endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree'
      params = {
          'Revision': revision if revision else 'master',
          'Root': root_path if root_path else '/',
          # The flag is sent as the literal strings 'True' / 'False'.
          'Recursive': 'True' if recursive else 'False',
          'PageNumber': page_number,
          'PageSize': page_size,
      }

      cookies = self.get_cookies(access_token=token)
      r = self.session.get(datahub_url, params=params, cookies=cookies)
      resp = r.json()
      datahub_raise_on_error(datahub_url, resp, r)
      return resp
  def list_repo_commits(self,
                        repo_id: str,
                        *,
                        repo_type: Optional[str] = REPO_TYPE_MODEL,
                        revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
                        page_number: int = 1,
                        page_size: int = 50,
                        endpoint: Optional[str] = None,
                        token: Optional[str] = None):
      """
      Get the commit history for a repository.

      Args:
          repo_id (str): The repository id, in the format of `namespace/repo_name`.
          repo_type (Optional[str]): The type of the repository. Supported types are `model` and `dataset`.
              An empty value is treated as `model`.
          revision (str): The branch or tag name. Defaults to `DEFAULT_REPOSITORY_REVISION`.
          page_number (int): The page number for pagination. Defaults to 1.
          page_size (int): The number of commits per page. Defaults to 50.
          endpoint (Optional[str]): The endpoint to use, defaults to None to use the endpoint specified in the class.
          token (Optional[str]): The access token.

      Returns:
          CommitHistoryResponse: The commit history response, or None when the
              response `Code` is not `HTTPStatus.OK` and `raise_on_error` did not raise.

      Raises:
          ValueError: If `repo_id` is not a relative `namespace/repo_name` id.
          Exception: If the HTTP request fails; the original request error is chained as `__cause__`.

      Examples:
          >>> from modelscope.hub.api import HubApi
          >>> api = HubApi()
          >>> commit_history = api.list_repo_commits('meituan/Meeseeks')
          >>> print(f"Total commits: {commit_history.total_count}")
          >>> for commit in commit_history.commits:
          ...     print(f"{commit.short_id}: {commit.title}")
      """
      if not (is_relative_path(repo_id) and repo_id.count('/') == 1):
          raise ValueError(f'Invalid repo_id: {repo_id} !')

      if not endpoint:
          endpoint = self.endpoint

      # The URL segment is the pluralised repo type, e.g. `models` / `datasets`.
      collection = f'{repo_type}s' if repo_type else 'models'
      commits_url = f'{endpoint}/api/v1/{collection}/{repo_id}/commits'
      params = {
          'Ref': revision or DEFAULT_REPOSITORY_REVISION,
          'PageNumber': page_number,
          'PageSize': page_size
      }
      cookies = self.get_cookies(access_token=token)

      try:
          r = self.session.get(commits_url, params=params,
                               cookies=cookies, headers=self.builder_headers(self.headers))
          raise_for_http_status(r)
          resp = r.json()
          raise_on_error(resp)
          if resp.get('Code') == HTTPStatus.OK:
              return CommitHistoryResponse.from_api_response(resp)
      except requests.exceptions.RequestException as e:
          # Chain the cause so the underlying network error stays in the traceback.
          raise Exception(f'Failed to get repository commits for {repo_id}: {str(e)}') from e
      return None
  def get_dataset_files(self,
                        repo_id: str,
                        *,
                        revision: str = DEFAULT_REPOSITORY_REVISION,
                        root_path: str = '/',
                        recursive: bool = True,
                        page_number: int = 1,
                        page_size: int = 100,
                        endpoint: Optional[str] = None,
                        token: Optional[str] = None):
      """
      List one page of files in a dataset repository.

      Args:
          repo_id (str): The repository id, in the format of `namespace/dataset_name`.
          revision (str): The branch or tag name. Defaults to `DEFAULT_REPOSITORY_REVISION`.
          root_path (str): The root path to list. Defaults to '/'.
          recursive (bool): Whether to list recursively. Defaults to True.
          page_number (int): The page number for pagination. Defaults to 1.
          page_size (int): The number of items per page. Defaults to 100.
          endpoint (Optional[str]): The endpoint to use, defaults to None to use the endpoint specified in the class.
          token (Optional[str]): The access token.

      Returns:
          List: The `Files` entries of the dataset repository tree.
              e.g. [{'CommitId': None, 'CommitMessage': '...', 'Size': 0, 'Type': 'tree'}, ...]

      Raises:
          ValueError: If `repo_id` is not a relative `namespace/dataset_name` id.
      """
      if not (is_relative_path(repo_id) and repo_id.count('/') == 1):
          raise ValueError(f'Invalid repo_id: {repo_id} !')
      owner, name = repo_id.split('/')

      # The tree API is addressed by the hub id, so resolve it first.
      hub_id, _ = self.get_dataset_id_and_type(
          dataset_name=name, namespace=owner, endpoint=endpoint, token=token)

      base_url = endpoint or self.endpoint
      tree_url = f'{base_url}/api/v1/datasets/{hub_id}/repo/tree'
      query = dict(
          Revision=revision,
          Root=root_path,
          Recursive='True' if recursive else 'False',
          PageNumber=page_number,
          PageSize=page_size,
      )

      response = self.session.get(
          tree_url, params=query, cookies=self.get_cookies(access_token=token))
      body = response.json()
      datahub_raise_on_error(tree_url, body, response)
      return body['Data']['Files']
  def get_dataset(
          self,
          dataset_id: str,
          revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
          endpoint: Optional[str] = None,
          token: Optional[str] = None
  ):
      """
      Fetch the information record of a dataset.

      Args:
          dataset_id (str): The dataset id.
          revision (Optional[str]): The revision of the dataset; omitted from the
              request when empty.
          endpoint (Optional[str]): The endpoint to use, defaults to None to use the endpoint specified in the class.
          token (Optional[str]): The access token.

      Returns:
          dict: The dataset information (the data field of the response).
      """
      auth_cookies = self.get_cookies(access_token=token)
      base_url = endpoint or self.endpoint

      info_url = f'{base_url}/api/v1/datasets/{dataset_id}'
      if revision:
          info_url += f'?Revision={revision}'

      response = self.session.get(
          info_url, cookies=auth_cookies, headers=self.builder_headers(self.headers))
      raise_for_http_status(response)
      body = response.json()
      datahub_raise_on_error(info_url, body, response)
      return body[API_RESPONSE_FIELD_DATA]
  def get_dataset_meta_file_list(self, dataset_name: str, namespace: str,
                                 dataset_id: str, revision: str, endpoint: Optional[str] = None,
                                 token: Optional[str] = None):
      """
      Get the meta file-list of the dataset.

      Args:
          dataset_name (str): The dataset name; only used in the error message.
          namespace (str): The dataset owner; only used in the error message.
          dataset_id (str): The dataset id used to address the repository tree.
          revision (str): The revision to list.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          The `Files` entry of the response data.

      Raises:
          NotExistError: If the response carries no `Data`.
      """
      base_url = endpoint or self.endpoint
      tree_url = f'{base_url}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'

      response = self.session.get(
          tree_url,
          cookies=self.get_cookies(access_token=token),
          headers=self.builder_headers(self.headers))
      body = response.json()
      datahub_raise_on_error(tree_url, body, response)

      tree = body['Data']
      if tree is None:
          raise NotExistError(
              f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
              f'version = {revision}] dose not exist')
      return tree['Files']
  @staticmethod
  def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
      """
      Write a marker file whose name encodes the dataset type.

      The marker lets the dataset formation be recovered from the local cache
      without calling the datahub. More details, please refer to the class
      `modelscope.utils.constant.DatasetFormations`.

      Args:
          dataset_type (int): The dataset type; becomes the stem of the marker file name.
          meta_cache_dir (str): The directory in which the marker file is written.
      """
      marker_name = f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}'
      marker_path = os.path.join(meta_cache_dir, marker_name)
      with open(marker_path, 'w') as marker:
          marker.write('*** Automatically-generated file, do not modify ***')
  def get_dataset_meta_files_local_paths(self, dataset_name: str,
                                         namespace: str,
                                         revision: str,
                                         meta_cache_dir: str, dataset_type: int, file_list: list,
                                         endpoint: Optional[str] = None,
                                         token: Optional[str] = None):
      """
      Download the dataset's meta files into the local cache and group them by extension.

      Only files whose extension belongs to the meta formats of the dataset's
      formation are considered. A file that already exists locally is reused
      without contacting the server.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (str): The revision to download from.
          meta_cache_dir (str): The local directory that holds the meta files.
          dataset_type (int): The dataset type, converted to a `DatasetFormations` member.
          file_list (list): Entries with a `Path` key, as returned by the repo tree API.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          tuple: `(local_paths, dataset_formation)` where `local_paths` maps a file
              extension to the list of local file paths with that extension.
      """
      local_paths = defaultdict(list)
      dataset_formation = DatasetFormations(dataset_type)
      dataset_meta_format = DatasetMetaFormats[dataset_formation]
      cookies = self.get_cookies(access_token=token)

      # Dump the data_type as a local file
      HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)

      if not endpoint:
          endpoint = self.endpoint

      for file_info in file_list:
          file_path = file_info['Path']
          extension = os.path.splitext(file_path)[-1]
          if extension not in dataset_meta_format:
              continue

          local_path = os.path.join(meta_cache_dir, file_path)
          # Check the cache before issuing the request so cached files cost no
          # network round-trip.
          if os.path.exists(local_path):
              logger.warning(
                  f"Reusing dataset {dataset_name}'s python file ({local_path})"
              )
              local_paths[extension].append(local_path)
              continue

          datahub_url = f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                        f'Revision={revision}&FilePath={file_path}'
          r = self.session.get(datahub_url, cookies=cookies)
          raise_for_http_status(r)

          # `file_path` may point into a sub-folder that does not exist locally yet.
          parent_dir = os.path.dirname(local_path)
          if parent_dir:
              os.makedirs(parent_dir, exist_ok=True)
          with open(local_path, 'wb') as f:
              f.write(r.content)
          local_paths[extension].append(local_path)

      return local_paths, dataset_formation
  @staticmethod
  def fetch_meta_files_from_url(url, out_path, chunk_size=1024, mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
                                token: Optional[str] = None):
      """
      Fetch the meta-data files from the url, e.g. csv/jsonl files.

      The content is cached under `out_path` in a file named after the MD5 hex
      digest of `url`. A `jsonl` source is converted to CSV while streaming;
      any other source is copied line by line.

      Args:
          url (str): The url of the meta-data file.
          out_path (str): The cache directory.
          chunk_size (int): The number of lines processed per chunk. Defaults to 1024.
          mode (DownloadMode): With `FORCE_REDOWNLOAD` an existing cached file is
              removed first; otherwise an existing cached file is returned as is.
          token (Optional[str]): The access token.

      Returns:
          str: The path of the cached meta-data file.
      """
      import hashlib
      from tqdm.auto import tqdm
      import pandas as pd

      # The digest only serves as a cache key derived from the url.
      out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest())
      if mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(out_path):
          os.remove(out_path)
      if os.path.exists(out_path):
          logger.info(f'Reusing cached meta-data file: {out_path}')
          return out_path

      cookies = HubApi().get_cookies(access_token=token)
      logger.info('Loading meta-data file ...')

      is_jsonl = url.endswith('jsonl')
      # Download into a side file first so an interrupted transfer can never be
      # mistaken for a complete cache entry on the next call.
      tmp_path = f'{out_path}.incomplete'

      def get_chunk(resp):
          """Yield (lines, approx_bytes) groups of up to `chunk_size` decoded lines."""
          lines, n_bytes = [], 0
          for raw in resp.iter_lines():
              lines.append(raw.decode('utf-8'))
              # +1 approximates the stripped line terminator.
              n_bytes += len(raw) + 1
              if len(lines) >= chunk_size:
                  yield lines, n_bytes
                  lines, n_bytes = [], 0
          yield lines, n_bytes

      completed = False
      try:
          with requests.get(url, cookies=cookies, stream=True) as response:
              # Fail before anything is written, so an error page is never cached.
              raise_for_http_status(response)
              total_size = int(response.headers.get('content-length', 0))
              with tqdm(total=total_size, dynamic_ncols=True) as progress, open(tmp_path, 'w') as f:
                  header_pending = True
                  for chunk, n_bytes in get_chunk(response):
                      # The total is in bytes, so advance by bytes as well.
                      progress.update(n_bytes)
                      if is_jsonl:
                          records = [json.loads(line) for line in chunk if line.strip()]
                          if not records:
                              continue
                          # Only the first non-empty chunk carries the CSV header.
                          pd.DataFrame(records).to_csv(
                              f, index=False, header=header_pending, escapechar='\\')
                          header_pending = False
                      else:
                          # csv or others
                          f.writelines(line + '\n' for line in chunk)
          os.replace(tmp_path, out_path)
          completed = True
      finally:
          if not completed and os.path.exists(tmp_path):
              os.remove(tmp_path)

      return out_path
  def get_dataset_file_url(
          self,
          file_name: str,
          dataset_name: str,
          namespace: str,
          revision: Optional[str] = DEFAULT_DATASET_REVISION,
          view: Optional[bool] = False,
          extension_filter: Optional[bool] = True,
          endpoint: Optional[str] = None):
      """
      Build the download url of a single file in a dataset repository.

      Args:
          file_name (str): The path of the file inside the repository.
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (Optional[str]): The revision. Defaults to `DEFAULT_DATASET_REVISION`.
          view (Optional[bool]): Forwarded as the `View` query parameter.
          extension_filter (Optional[bool]): Accepted but not used by this method.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.

      Returns:
          str: The url, with the query string url-encoded.

      Raises:
          ValueError: If `file_name`, `dataset_name` or `namespace` is empty.
      """
      if not (file_name and dataset_name and namespace):
          raise ValueError('Args (file_name, dataset_name, namespace) cannot be empty!')

      # NOTE(review): an earlier comment asked for FilePath to be the last
      # parameter in the url, yet `View` is placed after it. The order is kept
      # exactly as before; confirm which one the server expects.
      query = urlencode({
          'Source': 'SDK',
          'Revision': revision,
          'FilePath': file_name,
          'View': view,
      })

      base_url = endpoint or self.endpoint
      return f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/repo?{query}'
  def get_dataset_file_url_origin(
          self,
          file_name: str,
          dataset_name: str,
          namespace: str,
          revision: Optional[str] = DEFAULT_DATASET_REVISION,
          endpoint: Optional[str] = None):
      """
      Turn a meta file name into its repository url; return other names unchanged.

      Args:
          file_name (str): The file name or path.
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (Optional[str]): The revision. Defaults to `DEFAULT_DATASET_REVISION`.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.

      Returns:
          str: The repository url when the extension of `file_name` is in
              `META_FILES_FORMAT`, otherwise `file_name` as passed in.
      """
      base_url = endpoint or self.endpoint

      if not file_name:
          return file_name
      if os.path.splitext(file_name)[-1] not in META_FILES_FORMAT:
          return file_name

      return (f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/repo?'
              f'Revision={revision}&FilePath={file_name}')
  def get_dataset_access_config(
          self,
          dataset_name: str,
          namespace: str,
          revision: Optional[str] = DEFAULT_DATASET_REVISION,
          endpoint: Optional[str] = None,
          token: Optional[str] = None):
      """
      Request the `ststoken` access configuration of a dataset revision.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (Optional[str]): The revision. Defaults to `DEFAULT_DATASET_REVISION`.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          The `Data` field returned by `datahub_remote_call`.
      """
      base_url = endpoint or self.endpoint
      sts_url = (f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/'
                 f'ststoken?Revision={revision}')
      return self.datahub_remote_call(sts_url, token=token)
  def get_dataset_access_config_session(
          self,
          dataset_name: str,
          namespace: str,
          check_cookie: bool,
          revision: Optional[str] = DEFAULT_DATASET_REVISION,
          endpoint: Optional[str] = None,
          token: Optional[str] = None):
      """
      Request the `ststoken` access configuration through the shared session.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          check_cookie (bool): When True the cookies come from `_check_cookie(use_cookies=True)`;
              otherwise from `get_cookies` with the given token.
          revision (Optional[str]): The revision. Defaults to `DEFAULT_DATASET_REVISION`.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token (only used when `check_cookie` is False).

      Returns:
          The `Data` field of the response.
      """
      base_url = endpoint or self.endpoint
      sts_url = (f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/'
                 f'ststoken?Revision={revision}')

      auth_cookies = (self._check_cookie(use_cookies=True)
                      if check_cookie
                      else self.get_cookies(access_token=token))

      body = self.session.get(
          url=sts_url,
          cookies=auth_cookies,
          headers=self.builder_headers(self.headers)).json()
      raise_on_error(body)
      return body['Data']
  def get_virgo_meta(self, dataset_id: str, version: int = 1, token: Optional[str] = None) -> dict:
      """
      Get virgo dataset meta info.

      Args:
          dataset_id (str): The virgo dataset id.
          version (int): The dataset version. Defaults to 1.
          token (Optional[str]): The access token.

      Returns:
          dict: The `data` field of the virgo response.

      Raises:
          RuntimeError: If the virgo endpoint environment variable is unset, or
              the response `code` is not 0.
      """
      virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
      if not virgo_endpoint:
          raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')

      virgo_dataset_url = f'{virgo_endpoint}/data/set/download'

      # `get_cookies` may return None when no credentials are available, and
      # `dict_from_cookiejar(None)` would fail with a TypeError. Send no
      # cookies in that case and let the server answer.
      cookie_jar = self.get_cookies(access_token=token)
      cookies = requests.utils.dict_from_cookiejar(cookie_jar) if cookie_jar is not None else {}

      dataset_info = dict(
          dataSetId=dataset_id,
          dataSetVersion=version
      )
      data = dict(
          data=dataset_info,
      )
      r = self.session.post(url=virgo_dataset_url,
                            json=data,
                            cookies=cookies,
                            headers=self.builder_headers(self.headers),
                            timeout=900)
      resp = r.json()
      if resp['code'] != 0:
          raise RuntimeError(f'Failed to get virgo dataset: {resp}')
      return resp['data']
  def get_dataset_access_config_for_unzipped(self,
                                             dataset_name: str,
                                             namespace: str,
                                             revision: str,
                                             zip_file_name: str,
                                             endpoint: Optional[str] = None,
                                             token: Optional[str] = None):
      """
      Get the `ststoken` access configuration, extended with the unzipped directory.

      Two requests are made: one for the dataset record (to read its
      visibility) and one for the sts token of the given revision.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (str): The revision.
          zip_file_name (str): The zip file name, used as the last part of the directory name.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          The sts `Data` payload with an added `Dir` entry of the form
          `<visibility>-unzipped/<namespace>_<dataset_name>_<zip_file_name>`.
      """
      base_url = endpoint or self.endpoint
      dataset_url = f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}'
      auth_cookies = self.get_cookies(access_token=token)

      # get visibility of the dataset
      info_body = self.session.get(
          url=dataset_url, cookies=auth_cookies,
          headers=self.builder_headers(self.headers)).json()
      raise_on_error(info_body)
      visibility = VisibilityMap.get(info_body['Data']['Visibility'])

      sts_body = self.session.get(
          url=f'{dataset_url}/ststoken?Revision={revision}', cookies=auth_cookies,
          headers=self.builder_headers(self.headers)).json()
      raise_on_error(sts_body)

      sts_config = sts_body['Data']
      unzipped_name = '_'.join((namespace, dataset_name, zip_file_name))
      sts_config['Dir'] = visibility + '-unzipped/' + unzipped_name
      return sts_config
  def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
                               is_recursive, is_filter_dir, revision, endpoint: Optional[str] = None,
                               token: Optional[str] = None):
      """
      List the OSS objects of a dataset.

      Args:
          dataset_name: The name of the dataset.
          namespace: The owner of the dataset.
          max_limit: Sent as the `MaxLimit` query parameter.
          is_recursive: Sent as the `Recursive` query parameter.
          is_filter_dir: Sent as the `FilterDir` query parameter.
          revision: Sent as the `Revision` query parameter.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          The `Data` field of the response.
      """
      base_url = endpoint or self.endpoint
      tree_url = (f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?'
                  f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}')

      auth_cookies = self.get_cookies(access_token=token)
      body = self.session.get(url=tree_url, cookies=auth_cookies, timeout=1800).json()
      raise_on_error(body)
      return body['Data']
  def delete_oss_dataset_object(self, object_name: str, dataset_name: str,
                                namespace: str, revision: str, endpoint: Optional[str] = None,
                                token: Optional[str] = None) -> str:
      """
      Delete a single OSS object of a dataset.

      Args:
          object_name (str): The object path, sent as the `Path` query parameter.
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (str): The revision.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          str: The `Message` field of the response.

      Raises:
          ValueError: If any of the four required arguments is empty.
      """
      if not all((object_name, dataset_name, namespace, revision)):
          raise ValueError('Args cannot be empty!')

      base_url = endpoint or self.endpoint
      object_url = f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}'

      auth_cookies = self.get_cookies(access_token=token, cookies_required=True)
      body = self.session.delete(url=object_url, cookies=auth_cookies).json()
      raise_on_error(body)
      return body['Message']
  def delete_oss_dataset_dir(self, object_name: str, dataset_name: str,
                             namespace: str, revision: str, endpoint: Optional[str] = None,
                             token: Optional[str] = None) -> str:
      """
      Delete every OSS object of a dataset under a directory prefix.

      Args:
          object_name (str): The directory name; a trailing '/' is appended to
              form the `Prefix` query parameter.
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          revision (str): The revision.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.

      Returns:
          str: The `Message` field of the response.

      Raises:
          ValueError: If any of the four required arguments is empty.
      """
      if not all((object_name, dataset_name, namespace, revision)):
          raise ValueError('Args cannot be empty!')

      base_url = endpoint or self.endpoint
      prefix_url = (f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/oss/prefix?Prefix={object_name}/'
                    f'&Revision={revision}')

      auth_cookies = self.get_cookies(access_token=token, cookies_required=True)
      body = self.session.delete(url=prefix_url, cookies=auth_cookies).json()
      raise_on_error(body)
      return body['Message']
  def datahub_remote_call(self, url, token: Optional[str] = None):
      """
      Issue a GET against a datahub url and return the `Data` field of the reply.

      Args:
          url: The full url to request.
          token (Optional[str]): The access token.

      Returns:
          The `Data` field of the decoded JSON response.
      """
      auth_cookies = self.get_cookies(access_token=token)
      # Only the user agent is sent here, not the instance-level headers.
      agent_header = {'user-agent': ModelScopeConfig.get_user_agent()}

      response = self.session.get(url, cookies=auth_cookies, headers=agent_header)
      body = response.json()
      datahub_raise_on_error(url, body, response)
      return body['Data']
  def dataset_download_statistics(self, dataset_name: str, namespace: str,
                                  use_streaming: bool = False, endpoint: Optional[str] = None,
                                  token: Optional[str] = None) -> None:
      """
      Report a dataset download to the hub (download count and download uv).

      Nothing is reported when `dataset_name` or `namespace` is empty, when the
      `CI_TEST` environment variable equals 'True', or when `use_streaming` is
      set. Any failure while reporting is logged and not raised.

      Args:
          dataset_name (str): The name of the dataset.
          namespace (str): The owner of the dataset.
          use_streaming (bool): Whether the dataset is consumed in streaming mode.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.
          token (Optional[str]): The access token.
      """
      ci_run = os.getenv('CI_TEST') == 'True'
      base_url = endpoint or self.endpoint

      if not (dataset_name and namespace) or ci_run or use_streaming:
          return

      try:
          auth_cookies = self.get_cookies(access_token=token)
          stats_root = f'{base_url}/api/v1/datasets/{namespace}/{dataset_name}/download'

          # Download count
          count_resp = self.session.post(
              f'{stats_root}/increase', cookies=auth_cookies,
              headers=self.builder_headers(self.headers))
          raise_for_http_status(count_resp)

          # Download uv
          channel = os.environ.get(MODELSCOPE_CLOUD_ENVIRONMENT, DownloadChannel.LOCAL.value)
          user_name = os.environ.get(MODELSCOPE_CLOUD_USERNAME, '')
          uv_resp = self.session.post(
              f'{stats_root}/uv/{channel}?user={user_name}', cookies=auth_cookies,
              headers=self.builder_headers(self.headers))
          raise_on_error(uv_resp.json())
      except Exception as err:
          # Statistics are best-effort: never let them break the download itself.
          logger.error(err)
  def builder_headers(self, headers):
      """
      Return a copy of `headers` extended with a fresh request id.

      The request id is the hex form of a new random UUID, stored under the
      `MODELSCOPE_REQUEST_ID` key. An entry with the same key in `headers`
      takes precedence because `headers` is merged last.

      Args:
          headers: The mapping of headers to extend.

      Returns:
          dict: A new dict; `headers` itself is not modified.
      """
      request_id = str(uuid.uuid4().hex)
      return {MODELSCOPE_REQUEST_ID: request_id, **headers}
  def get_file_base_path(self, repo_id: str, endpoint: Optional[str] = None) -> str:
      """
      Build the base url (ending in `repo?`) used to address files of a dataset.

      Args:
          repo_id (str): The dataset id in the form `namespace/dataset_name`.
          endpoint (Optional[str]): The endpoint to use. Falls back to `self.endpoint` when empty.

      Returns:
          str: `<endpoint>/api/v1/datasets/<namespace>/<dataset_name>/repo?`

      Raises:
          ValueError: If `repo_id` does not split into exactly two parts on '/'.
      """
      owner, name = repo_id.split('/')
      base_url = endpoint if endpoint else self.endpoint
      return f'{base_url}/api/v1/datasets/{owner}/{name}/repo?'
  def create_repo(
          self,
          repo_id: str,
          *,
          token: Union[str, bool, None] = None,
          visibility: Optional[str] = Visibility.PUBLIC,
          repo_type: Optional[str] = REPO_TYPE_MODEL,
          chinese_name: Optional[str] = None,
          license: Optional[str] = Licenses.APACHE_V2,
          endpoint: Optional[str] = None,
          exist_ok: Optional[bool] = False,
          create_default_config: Optional[bool] = True,
          aigc_model: Optional[AigcModel] = None,
          **kwargs,
  ) -> str:
      """
      Create a repository on the ModelScope Hub.

      Args:
          repo_id (str): The repo id in the format of `owner_name/repo_name`.
          token (Union[str, bool, None]): The access token.
          visibility (Optional[str]): The visibility of the repo,
              could be `public`, `private`, `internal`, default to `public`.
          repo_type (Optional[str]): The repo type, default to `model`.
          chinese_name (Optional[str]): The Chinese name of the repo.
          license (Optional[str]): The license of the repo, default to `apache-2.0`.
          endpoint (Optional[str]): The endpoint to use.
              In the format of `https://www.modelscope.cn` or 'https://www.modelscope.ai'
          exist_ok (Optional[bool]): If the repo exists, whether to return the repo url directly.
          create_default_config (Optional[bool]): If True, create a default configuration file in the model repo.
          aigc_model (Optional[AigcModel]): Forwarded to `create_model`; only used for model repos.
          **kwargs: The additional arguments. `config_json` (dict) is merged over
              the default configuration written to a new model repo.

      Returns:
          str: The repo url.

      Raises:
          ValueError: If `repo_id` is empty or malformed, the repo already exists
              and `exist_ok` is False, the visibility is not supported, or the
              repo type is not supported.
      """
      if not repo_id:
          raise ValueError('Repo id cannot be empty!')

      if not endpoint:
          endpoint = self.endpoint

      repo_exists: bool = self.repo_exists(repo_id, repo_type=repo_type, endpoint=endpoint, token=token)
      if repo_exists:
          if exist_ok:
              repo_url: str = f'{endpoint}/{repo_type}s/{repo_id}'
              logger.warning(f'Repo {repo_id} already exists, got repo url: {repo_url}')
              return repo_url
          raise ValueError(f'Repo {repo_id} already exists!')

      repo_id_list = repo_id.split('/')
      if len(repo_id_list) != 2:
          raise ValueError('Invalid repo id, should be in the format of `owner_name/repo_name`')
      namespace, repo_name = repo_id_list

      def _visibility_code(visibility_cls) -> int:
          """Map the textual `visibility` onto the integer constant of `visibility_cls`."""
          codes = {k: v for k, v in visibility_cls.__dict__.items() if not k.startswith('__')}
          try:
              key = visibility.upper()
          except AttributeError:
              # e.g. visibility=None: report it as invalid instead of crashing.
              key = None
          code = codes.get(key)
          if code is None:
              # Report the value the caller passed, not the failed lookup result.
              raise ValueError(f'Invalid visibility: {visibility}, '
                               f'supported visibilities: `public`, `private`, `internal`')
          return code

      if repo_type == REPO_TYPE_MODEL:
          # NOTE(review): `endpoint` is not forwarded to `create_model` /
          # `Repository` because their signatures are not visible from here;
          # confirm whether they should receive it as well.
          repo_url: str = self.create_model(
              model_id=repo_id,
              visibility=_visibility_code(ModelVisibility),
              license=license,
              chinese_name=chinese_name,
              aigc_model=aigc_model,
              token=token,
          )
          if create_default_config:
              with tempfile.TemporaryDirectory() as temp_cache_dir:
                  from modelscope.hub.repository import Repository
                  repo = Repository(temp_cache_dir, repo_id, auth_token=token)
                  default_config = {
                      'framework': 'pytorch',
                      'task': 'text-generation',
                      'allow_remote': True
                  }
                  config_json = kwargs.get('config_json')
                  if not config_json:
                      config_json = {}
                  # Caller-supplied entries override the defaults.
                  config = {**default_config, **config_json}
                  add_content_to_file(
                      repo,
                      'configuration.json', [json.dumps(config)],
                      ignore_push_error=True)
          print(f'New model created successfully at {repo_url}.', flush=True)

      elif repo_type == REPO_TYPE_DATASET:
          # Create on the same endpoint that the existence check used.
          repo_url: str = self.create_dataset(
              dataset_name=repo_name,
              namespace=namespace,
              chinese_name=chinese_name,
              license=license,
              visibility=_visibility_code(DatasetVisibility),
              endpoint=endpoint,
              token=token,
          )
          print(f'New dataset created successfully at {repo_url}.', flush=True)

      else:
          raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

      return repo_url
  def create_commit(
          self,
          repo_id: str,
          operations: Iterable[CommitOperation],
          *,
          commit_message: str,
          commit_description: Optional[str] = None,
          token: str = None,
          repo_type: Optional[str] = REPO_TYPE_MODEL,
          revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
          endpoint: Optional[str] = None,
          max_retries: int = 3,
          timeout: int = 180,
  ) -> CommitInfo:
      """
      Create a commit on the ModelScope Hub with retry mechanism.

      Network failures and 5xx responses are retried up to `max_retries` times
      with a one second pause between attempts. Any other non-200 response is
      treated as a client error and raised immediately without retrying.

      Args:
          repo_id (str): The repo id in the format of `owner_name/repo_name`.
          operations (Iterable[CommitOperation]): The commit operations.
          commit_message (str): The commit message. `Commit to <repo_id>` is used when empty.
          commit_description (Optional[str]): The commit description. It is echoed in
              the returned `CommitInfo`; it is not passed to `_prepare_commit_payload`.
          token (str): The access token. If None, will use the cookies from the local cache.
              See `https://modelscope.cn/my/myaccesstoken` to get your token.
          repo_type (Optional[str]): The repo type, should be `model` or `dataset`. Defaults to `model`.
          revision (Optional[str]): The branch or tag name. Defaults to `DEFAULT_REPOSITORY_REVISION`.
          endpoint (Optional[str]): The endpoint to use.
              In the format of `https://www.modelscope.cn` or 'https://www.modelscope.ai'
          max_retries (int): Number of max retry attempts (default: 3).
          timeout (int): Timeout for each request in seconds (default: 180).

      Returns:
          CommitInfo: The commit info.

      Raises:
          ValueError: If `repo_id` is empty, `repo_type` is unsupported, or the
              server answers with a non-200, non-5xx status.
          requests.exceptions.RequestException: If all retry attempts fail, or a
              200 response carries a body that is not valid JSON.
      """
      if not repo_id:
          raise ValueError('Repo id cannot be empty!')

      if not endpoint:
          endpoint = self.endpoint

      if repo_type not in REPO_TYPE_SUPPORT:
          raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

      url = f'{endpoint}/api/v1/repos/{repo_type}s/{repo_id}/commit/{revision}'
      commit_message = commit_message or f'Commit to {repo_id}'
      commit_description = commit_description or ''

      cookies = self.get_cookies(access_token=token, cookies_required=True)

      # Construct payload
      payload = self._prepare_commit_payload(
          operations=operations,
          commit_message=commit_message,
      )
      # Serialise once: the body is identical for every attempt, and a
      # serialisation error is not something a retry could fix.
      body = json.dumps(payload)

      # POST with retry mechanism
      last_error = None
      for attempt in range(max_retries):
          if attempt > 0:
              logger.info(f'Attempt {attempt + 1} to create commit for {repo_id}...')

          try:
              response = requests.post(
                  url,
                  headers=self.builder_headers(self.headers),
                  data=body,
                  cookies=cookies,
                  timeout=timeout,
              )
          except requests.exceptions.RequestException as e:
              last_error = e
              logger.warning(f'Request failed on attempt {attempt + 1}: {str(e)}')
          else:
              if response.status_code == 200:
                  # The server accepted the commit; from here on nothing is
                  # retried, since re-posting could repeat the commit.
                  resp = response.json()
                  # `Data` may be present but null, hence `or {}`.
                  oid = (resp.get('Data') or {}).get('oid', '')
                  logger.info(f'Commit succeeded: {url}')
                  return CommitInfo(
                      commit_url=url,
                      commit_message=commit_message,
                      commit_description=commit_description,
                      oid=oid,
                  )

              try:
                  error_detail = response.json()
              except ValueError:
                  # JSON decode errors are ValueError subclasses; fall back to raw text.
                  error_detail = response.text
              error_msg = (
                  f'HTTP {response.status_code} error from {url}: '
                  f'{error_detail}'
              )

              # If server error (5xx), we can retry, otherwise raise immediately
              if not 500 <= response.status_code < 600:
                  raise ValueError(f'Client request failed: {error_msg}')
              last_error = error_msg
              logger.warning(
                  f'Server error on attempt {attempt + 1}: {error_msg}'
              )

          if attempt < max_retries - 1:
              time.sleep(1)

      # All retries exhausted
      raise requests.exceptions.RequestException(
          f'Failed to create commit after {max_retries} attempts. Last error: {last_error}'
      )
  def upload_file(
          self,
          *,
          path_or_fileobj: Union[str, Path, bytes, BinaryIO],
          path_in_repo: str,
          repo_id: str,
          token: Union[str, None] = None,
          repo_type: Optional[str] = REPO_TYPE_MODEL,
          commit_message: Optional[str] = None,
          commit_description: Optional[str] = None,
          buffer_size_mb: Optional[int] = 1,
          tqdm_desc: Optional[str] = '[Uploading]',
          disable_tqdm: Optional[bool] = False,
          revision: Optional[str] = DEFAULT_REPOSITORY_REVISION
  ) -> CommitInfo:
      """
      Upload a file to the ModelScope Hub.

      The file is hashed, the target repo is created if it does not exist yet
      (``exist_ok=True``), the blob is uploaded through ``_upload_blob`` and a
      single commit is created for it.

      Args:
          path_or_fileobj (Union[str, Path, bytes, BinaryIO]):
              The local file path or file-like object (BinaryIO) or bytes to upload.
              Only ``io.BufferedIOBase`` file objects are read here; they are read
              fully into memory before hashing and uploading.
          path_in_repo (str): The path in the repo to upload to. May be empty only
              when `path_or_fileobj` is a local path, in which case the file's
              base name is used.
          repo_id (str): The repo id in the format of `owner_name/repo_name`.
          token (Union[str, None]): The access token. If None, will use the cookies from the local cache.
              See `https://modelscope.cn/my/myaccesstoken` to get your token.
          repo_type (Optional[str]): The repo type, default to `model`.
          commit_message (Optional[str]): The commit message.
              Defaults to ``'Upload {path_in_repo} to ModelScope hub'``.
          commit_description (Optional[str]): The commit description.
          buffer_size_mb (Optional[int]): The buffer size in MB used both for hashing
              the file and for the upload chunks. Default to 1MB; `None` means 1MB.
          tqdm_desc (Optional[str]): The description for the tqdm progress bar. Default to '[Uploading]'.
          disable_tqdm (Optional[bool]): Whether to disable the tqdm progress bar. Default to False.
          revision (Optional[str]): The branch or tag name. Defaults to `DEFAULT_REPOSITORY_REVISION`.

      Returns:
          CommitInfo: The commit info.

      Raises:
          ValueError: If `repo_type` is unsupported, `path_or_fileobj` is empty,
              `path_in_repo` is missing for bytes/file-object input, or
              `buffer_size_mb` is not greater than 0.

      Examples:
          >>> from modelscope.hub.api import HubApi
          >>> api = HubApi()
          >>> commit_info = api.upload_file(
          ...     path_or_fileobj='/path/to/your/file.txt',
          ...     path_in_repo='optional/path/in/repo/file.txt',
          ...     repo_id='your-namespace/your-repo-name',
          ...     commit_message='Upload file.txt to ModelScope hub'
          ... )
          >>> print(commit_info)
      """
      if repo_type not in REPO_TYPE_SUPPORT:
          raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

      if not path_or_fileobj:
          raise ValueError('Path or file object cannot be empty!')

      # Check authentication first
      self.get_cookies(access_token=token, cookies_required=True)

      if isinstance(path_or_fileobj, (str, Path)):
          path_or_fileobj = os.path.abspath(os.path.expanduser(path_or_fileobj))
          path_in_repo = path_in_repo or os.path.basename(path_or_fileobj)
      elif not path_in_repo:
          # If path_or_fileobj is bytes or BinaryIO, then path_in_repo must be provided
          raise ValueError('Arg `path_in_repo` cannot be empty!')

      # Read file content if path_or_fileobj is a file-like object (BinaryIO)
      # TODO: to be refined
      if isinstance(path_or_fileobj, io.BufferedIOBase):
          path_or_fileobj = path_or_fileobj.read()

      self.upload_checker.check_file(path_or_fileobj)
      self.upload_checker.check_normal_files(
          file_path_list=[path_or_fileobj],
          repo_type=repo_type,
      )

      commit_message = (
          commit_message if commit_message is not None else f'Upload {path_in_repo} to ModelScope hub'
      )

      # The annotation allows None; treat it as "use the default" rather than
      # failing with a TypeError on the comparison below.
      if buffer_size_mb is None:
          buffer_size_mb = 1
      if buffer_size_mb <= 0:
          raise ValueError('Buffer size: `buffer_size_mb` must be greater than 0')

      hash_info_d: dict = get_file_hash(
          file_path_or_obj=path_or_fileobj,
          buffer_size_mb=buffer_size_mb,
      )
      file_size: int = hash_info_d['file_size']
      file_hash: str = hash_info_d['file_hash']

      self.create_repo(repo_id=repo_id,
                       token=token,
                       repo_type=repo_type,
                       endpoint=self.endpoint,
                       exist_ok=True,
                       create_default_config=False)

      upload_res: dict = self._upload_blob(
          repo_id=repo_id,
          repo_type=repo_type,
          sha256=file_hash,
          size=file_size,
          data=path_or_fileobj,
          disable_tqdm=disable_tqdm,
          tqdm_desc=tqdm_desc,
          # Forward the caller's buffer size so the upload chunking honours it
          # too (previously only the hashing step did).
          buffer_size_mb=buffer_size_mb,
          token=token,
      )

      # Construct commit info and create commit
      add_operation: CommitOperationAdd = CommitOperationAdd(
          path_in_repo=path_in_repo,
          path_or_fileobj=path_or_fileobj,
          file_hash_info=hash_info_d,
      )
      add_operation._upload_mode = 'lfs' if self.upload_checker.is_lfs(path_or_fileobj, repo_type) else 'normal'
      add_operation._is_uploaded = upload_res['is_uploaded']
      operations = [add_operation]

      print(f'Committing file to {repo_id} ...', flush=True)
      commit_info: CommitInfo = self.create_commit(
          repo_id=repo_id,
          operations=operations,
          commit_message=commit_message,
          commit_description=commit_description,
          token=token,
          repo_type=repo_type,
          revision=revision,
      )

      return commit_info
  def upload_folder(
          self,
          *,
          repo_id: str,
          folder_path: Union[str, Path, List[str], List[Path]],
          path_in_repo: Optional[str] = '',
          commit_message: Optional[str] = None,
          commit_description: Optional[str] = None,
          token: Union[str, None] = None,
          repo_type: Optional[str] = REPO_TYPE_MODEL,
          allow_patterns: Optional[Union[List[str], str]] = None,
          ignore_patterns: Optional[Union[List[str], str]] = None,
          max_workers: int = DEFAULT_MAX_WORKERS,
          revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
  ) -> Union[CommitInfo, List[CommitInfo]]:
      """
      Upload a folder to the ModelScope Hub.

      Files are collected and filtered, each blob is uploaded through
      ``_upload_blob`` using a thread pool, and the resulting operations are
      committed in batches of ``UPLOAD_COMMIT_BATCH_SIZE``.

      Args:
          repo_id (str): The repo id in the format of `owner_name/repo_name`.
          folder_path (Union[str, Path, List[str], List[Path]]): The folder path or list of file paths to upload.
          path_in_repo (Optional[str]): The path in the repo to upload to.
          commit_message (Optional[str]): The commit message.
              A ``(batch i/n)`` suffix is appended for every batch commit.
          commit_description (Optional[str]): The commit description.
          token (Union[str, None]): The access token. If None, will use the cookies from the local cache.
              See `https://modelscope.cn/my/myaccesstoken` to get your token.
          repo_type (Optional[str]): The repo type, default to `model`.
          allow_patterns (Optional[Union[List[str], str]]): The patterns to allow.
          ignore_patterns (Optional[Union[List[str], str]]): The patterns to ignore.
              `DEFAULT_IGNORE_PATTERNS` are always added; the caller's list is not modified.
          max_workers (int): The maximum number of workers to use for uploading files concurrently.
              Defaults to `DEFAULT_MAX_WORKERS`.
          revision (Optional[str]): The branch or tag name. Defaults to `DEFAULT_REPOSITORY_REVISION`.

      Returns:
          Union[CommitInfo, List[CommitInfo]]:
              The commit info or list of commit infos if multiple batches are committed.

      Raises:
          ValueError: If `repo_id` is empty, `folder_path` is None, `repo_type`
              is unsupported, or no file is left to upload after filtering.

      Examples:
          >>> from modelscope.hub.api import HubApi
          >>> api = HubApi()
          >>> commit_info = api.upload_folder(
          ...     repo_id='your-namespace/your-repo-name',
          ...     folder_path='/path/to/your/folder',
          ...     path_in_repo='optional/path/in/repo',
          ...     commit_message='Upload my folder',
          ...     token='your-access-token'
          ... )
          >>> print(commit_info.commit_url)
      """
      if not repo_id:
          raise ValueError('The arg `repo_id` cannot be empty!')

      if folder_path is None:
          raise ValueError('The arg `folder_path` cannot be None!')

      if repo_type not in REPO_TYPE_SUPPORT:
          raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

      # Check authentication first
      self.get_cookies(access_token=token, cookies_required=True)

      allow_patterns = allow_patterns if allow_patterns else None
      ignore_patterns = ignore_patterns if ignore_patterns else None

      # Ignore .git .cache folders
      if ignore_patterns is None:
          ignore_patterns = []
      elif isinstance(ignore_patterns, str):
          ignore_patterns = [ignore_patterns]
      # Build a fresh list: an in-place `+=` would extend the list object the
      # caller passed in, leaking the default patterns into their data.
      ignore_patterns = list(ignore_patterns) + list(DEFAULT_IGNORE_PATTERNS)

      # Cover the ignore patterns if both allow and ignore patterns are provided
      if allow_patterns is not None:
          # Wrap a single string pattern so `in` is an exact-pattern membership
          # test, not a substring test against the pattern text.
          allowed = [allow_patterns] if isinstance(allow_patterns, str) else allow_patterns
          ignore_patterns = [
              p for p in ignore_patterns if p not in allowed
          ]

      commit_message = (
          commit_message if commit_message is not None else f'Upload to {repo_id} on ModelScope hub'
      )
      commit_description = commit_description or 'Uploading files'

      # Get the list of files to upload, e.g. [('data/abc.png', '/path/to/abc.png'), ...]
      logger.info('Preparing files to upload ...')
      prepared_repo_objects = self._prepare_upload_folder(
          folder_path_or_files=folder_path,
          path_in_repo=path_in_repo,
          allow_patterns=allow_patterns,
          ignore_patterns=ignore_patterns,
      )
      if len(prepared_repo_objects) == 0:
          raise ValueError(f'No files to upload in the folder: {folder_path} !')

      logger.info(f'Checking {len(prepared_repo_objects)} files to upload ...')
      self.upload_checker.check_normal_files(
          file_path_list=[item for _, item in prepared_repo_objects],
          repo_type=repo_type,
      )

      self.create_repo(repo_id=repo_id,
                       token=token,
                       repo_type=repo_type,
                       endpoint=self.endpoint,
                       exist_ok=True,
                       create_default_config=False)

      @thread_executor(max_workers=max_workers, disable_tqdm=False)
      def _upload_items(item_pair, **kwargs):
          # Hash and upload one (path_in_repo, local_path) pair.
          file_path_in_repo, file_path = item_pair

          hash_info_d: dict = get_file_hash(
              file_path_or_obj=file_path,
          )
          file_size: int = hash_info_d['file_size']
          file_hash: str = hash_info_d['file_hash']

          upload_res: dict = self._upload_blob(
              repo_id=repo_id,
              repo_type=repo_type,
              sha256=file_hash,
              size=file_size,
              data=file_path,
              # Per-file progress bars are shown only above the size threshold.
              disable_tqdm=file_size <= UPLOAD_BLOB_TQDM_DISABLE_THRESHOLD,
              tqdm_desc='[Uploading ' + file_path_in_repo + ']',
              token=token,
          )

          return {
              'file_path_in_repo': file_path_in_repo,
              'file_path': file_path,
              'is_uploaded': upload_res['is_uploaded'],
              'file_hash_info': hash_info_d,
          }

      uploaded_items_list = _upload_items(
          prepared_repo_objects,
          repo_id=repo_id,
          token=token,
          repo_type=repo_type,
          commit_message=commit_message,
          commit_description=commit_description,
          buffer_size_mb=1,
          disable_tqdm=False,
      )

      # Construct commit info and create commit
      operations = []

      for item_d in uploaded_items_list:
          prepared_path_in_repo: str = item_d['file_path_in_repo']
          prepared_file_path: str = item_d['file_path']
          is_uploaded: bool = item_d['is_uploaded']
          file_hash_info: dict = item_d['file_hash_info']
          opt = CommitOperationAdd(
              path_in_repo=prepared_path_in_repo,
              path_or_fileobj=prepared_file_path,
              file_hash_info=file_hash_info,
          )

          # check normal or lfs
          opt._upload_mode = 'lfs' if self.upload_checker.is_lfs(prepared_file_path, repo_type) else 'normal'
          opt._is_uploaded = is_uploaded
          operations.append(opt)

      if len(operations) == 0:
          raise ValueError(f'No files to upload in the folder: {folder_path} !')

      # Commit the operations in batches
      commit_batch_size: int = UPLOAD_COMMIT_BATCH_SIZE if UPLOAD_COMMIT_BATCH_SIZE > 0 else len(operations)
      # Ceiling division: number of batches needed to cover all operations.
      num_batches = (len(operations) - 1) // commit_batch_size + 1
      print(f'Committing {len(operations)} files in {num_batches} batch(es) of size {commit_batch_size}.',
            flush=True)
      commit_infos: List[CommitInfo] = []
      for i in tqdm(range(num_batches), desc='[Committing batches] ', total=num_batches):
          batch_operations = operations[i * commit_batch_size: (i + 1) * commit_batch_size]
          batch_commit_message = f'{commit_message} (batch {i + 1}/{num_batches})'

          commit_info: CommitInfo = self.create_commit(
              repo_id=repo_id,
              operations=batch_operations,
              commit_message=batch_commit_message,
              commit_description=commit_description,
              token=token,
              repo_type=repo_type,
              revision=revision,
          )
          commit_infos.append(commit_info)

      # A single batch returns the bare CommitInfo rather than a one-item list.
      return commit_infos[0] if len(commit_infos) == 1 else commit_infos
  def _upload_blob(
          self,
          *,
          repo_id: str,
          repo_type: str,
          sha256: str,
          size: int,
          data: Union[str, Path, bytes, BinaryIO],
          disable_tqdm: Optional[bool] = False,
          tqdm_desc: Optional[str] = '[Uploading]',
          buffer_size_mb: Optional[int] = 1,
          token: Optional[str] = None,
  ) -> dict:
      """
      Upload one blob to the URL handed out by ``_validate_blob``.

      Args:
          repo_id (str): The repo id.
          repo_type (str): The repo type.
          sha256 (str): The sha256 of the blob, sent as its `oid`.
          size (int): The blob size in bytes; also the progress bar total.
          data (Union[str, Path, bytes, BinaryIO]): A local path, raw bytes or an
              ``io.BufferedIOBase`` object holding the content.
          disable_tqdm (Optional[bool]): Whether to disable the progress bar.
          tqdm_desc (Optional[str]): The progress bar description.
          buffer_size_mb (Optional[int]): Size in MB of each streamed chunk.
          token (Optional[str]): The access token.

      Returns:
          dict: ``url``, ``is_uploaded``, ``status_code`` and ``status_msg``.
              ``is_uploaded`` is True only when the validation step returned no
              single upload target and the upload was skipped; after an actual
              upload it stays False and the other three keys are filled in.

      Raises:
          ValueError: If no cookies are available or `data` has an unsupported type.
      """
      res_d: dict = dict(
          url=None,
          is_uploaded=False,
          status_code=None,
          status_msg=None,
      )

      objects = [{'oid': sha256, 'size': size}]
      upload_objects = self._validate_blob(
          repo_id=repo_id,
          repo_type=repo_type,
          objects=objects,
          token=token,
      )

      # upload_object: {'url': 'xxx', 'oid': 'xxx'}
      upload_object = upload_objects[0] if len(upload_objects) == 1 else None

      if upload_object is None:
          logger.debug(f'Blob {sha256[:8]} has already uploaded, reuse it.')
          res_d['is_uploaded'] = True
          return res_d

      cookies = self.get_cookies(access_token=token, cookies_required=True)
      cookies = dict(cookies) if cookies else None
      if cookies is None:
          raise ValueError('Token does not exist, please login first.')

      # Build the Cookie header on a private copy. Writing it into
      # `self.headers` would keep this session id on the instance for every
      # later request (even ones made with another token) and mutate shared
      # state from the worker threads used by `upload_folder`.
      request_headers = dict(self.headers)
      request_headers['Cookie'] = f"m_session_id={cookies['m_session_id']}"
      headers = self.builder_headers(request_headers)

      def read_in_chunks(file_object, pbar, chunk_size=buffer_size_mb * 1024 * 1024):
          """Lazy function (generator) to read a file piece by piece."""
          while True:
              ck = file_object.read(chunk_size)
              if not ck:
                  break
              pbar.update(len(ck))
              yield ck

      with tqdm(
              total=size,
              unit='B',
              unit_scale=True,
              desc=tqdm_desc,
              disable=disable_tqdm
      ) as pbar:

          def put_stream(stream):
              # Stream `stream` to the upload URL chunk by chunk.
              return requests.put(
                  upload_object['url'],
                  headers=headers,
                  data=read_in_chunks(stream, pbar)
              )

          if isinstance(data, (str, Path)):
              with open(data, 'rb') as f:
                  response = put_stream(f)
          elif isinstance(data, bytes):
              response = put_stream(io.BytesIO(data))
          elif isinstance(data, io.BufferedIOBase):
              response = put_stream(data)
          else:
              raise ValueError('Invalid data type to upload')

      raise_for_http_status(rsp=response)
      resp = response.json()
      raise_on_error(rsp=resp)

      res_d['url'] = upload_object['url']
      res_d['status_code'] = resp['Code']
      res_d['status_msg'] = resp['Message']

      return res_d
  def _validate_blob(
          self,
          *,
          repo_id: str,
          repo_type: str,
          objects: List[Dict[str, Any]],
          endpoint: Optional[str] = None,
          token: Optional[str] = None,
  ) -> List[Dict[str, Any]]:
      """
      Ask the hub's LFS batch endpoint where the given blobs should be uploaded.

      Args:
          repo_id (str): The repo id ModelScope.
          repo_type (str): The repo type. `dataset`, `model`, etc.
          objects (List[Dict[str, Any]]): The objects to check, each with:
              oid (str): The sha256 hash value.
              size (int): The size of the blob.
          endpoint: the endpoint to use, default to None to use endpoint specified in the class
          token (str): The access token.

      Returns:
          List[Dict[str, Any]]: One ``{'url': ..., 'oid': ...}`` entry per object
              listed in the response, where ``url`` is the upload href.
      """
      base_url = endpoint if endpoint else self.endpoint
      batch_url = f'{base_url}/api/v1/repos/{repo_type}s/{repo_id}/info/lfs/objects/batch'

      request_body = {
          'operation': 'upload',
          'objects': objects,
      }

      cookies = self.get_cookies(access_token=token, cookies_required=True)
      response = requests.post(
          batch_url,
          headers=self.builder_headers(self.headers),
          data=json.dumps(request_body),
          cookies=cookies
      )

      raise_for_http_status(rsp=response)
      body = response.json()
      raise_on_error(rsp=body)

      # Keep only what the uploader needs from every returned object.
      return [
          {'url': entry['actions']['upload']['href'],
           'oid': entry['oid']}
          for entry in body['Data']['objects']
      ]
  def _prepare_upload_folder(
          self,
          folder_path_or_files: Union[str, Path, List[str], List[Path]],
          path_in_repo: str,
          allow_patterns: Optional[Union[List[str], str]] = None,
          ignore_patterns: Optional[Union[List[str], str]] = None,
  ) -> List[Union[tuple, list]]:
      """
      Resolve the upload input into ``(path_in_repo, local_path)`` pairs.

      Args:
          folder_path_or_files: A folder path, a single file path, or a list of
              file paths. A list is classified by its first element only; list
              entries that are not existing files are skipped.
          path_in_repo (str): Prefix prepended to every repo path; may be empty.
          allow_patterns: Patterns forwarded to `RepoUtils.filter_repo_objects`.
          ignore_patterns: Patterns forwarded to `RepoUtils.filter_repo_objects`.

      Returns:
          List of ``(repo_path, local_path_str)`` tuples. An empty list input
          yields an empty result.

      Raises:
          ValueError: If a list of folders is given, or the folder path is not a directory.
      """
      folder_path = None
      files_path = None
      if isinstance(folder_path_or_files, list):
          if not folder_path_or_files:
              # An empty list used to raise IndexError on `[0]`; treat it as
              # "no files" so the caller can report it properly.
              files_path = []
          elif os.path.isfile(folder_path_or_files[0]):
              files_path = folder_path_or_files
          else:
              raise ValueError('Uploading multiple folders is not supported now.')
      elif os.path.isfile(folder_path_or_files):
          files_path = [folder_path_or_files]
      else:
          folder_path = folder_path_or_files

      if files_path is None:
          self.upload_checker.check_folder(folder_path)
          folder_path = Path(folder_path).expanduser().resolve()
          if not folder_path.is_dir():
              raise ValueError(f"Provided path: '{folder_path}' is not a directory")

          # List files from folder
          relpath_to_abspath = {
              path.relative_to(folder_path).as_posix(): path
              for path in sorted(folder_path.glob('**/*'))  # sorted to be deterministic
              if path.is_file()
          }
      else:
          # Explicit files are keyed by base name, so two files with the same
          # name in different directories collapse to the last one.
          relpath_to_abspath = {}
          for path in files_path:
              if os.path.isfile(path):
                  self.upload_checker.check_file(path)
                  relpath_to_abspath[os.path.basename(path)] = path

      # Filter files
      filtered_repo_objects = list(
          RepoUtils.filter_repo_objects(
              relpath_to_abspath.keys(), allow_patterns=allow_patterns, ignore_patterns=ignore_patterns
          )
      )

      prefix = f"{path_in_repo.strip('/')}/" if path_in_repo else ''

      prepared_repo_objects = [
          (prefix + relpath, str(relpath_to_abspath[relpath]))
          for relpath in filtered_repo_objects
      ]

      logger.info(f'Prepared {len(prepared_repo_objects)} files for upload.')

      return prepared_repo_objects
  @staticmethod
  def _prepare_commit_payload(
          operations: Iterable[CommitOperation],
          commit_message: str,
  ) -> Dict[str, Any]:
      """
      Build the JSON-serialisable commit payload for the ModelScope hub.

      Every non-ignored `CommitOperationAdd` becomes one entry of ``actions``.
      Normal files carry their base64 content inline; LFS files carry only
      their sha256. Any other operation raises `ValueError`.
      """
      actions: List[Dict[str, Any]] = []
      skipped_count = 0

      for op in operations:
          is_add = isinstance(op, CommitOperationAdd)

          # Skip ignored files
          if is_add and op._should_ignore:
              logger.debug(f"Skipping file '{op.path_in_repo}' in commit (ignored by gitignore file).")
              skipped_count += 1
              continue

          mode = op._upload_mode if is_add else None
          if mode not in ('normal', 'lfs'):
              raise ValueError(
                  f'Unknown operation to commit. Operation: {op}. Upload mode:'
                  f" {getattr(op, '_upload_mode', None)}"
              )

          # Fields shared by both upload modes.
          action: Dict[str, Any] = {
              'action': 'update' if op._is_uploaded else 'create',
              'path': op.path_in_repo,
              'type': mode,
              'size': op.upload_info.size,
          }
          if mode == 'normal':
              # Normal file: content travels inline, base64 encoded.
              action['sha256'] = ''
              action['content'] = op.b64content().decode()
              action['encoding'] = 'base64'
          else:
              # LFS file: only the hash is sent.
              action['sha256'] = op.upload_info.sha256
              action['content'] = ''
              action['encoding'] = ''
          actions.append(action)

      if skipped_count > 0:
          logger.info(f'Skipped {skipped_count} file(s) in commit (ignored by gitignore file).')

      return {
          'commit_message': commit_message,
          'actions': actions,
      }
  def _get_internal_acceleration_domain(self, internal_timeout: float = 0.2):
      """
      Get the internal acceleration domain.

      This is a best-effort probe: any request failure, malformed response or
      missing field results in an empty string instead of an exception.

      Args:
          internal_timeout (float): The timeout for the request. Default to 0.2s

      Returns:
          str: The internal acceleration domain. e.g. `cn-hangzhou`, `cn-zhangjiakou`
              or `''` when it cannot be determined.
      """

      def send_request(url: str, timeout: float):
          # Return the response, or None on any requests-level failure.
          try:
              response = requests.get(url, timeout=timeout)
              response.raise_for_status()
          except requests.exceptions.RequestException:
              response = None
          return response

      internal_url = f'{self.endpoint}/api/v1/repos/internalAccelerationInfo'

      # Get internal url and region for acceleration
      internal_info_response = send_request(url=internal_url, timeout=internal_timeout)
      if internal_info_response is None:
          return ''

      try:
          internal_info = internal_info_response.json()
      except ValueError:
          # Body is not valid JSON (JSON decode errors subclass ValueError).
          return ''

      # Tolerate a non-dict payload and a missing/null `Data` section.
      data = internal_info.get('Data') if isinstance(internal_info, dict) else None
      query_addr = data.get('InternalRegionQueryAddress') if isinstance(data, dict) else ''
      if not query_addr:
          return ''

      domain_response = send_request(query_addr, timeout=internal_timeout)
      if domain_response is None:
          return ''

      return domain_response.text.strip()
  def delete_files(self,
                   repo_id: str,
                   repo_type: str,
                   delete_patterns: Union[str, List[str]],
                   *,
                   revision: Optional[str] = DEFAULT_MODEL_REVISION,
                   endpoint: Optional[str] = None,
                   token: Optional[str] = None) -> Dict[str, Any]:
      """
      Delete every repo file whose path matches one of the glob patterns.

      The repo's file list is fetched first, matched with `fnmatch`, and each
      match is then deleted with its own request. A failed deletion is logged
      and recorded; it does not abort the remaining deletions.

      Example:
          # Delete all Python and Markdown files in a model repo
          api.delete_files(
              repo_id='your_username/your_model',
              repo_type=REPO_TYPE_MODEL,
              delete_patterns=['*.py', '*.md']
          )
          # Delete all CSV files in the data/ directory of a dataset repo
          api.delete_files(
              repo_id='your_username/your_dataset',
              repo_type=REPO_TYPE_DATASET,
              delete_patterns='data/*.csv'
          )

      Args:
          repo_id (str): 'owner/repo_name' or 'owner/dataset_name', e.g. 'Koko/my_model'
          repo_type (str): REPO_TYPE_MODEL or REPO_TYPE_DATASET
          delete_patterns (str or List[str]): List of glob patterns, e.g. '*.py', 'data/*.csv', 'foo*'
          revision (str, optional): Branch or tag name
          endpoint (str, optional): API endpoint
          token (str, optional): Access token

      Returns:
          dict: ``deleted_files`` and ``failed_files`` (lists of paths) plus
              ``total_files``, the number of paths that matched a pattern.
      """
      if repo_type not in REPO_TYPE_SUPPORT:
          raise ValueError(f'Unsupported repo_type: {repo_type}')
      if not delete_patterns:
          raise ValueError('delete_patterns cannot be empty')

      patterns = [delete_patterns] if isinstance(delete_patterns, str) else delete_patterns

      cookies = self.get_cookies(access_token=token, cookies_required=True)
      endpoint = endpoint or self.endpoint
      if cookies is None:
          raise ValueError('Token does not exist, please login first.')
      headers = self.builder_headers(self.headers)

      # List all files in the repo
      if repo_type == REPO_TYPE_MODEL:
          model_files = self.get_model_files(
              repo_id,
              revision=revision or DEFAULT_MODEL_REVISION,
              recursive=True,
              endpoint=endpoint,
              use_cookies=cookies,
          )
          remote_paths = [entry['Path'] for entry in model_files]
      elif repo_type == REPO_TYPE_DATASET:
          remote_paths = []
          page_size = 100
          page_number = 1
          while True:
              try:
                  page: List[Dict[str, Any]] = self.get_dataset_files(
                      repo_id=repo_id,
                      revision=revision or DEFAULT_DATASET_REVISION,
                      recursive=True,
                      page_number=page_number,
                      page_size=page_size,
                      endpoint=endpoint,
                      token=token,
                  )
              except Exception as e:
                  # Listing stops here; paths gathered so far are still processed.
                  logger.error(f'Get dataset: {repo_id} file list failed, message: {str(e)}')
                  break

              # Keep everything that is not a directory entry (Type: 'tree' or 'blob')
              remote_paths.extend(
                  info['Path'] for info in page if info['Type'] != 'tree'
              )

              # A short page is the last page.
              if len(page) < page_size:
                  break
              page_number += 1
      else:
          raise ValueError(f'Unsupported repo_type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

      # Glob pattern matching: a path is selected once if any pattern matches.
      to_delete = [
          candidate for candidate in remote_paths
          if any(fnmatch.fnmatch(candidate, pattern) for pattern in patterns)
      ]

      deleted_files, failed_files = [], []
      for target in to_delete:
          try:
              if repo_type == REPO_TYPE_MODEL:
                  owner, repo_name = repo_id.split('/')
                  url = f'{endpoint}/api/v1/models/{owner}/{repo_name}/file'
                  params = {
                      'Revision': revision or DEFAULT_MODEL_REVISION,
                      'FilePath': target
                  }
              elif repo_type == REPO_TYPE_DATASET:
                  owner, dataset_name = repo_id.split('/')
                  url = f'{endpoint}/api/v1/datasets/{owner}/{dataset_name}/repo'
                  params = {
                      'FilePath': target
                  }
              else:
                  raise ValueError(f'Unsupported repo_type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

              r = self.session.delete(url, params=params, cookies=cookies, headers=headers)
              raise_for_http_status(r)
              resp = r.json()
              raise_on_error(resp)
          except Exception as e:
              failed_files.append(target)
              logger.error(f'Failed to delete {target}: {str(e)}')
          else:
              deleted_files.append(target)

      return {
          'deleted_files': deleted_files,
          'failed_files': failed_files,
          'total_files': len(to_delete)
      }
  def set_repo_visibility(self,
                          repo_id: str,
                          repo_type: Literal['model', 'dataset'],
                          visibility: Literal['private', 'public'],
                          token: Union[str, None] = None
                          ) -> dict:
      """
      Set the visibility of a repo.

      For a model, the current model info is fetched and sent back together
      with the new visibility. For a dataset, only the visibility and
      protected mode are sent.

      Args:
          repo_id (str): The repo id in the format of `owner_name/repo_name`.
          repo_type (Literal['model', 'dataset']): The repo type, `model` or `dataset`.
          visibility (Literal['private', 'public']): The visibility to set, `private` or `public`.
          token (Union[str, None]): The access token. If None, will use the cookies from the local cache.
              See `https://modelscope.cn/my/myaccesstoken` to get your token.

      Returns:
          dict: The response from the server.

      Raises:
          ValueError: If `repo_id` is empty, `visibility` or `repo_type` is
              invalid, or a dataset `repo_id` is not `owner/dataset_name`.
      """
      if not repo_id:
          raise ValueError('The arg `repo_id` cannot be empty!')

      if visibility not in ['private', 'public']:
          raise ValueError(f'Invalid visibility: {visibility}, supported visibilities: `private`, `public`')

      # VisibilityMap is inverted here so the visibility name looks up its code.
      visibility_map: Dict[str, int] = {v: k for k, v in VisibilityMap.items()}
      visibility_code: int = visibility_map.get(visibility, 5)

      cookies = self.get_cookies(access_token=token, cookies_required=True)

      if repo_type == REPO_TYPE_MODEL:
          model_info = self.get_model(model_id=repo_id, token=token)
          path = f'{self.endpoint}/api/v1/models/{repo_id}'

          # Use the name of the first task, if there is one.
          tasks = model_info.get('Tasks')
          model_tasks = ''
          if isinstance(tasks, list) and tasks:
              first = tasks[0]
              if isinstance(first, dict) and first:
                  model_tasks = first.get('name')

          # `NEXA` may be present but null; `.get('NEXA', {})` would then
          # return None and the chained `.get` would raise AttributeError.
          nexa_info = model_info.get('NEXA') or {}

          payload = {
              'ChineseName': model_info.get('ChineseName', ''),
              'ModelFramework': model_info.get('ModelFramework', 'Pytorch'),
              'Visibility': visibility_code,
              'ProtectedMode': 2,
              'ApprovalMode': model_info.get('ApprovalMode', 2),
              'Description': model_info.get('Description', ''),
              'AigcType': model_info.get('AigcType', ''),
              'VisionFoundation': model_info.get('VisionFoundation', ''),
              'ModelCover': model_info.get('ModelCover', ''),
              'SubScientificField': model_info.get('SubScientificField', None),
              'ScientificField': nexa_info.get('ScientificField', ''),
              'Source': nexa_info.get('Source', ''),
              'ModelTask': model_tasks,
              'License': model_info.get('License', ''),
          }
      elif repo_type == REPO_TYPE_DATASET:
          repo_id_parts = repo_id.split('/')
          if len(repo_id_parts) != 2 or not all(repo_id_parts):
              raise ValueError(f'Invalid dataset repo_id: {repo_id}, should be in format of `owner/dataset_name`')

          # The dataset endpoint is addressed by the dataset id, not by its name.
          dataset_idx, _ = self.get_dataset_id_and_type(
              dataset_name=repo_id_parts[1],
              namespace=repo_id_parts[0],
              token=token
          )
          path = f'{self.endpoint}/api/v1/datasets/{dataset_idx}'
          payload = {
              'Visibility': visibility_code,
              'ProtectedMode': 2,
          }
      else:
          raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')

      r = self.session.put(
          path,
          json=payload,
          cookies=cookies,
          headers=self.builder_headers(self.headers))

      raise_for_http_status(r)
      resp = r.json()
      raise_on_error(resp)

      return resp
  2694. class ModelScopeConfig:
  2695. path_credential = expanduser(MODELSCOPE_CREDENTIALS_PATH)
  2696. COOKIES_FILE_NAME = 'cookies'
  2697. GIT_TOKEN_FILE_NAME = 'git_token'
  2698. USER_INFO_FILE_NAME = 'user'
  2699. USER_SESSION_ID_FILE_NAME = 'session'
  2700. cookie_expired_warning = False
  2701. @staticmethod
  2702. def make_sure_credential_path_exist():
  2703. os.makedirs(ModelScopeConfig.path_credential, exist_ok=True)
  2704. @staticmethod
  2705. def save_cookies(cookies: CookieJar):
  2706. ModelScopeConfig.make_sure_credential_path_exist()
  2707. with open(
  2708. os.path.join(ModelScopeConfig.path_credential,
  2709. ModelScopeConfig.COOKIES_FILE_NAME), 'wb+') as f:
  2710. pickle.dump(cookies, f)
  2711. @staticmethod
  2712. def get_cookies():
  2713. cookies_path = os.path.join(ModelScopeConfig.path_credential,
  2714. ModelScopeConfig.COOKIES_FILE_NAME)
  2715. if os.path.exists(cookies_path):
  2716. with open(cookies_path, 'rb') as f:
  2717. cookies = pickle.load(f)
  2718. if not cookies:
  2719. return None
  2720. for cookie in cookies:
  2721. if cookie.name == 'm_session_id' and cookie.is_expired() and \
  2722. not ModelScopeConfig.cookie_expired_warning:
  2723. ModelScopeConfig.cookie_expired_warning = True
  2724. logger.info('Not logged-in, you can login for uploading'
  2725. 'or accessing controlled entities.')
  2726. return None
  2727. return cookies
  2728. return None
  2729. @staticmethod
  2730. def get_user_session_id():
  2731. session_path = os.path.join(ModelScopeConfig.path_credential,
  2732. ModelScopeConfig.USER_SESSION_ID_FILE_NAME)
  2733. session_id = ''
  2734. if os.path.exists(session_path):
  2735. with open(session_path, 'rb') as f:
  2736. session_id = str(f.readline().strip(), encoding='utf-8')
  2737. return session_id
  2738. if session_id == '' or len(session_id) != 32:
  2739. session_id = str(uuid.uuid4().hex)
  2740. ModelScopeConfig.make_sure_credential_path_exist()
  2741. with open(session_path, 'w+') as wf:
  2742. wf.write(session_id)
  2743. return session_id
  2744. @staticmethod
  2745. def save_token(token: str):
  2746. ModelScopeConfig.make_sure_credential_path_exist()
  2747. with open(
  2748. os.path.join(ModelScopeConfig.path_credential,
  2749. ModelScopeConfig.GIT_TOKEN_FILE_NAME), 'w+') as f:
  2750. f.write(token)
  2751. @staticmethod
  2752. def save_user_info(user_name: str, user_email: str):
  2753. ModelScopeConfig.make_sure_credential_path_exist()
  2754. with open(
  2755. os.path.join(ModelScopeConfig.path_credential,
  2756. ModelScopeConfig.USER_INFO_FILE_NAME), 'w+') as f:
  2757. f.write('%s:%s' % (user_name, user_email))
  2758. @staticmethod
  2759. def get_user_info() -> Tuple[str, str]:
  2760. try:
  2761. with open(
  2762. os.path.join(ModelScopeConfig.path_credential,
  2763. ModelScopeConfig.USER_INFO_FILE_NAME),
  2764. 'r',
  2765. encoding='utf-8') as f:
  2766. info = f.read()
  2767. return info.split(':')[0], info.split(':')[1]
  2768. except FileNotFoundError:
  2769. pass
  2770. return None, None
  2771. @staticmethod
  2772. def get_token() -> Optional[str]:
  2773. """
  2774. Get token or None if not existent.
  2775. Returns:
  2776. `str` or `None`: The token, `None` if it doesn't exist.
  2777. """
  2778. token = None
  2779. try:
  2780. with open(
  2781. os.path.join(ModelScopeConfig.path_credential,
  2782. ModelScopeConfig.GIT_TOKEN_FILE_NAME),
  2783. 'r',
  2784. encoding='utf-8') as f:
  2785. token = f.read()
  2786. except FileNotFoundError:
  2787. pass
  2788. return token
  2789. @staticmethod
  2790. def get_user_agent(user_agent: Union[Dict, str, None] = None, ) -> str:
  2791. """Formats a user-agent string with basic info about a request.
  2792. Args:
  2793. user_agent (`str`, `dict`, *optional*):
  2794. The user agent info in the form of a dictionary or a single string.
  2795. Returns:
  2796. The formatted user-agent string.
  2797. """
  2798. # include some more telemetrics when executing in dedicated
  2799. # cloud containers
  2800. env = 'custom'
  2801. if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ:
  2802. env = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT]
  2803. user_name = 'unknown'
  2804. if MODELSCOPE_CLOUD_USERNAME in os.environ:
  2805. user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
  2806. from modelscope import __version__
  2807. ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
  2808. __version__,
  2809. platform.python_version(),
  2810. ModelScopeConfig.get_user_session_id(),
  2811. platform.platform(),
  2812. platform.processor(),
  2813. env,
  2814. user_name,
  2815. )
  2816. if isinstance(user_agent, dict):
  2817. ua += '; ' + '; '.join(f'{k}/{v}' for k, v in user_agent.items())
  2818. elif isinstance(user_agent, str):
  2819. ua += '; ' + user_agent
  2820. return ua
  2821. class UploadingCheck:
  2822. """
  2823. Check the files and folders to be uploaded.
  2824. Args:
  2825. max_file_count (int): The maximum number of files to be uploaded. Default to `UPLOAD_MAX_FILE_COUNT`.
  2826. max_file_count_in_dir (int): The maximum number of files in a directory.
  2827. Default to `UPLOAD_MAX_FILE_COUNT_IN_DIR`.
  2828. max_file_size (int): The maximum size of a single file in bytes. Default to `UPLOAD_MAX_FILE_SIZE`.
  2829. size_threshold_to_enforce_lfs (int): The size threshold to enforce LFS in bytes.
  2830. Files larger than this size will be enforced to be uploaded via LFS.
  2831. Default to `UPLOAD_SIZE_THRESHOLD_TO_ENFORCE_LFS`.
  2832. normal_file_size_total_limit (int): The total size limit of normal files in bytes.
  2833. Default to `UPLOAD_NORMAL_FILE_SIZE_TOTAL_LIMIT`.
  2834. Examples:
  2835. >>> from modelscope.hub.api import UploadingCheck
  2836. >>> upload_checker = UploadingCheck()
  2837. >>> upload_checker.check_file('/path/to/your/file.txt')
  2838. >>> upload_checker.check_folder('/path/to/your/folder')
  2839. >>> is_lfs = upload_checker.is_lfs('/path/to/your/file.txt', repo_type='model')
  2840. >>> print(f'Is LFS: {is_lfs}')
  2841. """
  2842. def __init__(
  2843. self,
  2844. max_file_count: int = UPLOAD_MAX_FILE_COUNT,
  2845. max_file_count_in_dir: int = UPLOAD_MAX_FILE_COUNT_IN_DIR,
  2846. max_file_size: int = UPLOAD_MAX_FILE_SIZE,
  2847. size_threshold_to_enforce_lfs: int = UPLOAD_SIZE_THRESHOLD_TO_ENFORCE_LFS,
  2848. normal_file_size_total_limit: int = UPLOAD_NORMAL_FILE_SIZE_TOTAL_LIMIT,
  2849. ):
  2850. self.max_file_count = max_file_count
  2851. self.max_file_count_in_dir = max_file_count_in_dir
  2852. self.max_file_size = max_file_size
  2853. self.size_threshold_to_enforce_lfs = size_threshold_to_enforce_lfs
  2854. self.normal_file_size_total_limit = normal_file_size_total_limit
  2855. def check_file(self, file_path_or_obj) -> None:
  2856. """
  2857. Check a single file to be uploaded.
  2858. Args:
  2859. file_path_or_obj (Union[str, Path, bytes, BinaryIO]): The file path or file-like object to be checked.
  2860. Raises:
  2861. ValueError: If the file does not exist or exceeds the size limit.
  2862. """
  2863. if isinstance(file_path_or_obj, (str, Path)):
  2864. if not os.path.exists(file_path_or_obj):
  2865. raise ValueError(f'File {file_path_or_obj} does not exist')
  2866. file_size: int = get_file_size(file_path_or_obj)
  2867. if file_size > self.max_file_size:
  2868. logger.warning(f'File exceeds size limit: {self.max_file_size / (1024 ** 3)} GB, '
  2869. f'got {round(file_size / (1024 ** 3), 4)} GB')
  2870. def check_folder(self, folder_path: Union[str, Path]):
  2871. """
  2872. Check a folder to be uploaded.
  2873. Args:
  2874. folder_path (Union[str, Path]): The folder path to be checked.
  2875. Raises:
  2876. ValueError: If the folder does not exist or exceeds the file count limit.
  2877. """
  2878. file_count = 0
  2879. dir_count = 0
  2880. if isinstance(folder_path, str):
  2881. folder_path = Path(folder_path)
  2882. for item in folder_path.iterdir():
  2883. if item.is_file():
  2884. file_count += 1
  2885. item_size: int = get_file_size(item)
  2886. if item_size > self.max_file_size:
  2887. logger.warning(f'File {item} exceeds size limit: {self.max_file_size / (1024 ** 3)} GB',
  2888. f'got {round(item_size / (1024 ** 3), 4)} GB')
  2889. elif item.is_dir():
  2890. dir_count += 1
  2891. # Count items in subdirectories recursively
  2892. sub_file_count, sub_dir_count = self.check_folder(item)
  2893. if (sub_file_count + sub_dir_count) > self.max_file_count_in_dir:
  2894. raise ValueError(f'Directory {item} contains {sub_file_count + sub_dir_count} items '
  2895. f'and exceeds limit: {self.max_file_count_in_dir}')
  2896. file_count += sub_file_count
  2897. dir_count += sub_dir_count
  2898. if file_count > self.max_file_count:
  2899. raise ValueError(f'Total file count {file_count} and exceeds limit: {self.max_file_count}')
  2900. return file_count, dir_count
  2901. def is_lfs(self, file_path_or_obj: Union[str, Path, bytes, BinaryIO], repo_type: str) -> bool:
  2902. """
  2903. Check if a file should be uploaded via LFS.
  2904. Args:
  2905. file_path_or_obj (Union[str, Path, bytes, BinaryIO]): The file path or file-like object to be checked.
  2906. repo_type (str): The repo type, either `model` or `dataset`.
  2907. Returns:
  2908. bool: True if the file should be uploaded via LFS, False otherwise.
  2909. """
  2910. hit_lfs_suffix = True
  2911. if isinstance(file_path_or_obj, (str, Path)):
  2912. file_path_or_obj = Path(file_path_or_obj)
  2913. if not file_path_or_obj.exists():
  2914. raise ValueError(f'File {file_path_or_obj} does not exist')
  2915. if repo_type == REPO_TYPE_MODEL:
  2916. if file_path_or_obj.suffix not in MODEL_LFS_SUFFIX:
  2917. hit_lfs_suffix = False
  2918. elif repo_type == REPO_TYPE_DATASET:
  2919. if file_path_or_obj.suffix not in DATASET_LFS_SUFFIX:
  2920. hit_lfs_suffix = False
  2921. else:
  2922. raise ValueError(f'Invalid repo type: {repo_type}, supported repos: {REPO_TYPE_SUPPORT}')
  2923. file_size: int = get_file_size(file_path_or_obj)
  2924. return file_size > self.size_threshold_to_enforce_lfs or hit_lfs_suffix
  2925. def check_normal_files(self, file_path_list: List[Union[str, Path]], repo_type: str) -> None:
  2926. """
  2927. Check a list of normal files to be uploaded.
  2928. Args:
  2929. file_path_list (List[Union[str, Path]]): The list of file paths to be checked.
  2930. repo_type (str): The repo type, either `model` or `dataset`.
  2931. Raises:
  2932. ValueError: If the total size of normal files exceeds the limit.
  2933. Returns: None
  2934. """
  2935. normal_file_list = [item for item in file_path_list if not self.is_lfs(item, repo_type)]
  2936. total_size = sum([get_file_size(item) for item in normal_file_list])
  2937. if total_size > self.normal_file_size_total_limit:
  2938. raise ValueError(f'Total size of non-lfs files {total_size / (1024 * 1024)}MB '
  2939. f'and exceeds limit: {self.normal_file_size_total_limit / (1024 * 1024)}MB')