| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522 |
- # -------------------------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # Licensed under the MIT License.
- # --------------------------------------------------------------------------
- import argparse
- import csv
- import os
- import statistics
- import sys
- import time
- from pathlib import Path
- import coloredlogs
- # import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package.
- import torch
- from benchmark_helper import measure_memory
# Short version label -> model identifier in "organization/name" form.
# NOTE(review): "xl-1.0" points at the "refiner" repository rather than a base model — confirm this is intended.
SD_MODELS = {
    # Stable Diffusion 1.x / 2.x
    "1.5": "runwayml/stable-diffusion-v1-5",
    "2.0": "stabilityai/stable-diffusion-2",
    "2.1": "stabilityai/stable-diffusion-2-1",
    # Stable Diffusion XL
    "xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0",
    # Stable Diffusion 3.x (M = medium, L = large, per the repository names)
    "3.0M": "stabilityai/stable-diffusion-3-medium-diffusers",
    "3.5M": "stabilityai/stable-diffusion-3.5-medium",
    "3.5L": "stabilityai/stable-diffusion-3.5-large",
    # FLUX.1 (S = schnell, D = dev, per the repository names)
    "Flux.1S": "black-forest-labs/FLUX.1-schnell",
    "Flux.1D": "black-forest-labs/FLUX.1-dev",
}
# Short provider label -> execution provider name (each value ends in "ExecutionProvider").
PROVIDERS = {
    "cuda": "CUDAExecutionProvider",
    "rocm": "ROCMExecutionProvider",
    "migraphx": "MIGraphXExecutionProvider",
    "tensorrt": "TensorrtExecutionProvider",
}
def example_prompts():
    """Return the fixed benchmark prompts and their shared negative prompt.

    Returns:
        A ``(prompts, negative_prompt)`` tuple: a list of ten prompt strings and
        a single negative-prompt string.
    """
    negative = "bad composition, ugly, abnormal, malformed"
    texts = [
        "a photo of an astronaut riding a horse on mars",
        "cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
        "a cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital painting",
        "an illustration of a house with large barn with many cute flower pots and beautiful blue sky scenery",
        "one apple sitting on a table, still life, reflective, full color photograph, centered, close-up product",
        "background texture of stones, masterpiece, artistic, stunning photo, award winner photo",
        "new international organic style house, tropical surroundings, architecture, 8k, hdr",
        "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
        "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
        "delicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8k",
    ]
    return texts, negative
def warmup_prompts():
    """Return the ``(prompt, negative_prompt)`` pair used for warm-up runs."""
    prompt, negative = "warm up", "bad"
    return prompt, negative
def measure_gpu_memory(monitor_type, func, start_memory=None):
    """Run ``func`` through ``measure_memory`` with ``is_gpu=True``.

    Args:
        monitor_type: Forwarded unchanged as ``monitor_type``.
        func: Callable forwarded unchanged as ``func``.
        start_memory: Forwarded unchanged as ``start_memory``.

    Returns:
        Whatever ``measure_memory`` returns.
    """
    forwarded = {"monitor_type": monitor_type, "start_memory": start_memory}
    return measure_memory(is_gpu=True, func=func, **forwarded)
def get_ort_pipeline(model_name: str, directory: str, provider, disable_safety_checker: bool):
    """Create a diffusers ``OnnxStableDiffusionPipeline`` with a DDIM scheduler.

    Args:
        model_name: Model identifier, used only when ``directory`` is None
            (loaded with ``revision="onnx"``).
        directory: Local model directory, or None. When given, it must exist.
        provider: Passed through as the pipeline's ``provider`` argument.
        disable_safety_checker: When true, the safety checker and feature
            extractor are removed from the pipeline.

    Returns:
        The configured pipeline, with its progress bar disabled.
    """
    from diffusers import DDIMScheduler, OnnxStableDiffusionPipeline  # noqa: PLC0415

    import onnxruntime  # noqa: PLC0415

    if directory is None:
        # No local copy: load by model name.
        pipe = OnnxStableDiffusionPipeline.from_pretrained(
            model_name,
            revision="onnx",
            provider=provider,
            use_auth_token=True,
        )
    else:
        assert os.path.exists(directory)
        local_options = {"provider": provider, "sess_options": onnxruntime.SessionOptions()}
        pipe = OnnxStableDiffusionPipeline.from_pretrained(directory, **local_options)

    # Replace the scheduler with DDIM built from the current scheduler's config.
    ddim = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe.scheduler = ddim
    pipe.set_progress_bar_config(disable=True)

    if disable_safety_checker:
        pipe.safety_checker = None
        pipe.feature_extractor = None

    return pipe
def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_torch_compile: bool, use_xformers: bool):
    """Create a PyTorch diffusers pipeline on the "cuda" device for ``model_name``.

    The pipeline class is chosen from the model name: ``FluxPipeline`` when it
    contains "FLUX", ``StableDiffusion3Pipeline`` when it contains
    "stable-diffusion-3", and ``StableDiffusionPipeline`` otherwise.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        disable_safety_checker: Only applied on the ``StableDiffusionPipeline``
            path; removes the safety checker and feature extractor.
        enable_torch_compile: Compile the transformer (Flux / SD3) or the unet,
            vae and text encoder (other models) with ``torch.compile``.
        use_xformers: Only applied on the ``StableDiffusionPipeline`` path;
            enables xformers memory-efficient attention.

    Returns:
        The configured pipeline.
    """
    # Flux and SD3 share the same setup: bfloat16 weights and an optional compiled transformer.
    transformer_pipeline_cls = None
    if "FLUX" in model_name:
        from diffusers import FluxPipeline  # noqa: PLC0415

        transformer_pipeline_cls = FluxPipeline
    elif "stable-diffusion-3" in model_name:
        from diffusers import StableDiffusion3Pipeline  # noqa: PLC0415

        transformer_pipeline_cls = StableDiffusion3Pipeline

    if transformer_pipeline_cls is not None:
        pipe = transformer_pipeline_cls.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda")
        if enable_torch_compile:
            pipe.transformer.to(memory_format=torch.channels_last)
            pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
        return pipe

    from diffusers import DDIMScheduler, StableDiffusionPipeline  # noqa: PLC0415

    pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")
    pipe.unet.to(memory_format=torch.channels_last)  # in-place operation

    if use_xformers:
        pipe.enable_xformers_memory_efficient_attention()

    if enable_torch_compile:
        for component in ("unet", "vae", "text_encoder"):
            setattr(pipe, component, torch.compile(getattr(pipe, component)))
        print("Torch compiled unet, vae and text_encoder")

    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe.set_progress_bar_config(disable=True)

    if disable_safety_checker:
        pipe.safety_checker = None
        pipe.feature_extractor = None

    return pipe
def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, steps: int, disable_safety_checker: bool):
    """Build the filename prefix used for images saved by a benchmark run.

    Args:
        engine: Engine label placed first in the prefix.
        model_name: Model identifier; only the part after the last "/" is kept,
            and "stable-diffusion-" is shortened to "sd".
        batch_size: Written as ``b<batch_size>``.
        steps: Written as ``s<steps>``.
        disable_safety_checker: When false, "_safe" is appended.

    Returns:
        A string of the form ``<engine>_<short model>_b<batch>_s<steps>[_safe]``.
    """
    last_segment = model_name.rsplit("/", 1)[-1]
    short_model_name = last_segment.replace("stable-diffusion-", "sd")
    suffix = "" if disable_safety_checker else "_safe"
    return f"{engine}_{short_model_name}_b{batch_size}_s{steps}{suffix}"
def run_ort_pipeline(
    pipe,
    batch_size: int,
    image_filename_prefix: str,
    height,
    width,
    steps,
    num_prompts,
    batch_count,
    start_memory,
    memory_monitor_type,
    skip_warmup: bool = False,
):
    """Benchmark a diffusers ``OnnxStableDiffusionPipeline`` on the example prompts.

    Warm-up is run three times (twice under GPU memory measurement, once
    plain). Then the first ``num_prompts`` example prompts are timed one by
    one, and every generated image is saved as
    ``<image_filename_prefix>_<prompt index>_<image index>.jpg``.

    Args:
        pipe: The pipeline to run; must be an ``OnnxStableDiffusionPipeline``.
        batch_size: Number of copies of each prompt per call.
        image_filename_prefix: Prefix of the saved image files.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Passed as ``num_inference_steps``.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        skip_warmup: When true, the warm-up callable does nothing.

    Returns:
        A dict with the run settings, average and median latency in seconds,
        and the memory measured for the first two warm-up runs.
    """
    from diffusers import OnnxStableDiffusionPipeline  # noqa: PLC0415

    assert isinstance(pipe, OnnxStableDiffusionPipeline)

    prompts, negative_prompt = example_prompts()

    # Arguments shared by the warm-up call and every timed call.
    common_args = {"height": height, "width": width, "num_inference_steps": steps}

    def warmup():
        if not skip_warmup:
            warm_prompt, warm_negative = warmup_prompts()
            pipe(
                prompt=[warm_prompt] * batch_size,
                negative_prompt=[warm_negative] * batch_size,
                **common_args,
            )

    # Measure GPU memory of two warm-up runs: the first run has algo search
    # (cuDNN/MIOpen), so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    latency_list = []
    for prompt_index, text in enumerate(prompts[: max(num_prompts, 0)]):
        started = time.time()
        images = pipe(
            prompt=[text] * batch_size,
            negative_prompt=[negative_prompt] * batch_size,
            **common_args,
        ).images
        elapsed = time.time() - started

        latency_list.append(elapsed)
        print(f"Inference took {elapsed:.3f} seconds")

        for image_index, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{prompt_index}_{image_index}.jpg")

    from onnxruntime import __version__ as ort_version  # noqa: PLC0415

    average_latency = sum(latency_list) / len(latency_list)
    median_latency = statistics.median(latency_list)

    return {
        "engine": "onnxruntime",
        "version": ort_version,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": average_latency,
        "median_latency": median_latency,
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
    }
def get_negative_prompt_kwargs(negative_prompt, use_num_images_per_prompt, is_flux, batch_size) -> dict:
    """Build the optional keyword arguments for a pipeline call.

    Args:
        negative_prompt: The negative prompt text.
        use_num_images_per_prompt: When true, the negative prompt is passed as
            a single value; otherwise as a list of ``batch_size`` copies.
        is_flux: When true, no negative prompt is added (Flux does not support
            negative prompt).
        batch_size: Number of copies used for the list form.

    Returns:
        A dict holding ``negative_prompt`` (unless ``is_flux``) and, when CUDA
        is available, a ``generator`` seeded with 123.
    """
    kwargs = {}

    if not is_flux:
        if use_num_images_per_prompt:
            kwargs["negative_prompt"] = negative_prompt
        else:
            kwargs["negative_prompt"] = [negative_prompt] * batch_size

    # Fix the random seed so that we can inspect the output quality easily.
    if torch.cuda.is_available():
        seeded = torch.Generator(device="cuda").manual_seed(123)
        kwargs["generator"] = seeded

    return kwargs
def run_torch_pipeline(
    pipe,
    batch_size: int,
    image_filename_prefix: str,
    height,
    width,
    steps,
    num_prompts,
    batch_count,
    start_memory,
    memory_monitor_type,
    skip_warmup=False,
):
    """Benchmark a PyTorch diffusers pipeline on the example prompts.

    Warm-up is run three times (twice under GPU memory measurement, once
    plain), gradients are then disabled globally, and the first
    ``num_prompts`` example prompts are timed between
    ``torch.cuda.synchronize()`` calls. Every generated image is saved as
    ``<image_filename_prefix>_<prompt index>_<image index>.jpg``.

    Args:
        pipe: The pipeline to run. A ``diffusers.FluxPipeline`` gets no
            negative prompt.
        batch_size: Number of copies of each prompt per call.
        image_filename_prefix: Prefix of the saved image files.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Passed as ``num_inference_steps``.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        skip_warmup: When true, the warm-up callable does nothing.

    Returns:
        A dict with the run settings, average and median latency in seconds,
        and the memory measured for the first two warm-up runs.
    """
    prompts, negative_prompt = example_prompts()

    import diffusers  # noqa: PLC0415

    is_flux = isinstance(pipe, diffusers.FluxPipeline)

    # Arguments shared by the warm-up call and every timed call.
    common_args = {"height": height, "width": width, "num_inference_steps": steps}

    def warmup():
        if not skip_warmup:
            warm_prompt, warm_negative = warmup_prompts()
            warm_kwargs = get_negative_prompt_kwargs(warm_negative, False, is_flux, batch_size)
            pipe(prompt=[warm_prompt] * batch_size, **common_args, **warm_kwargs)

    # Measure GPU memory of two warm-up runs: the first run has cuDNN algo
    # search, so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    torch.set_grad_enabled(False)

    latency_list = []
    for prompt_index, text in enumerate(prompts[: max(num_prompts, 0)]):
        torch.cuda.synchronize()
        started = time.time()

        # Built inside the timed region, so its cost is part of the measured
        # latency; a fresh seeded generator is created for every prompt.
        call_kwargs = get_negative_prompt_kwargs(negative_prompt, False, is_flux, batch_size)
        images = pipe(prompt=[text] * batch_size, **common_args, **call_kwargs).images

        torch.cuda.synchronize()
        elapsed = time.time() - started

        latency_list.append(elapsed)
        print(f"Inference took {elapsed:.3f} seconds")

        for image_index, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{prompt_index}_{image_index}.jpg")

    average_latency = sum(latency_list) / len(latency_list)
    median_latency = statistics.median(latency_list)

    return {
        "engine": "torch",
        "version": torch.__version__,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": average_latency,
        "median_latency": median_latency,
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
    }
def run_ort(
    model_name: str,
    directory: str,
    provider: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    tuning: bool,
    skip_warmup: bool = False,
):
    """Load a diffusers ONNX pipeline, benchmark it, and return the result record.

    Args:
        model_name: Model identifier; also used to name the saved images.
        directory: Local model directory, or None.
        provider: Execution provider name.
        batch_size: Number of copies of each prompt per call.
        disable_safety_checker: Whether to remove the safety checker.
        height: Image height.
        width: Image width.
        steps: Number of inference steps.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        tuning: When true and the provider is CUDA or ROCm, the provider is
            passed together with the tunable-op options.
        skip_warmup: When true, warm-up runs are skipped.

    Returns:
        The dict from ``run_ort_pipeline`` extended with ``model_name``,
        ``directory``, ``provider`` (without the "ExecutionProvider" suffix),
        ``disable_safety_checker`` and ``enable_cuda_graph`` (always False).
    """
    tunable_providers = ("CUDAExecutionProvider", "ROCMExecutionProvider")
    if tuning and provider in tunable_providers:
        provider_and_options = (provider, {"tunable_op_enable": 1, "tunable_op_tuning_enable": 1})
    else:
        provider_and_options = provider

    load_start = time.time()
    pipe = get_ort_pipeline(model_name, directory, provider_and_options, disable_safety_checker)
    load_seconds = time.time() - load_start
    print(f"Model loading took {load_seconds} seconds")

    image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, steps, disable_safety_checker)

    result = run_ort_pipeline(
        pipe,
        batch_size,
        image_filename_prefix,
        height,
        width,
        steps,
        num_prompts,
        batch_count,
        start_memory,
        memory_monitor_type,
        skip_warmup=skip_warmup,
    )

    result["model_name"] = model_name
    result["directory"] = directory
    result["provider"] = provider.replace("ExecutionProvider", "")
    result["disable_safety_checker"] = disable_safety_checker
    result["enable_cuda_graph"] = False
    return result
def get_optimum_ort_pipeline(
    model_name: str,
    directory: str,
    provider="CUDAExecutionProvider",
    disable_safety_checker: bool = True,
    use_io_binding: bool = False,
):
    """Load, or export, an Optimum ``ORTPipelineForText2Image``.

    When ``directory`` names an existing path, the pipeline is loaded from it.
    Otherwise the pipeline is created from ``model_name`` with ``export=True``
    and, if a ``directory`` was given, saved there.

    Args:
        model_name: Model identifier used when exporting.
        directory: Directory to load from / save to. May be None, in which case
            the exported pipeline is not saved.
        provider: Passed through as the pipeline's ``provider`` argument.
        disable_safety_checker: When true, the safety checker and feature
            extractor are removed from the pipeline.
        use_io_binding: Passed through as ``use_io_binding``.

    Returns:
        The configured pipeline.
    """
    from optimum.onnxruntime import ORTPipelineForText2Image  # noqa: PLC0415

    if directory is not None and os.path.exists(directory):
        pipeline = ORTPipelineForText2Image.from_pretrained(directory, provider=provider, use_io_binding=use_io_binding)
    else:
        pipeline = ORTPipelineForText2Image.from_pretrained(
            model_name,
            export=True,
            provider=provider,
            use_io_binding=use_io_binding,
        )
        # Only persist the export when there is a target directory; without
        # this guard a missing directory would fail after the export finished.
        if directory:
            pipeline.save_pretrained(directory)

    if disable_safety_checker:
        pipeline.safety_checker = None
        pipeline.feature_extractor = None

    return pipeline
def run_optimum_ort_pipeline(
    pipe,
    batch_size: int,
    image_filename_prefix: str,
    height,
    width,
    steps,
    num_prompts,
    batch_count,
    start_memory,
    memory_monitor_type,
    use_num_images_per_prompt=False,
    skip_warmup=False,
):
    """Benchmark an Optimum ONNX Runtime pipeline on the example prompts.

    Warm-up is run three times (twice under GPU memory measurement, once
    plain). Then the first ``num_prompts`` example prompts are timed one by
    one, and every generated image is saved as
    ``<image_filename_prefix>_<prompt index>_<image index>.jpg``.

    Args:
        pipe: The pipeline to run. An ``ORTFluxPipeline`` gets no negative
            prompt.
        batch_size: Images generated per call.
        image_filename_prefix: Prefix of the saved image files.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Passed as ``num_inference_steps``.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        use_num_images_per_prompt: When true, a single prompt is passed with
            ``num_images_per_prompt=batch_size``; otherwise the prompt is
            repeated ``batch_size`` times.
        skip_warmup: When true, the warm-up callable does nothing.

    Returns:
        A dict with the run settings, average and median latency in seconds,
        and the memory measured for the first two warm-up runs.
    """
    print("Pipeline type", type(pipe))

    from optimum.onnxruntime.modeling_diffusion import ORTFluxPipeline  # noqa: PLC0415

    is_flux = isinstance(pipe, ORTFluxPipeline)
    prompts, negative_prompt = example_prompts()

    def run_once(text, extra_kwargs):
        # Single place that decides how the batch is expressed, so warm-up and
        # timed runs always use the same batch shape.
        if use_num_images_per_prompt:
            return pipe(
                prompt=text,
                height=height,
                width=width,
                num_inference_steps=steps,
                num_images_per_prompt=batch_size,
                **extra_kwargs,
            )
        return pipe(
            prompt=[text] * batch_size, height=height, width=width, num_inference_steps=steps, **extra_kwargs
        )

    def warmup():
        if skip_warmup:
            return
        prompt, negative = warmup_prompts()
        extra_kwargs = get_negative_prompt_kwargs(negative, use_num_images_per_prompt, is_flux, batch_size)
        # Warm up with batch_size images (not batch_count) so the warm-up and
        # the memory measurement match the timed runs below.
        run_once(prompt, extra_kwargs)

    # Run warm up, and measure GPU memory of two runs.
    # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    # Built once and reused for every timed call.
    extra_kwargs = get_negative_prompt_kwargs(negative_prompt, use_num_images_per_prompt, is_flux, batch_size)

    latency_list = []
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
        inference_start = time.time()
        images = run_once(prompt, extra_kwargs).images
        inference_end = time.time()

        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"Inference took {latency:.3f} seconds")

        for k, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    from onnxruntime import __version__ as ort_version  # noqa: PLC0415

    return {
        "engine": "optimum_ort",
        "version": ort_version,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
    }
def run_optimum_ort(
    model_name: str,
    directory: str,
    provider: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    use_io_binding: bool = False,
    skip_warmup: bool = False,
):
    """Load an Optimum ONNX Runtime pipeline, benchmark it, and return the result record.

    Args:
        model_name: Model identifier.
        directory: Model directory, or a falsy value. When set, its last path
            component is appended to the model name used for image filenames.
        provider: Execution provider name.
        batch_size: Images generated per call.
        disable_safety_checker: Whether to remove the safety checker.
        height: Image height.
        width: Image width.
        steps: Number of inference steps.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        use_io_binding: Passed through when the pipeline is created.
        skip_warmup: When true, warm-up runs are skipped.

    Returns:
        The dict from ``run_optimum_ort_pipeline`` extended with
        ``model_name``, ``directory``, ``provider`` (without the
        "ExecutionProvider" suffix), ``disable_safety_checker`` and
        ``enable_cuda_graph`` (always False).
    """
    load_start = time.time()
    pipe = get_optimum_ort_pipeline(
        model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
    )
    load_seconds = time.time() - load_start
    print(f"Model loading took {load_seconds} seconds")

    # Include the directory name so runs from different directories get different file names.
    if directory:
        full_model_name = model_name + "_" + Path(directory).name
    else:
        full_model_name = model_name

    image_filename_prefix = get_image_filename_prefix(
        "optimum", full_model_name, batch_size, steps, disable_safety_checker
    )

    result = run_optimum_ort_pipeline(
        pipe,
        batch_size,
        image_filename_prefix,
        height,
        width,
        steps,
        num_prompts,
        batch_count,
        start_memory,
        memory_monitor_type,
        skip_warmup=skip_warmup,
    )

    result["model_name"] = model_name
    result["directory"] = directory
    result["provider"] = provider.replace("ExecutionProvider", "")
    result["disable_safety_checker"] = disable_safety_checker
    result["enable_cuda_graph"] = False
    return result
def run_ort_trt_static(
    work_dir: str,
    version: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph: bool = True,
):
    """Benchmark the ORT TensorRT EP pipeline with static batch size and image shape.

    Builds the engines for the given batch size and image size, runs warm-up
    three times (twice under GPU memory measurement, once plain), then times
    the first ``num_prompts`` example prompts and saves every image as
    ``<prefix>_<prompt index>_<image index>.jpg``.

    Args:
        work_dir: Root directory from which the engine paths are derived.
        version: Version string used to build the ``PipelineInfo``.
        batch_size: Batch size; must not exceed ``max_batch_size``.
        disable_safety_checker: Recorded in the result and used in the image
            filename prefix.
        height: Image height.
        width: Image width.
        steps: Passed as ``denoising_steps``.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        max_batch_size: Passed to the pipeline constructor.
        nvtx_profile: Passed to the pipeline constructor.
        use_cuda_graph: Passed to the pipeline constructor and recorded in the
            result.

    Returns:
        A dict with the model name, engine and provider versions, run settings,
        average and median end-to-end latency in seconds, and the memory
        measured for the first two warm-up runs.
    """
    print("[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)")

    # Register TensorRT plugins
    from trt_utilities import init_trt_plugins  # noqa: PLC0415

    init_trt_plugins()

    assert batch_size <= max_batch_size

    from diffusion_models import PipelineInfo  # noqa: PLC0415

    pipeline_info = PipelineInfo(version)
    short_name = pipeline_info.short_name()

    from engine_builder import EngineType, get_engine_paths  # noqa: PLC0415
    from pipeline_stable_diffusion import StableDiffusionPipeline  # noqa: PLC0415

    engine_type = EngineType.ORT_TRT
    engine_paths = get_engine_paths(work_dir, pipeline_info, engine_type)
    onnx_dir, engine_dir, output_dir, framework_model_dir, _ = engine_paths

    # Initialize pipeline
    pipeline_options = {
        "scheduler": "DDIM",
        "output_dir": output_dir,
        "verbose": False,
        "nvtx_profile": nvtx_profile,
        "max_batch_size": max_batch_size,
        "use_cuda_graph": use_cuda_graph,
        "framework_model_dir": framework_model_dir,
        "engine_type": engine_type,
    }
    pipeline = StableDiffusionPipeline(pipeline_info, **pipeline_options)

    # Load TensorRT engines and pytorch modules
    pipeline.backend.build_engines(
        engine_dir,
        framework_model_dir,
        onnx_dir,
        17,
        opt_image_height=height,
        opt_image_width=width,
        opt_batch_size=batch_size,
        static_batch=True,
        static_image_shape=True,
        max_workspace_size=0,
        device_id=torch.cuda.current_device(),
    )

    # Batch and image size are static here, so resources only need to be allocated once.
    # With dynamic batch and image size, some cost (like memory allocation) would be part of the latency.
    pipeline.load_resources(height, width, batch_size)

    def warmup():
        warm_prompt, warm_negative = warmup_prompts()
        pipeline.run(
            [warm_prompt] * batch_size,
            [warm_negative] * batch_size,
            height,
            width,
            denoising_steps=steps,
        )

    # Measure GPU memory of two warm-up runs: the first run has algo search,
    # so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    image_filename_prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, steps, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for prompt_index, text in enumerate(prompts[: max(num_prompts, 0)]):
        started = time.time()
        images, pipeline_time = pipeline.run(
            [text] * batch_size,
            [negative_prompt] * batch_size,
            height,
            width,
            denoising_steps=steps,
            guidance=7.5,
            seed=123,
        )
        elapsed = time.time() - started

        latency_list.append(elapsed)
        print(f"End2End took {elapsed:.3f} seconds. Inference latency: {pipeline_time}")

        for image_index, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{prompt_index}_{image_index}.jpg")

    pipeline.teardown()

    from tensorrt import __version__ as trt_version  # noqa: PLC0415

    from onnxruntime import __version__ as ort_version  # noqa: PLC0415

    average_latency = sum(latency_list) / len(latency_list)
    median_latency = statistics.median(latency_list)

    return {
        "model_name": pipeline_info.name(),
        "engine": "onnxruntime",
        "version": ort_version,
        "provider": f"tensorrt({trt_version})",
        "directory": engine_dir,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": average_latency,
        "median_latency": median_latency,
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "disable_safety_checker": disable_safety_checker,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_tensorrt_static(
    work_dir: str,
    version: str,
    model_name: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph: bool = True,
    skip_warmup: bool = False,
):
    """Benchmark the TensorRT pipeline with static batch size and image shape.

    Loads the engines for the given batch size and image size, runs warm-up
    three times (twice under GPU memory measurement, once plain), then times
    the first ``num_prompts`` example prompts and saves every image as
    ``<prefix>_<prompt index>_<image index>.jpg``.

    Args:
        work_dir: Root directory from which the engine paths are derived.
        version: Version string used to build the ``PipelineInfo``.
        model_name: Used in the image filename prefix.
        batch_size: Batch size; must not exceed ``max_batch_size``.
        disable_safety_checker: Used in the image filename prefix.
        height: Image height.
        width: Image width.
        steps: Passed as ``denoising_steps``.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        max_batch_size: Passed to the pipeline constructor.
        nvtx_profile: Passed to the pipeline constructor.
        use_cuda_graph: Passed to the pipeline constructor and recorded in the
            result as ``enable_cuda_graph``.
        skip_warmup: When true, the warm-up callable does nothing.

    Returns:
        A dict with the engine version, run settings, average and median
        end-to-end latency in seconds, and the memory measured for the first
        two warm-up runs.
    """
    print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")

    from cuda import cudart  # noqa: PLC0415

    # Register TensorRT plugins
    from trt_utilities import init_trt_plugins  # noqa: PLC0415

    init_trt_plugins()

    assert batch_size <= max_batch_size

    from diffusion_models import PipelineInfo  # noqa: PLC0415

    pipeline_info = PipelineInfo(version)

    from engine_builder import EngineType, get_engine_paths  # noqa: PLC0415
    from pipeline_stable_diffusion import StableDiffusionPipeline  # noqa: PLC0415

    engine_type = EngineType.TRT
    onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
        work_dir, pipeline_info, engine_type
    )

    # Initialize pipeline.
    # Honor the caller's use_cuda_graph (it used to be forced to True), so the
    # "enable_cuda_graph" value reported below matches what actually ran.
    pipeline = StableDiffusionPipeline(
        pipeline_info,
        scheduler="DDIM",
        output_dir=output_dir,
        verbose=False,
        nvtx_profile=nvtx_profile,
        max_batch_size=max_batch_size,
        use_cuda_graph=use_cuda_graph,
        engine_type=engine_type,
    )

    # Load TensorRT engines and pytorch modules
    pipeline.backend.load_engines(
        engine_dir=engine_dir,
        framework_model_dir=framework_model_dir,
        onnx_dir=onnx_dir,
        onnx_opset=17,
        opt_batch_size=batch_size,
        opt_image_height=height,
        opt_image_width=width,
        static_batch=True,
        static_shape=True,
        enable_all_tactics=False,
        timing_cache=timing_cache,
    )

    # Activate engines.
    # NOTE(review): both max() operands are the same call, so this is just the
    # backend's max_device_memory(); kept as-is to avoid changing call counts.
    max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
    # NOTE(review): the status returned by cudaMalloc is not checked here.
    _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
    pipeline.backend.activate_engines(shared_device_memory)

    # Batch and image size are static here, so resources only need to be allocated once.
    # With dynamic batch and image size, some cost (like memory allocation) would be part of the latency.
    pipeline.load_resources(height, width, batch_size)

    def warmup():
        if skip_warmup:
            return
        prompt, negative = warmup_prompts()
        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)

    # Run warm up, and measure GPU memory of two runs.
    # The first run has algo search so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, steps, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
        inference_start = time.time()
        images, pipeline_time = pipeline.run(
            [prompt] * batch_size,
            [negative_prompt] * batch_size,
            height,
            width,
            denoising_steps=steps,
            seed=123,
        )
        inference_end = time.time()

        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")

        for k, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    pipeline.teardown()

    import tensorrt as trt  # noqa: PLC0415

    return {
        "engine": "tensorrt",
        "version": trt.__version__,
        "provider": "default",
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_tensorrt_static_xl(
    work_dir: str,
    version: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph=True,
    skip_warmup: bool = False,
):
    """Benchmark the TensorRT Stable Diffusion XL pipeline with static batch size and image shape.

    Loads the engines for the given batch size and image size, runs warm-up
    three times (twice under GPU memory measurement, once plain), then times
    the first ``num_prompts`` example prompts and saves every image as
    ``<prefix>_<prompt index>_<image index>.png``.

    Args:
        work_dir: Root directory from which the engine paths are derived.
        version: Version string used to build the ``PipelineInfo``.
        batch_size: Batch size; must not exceed ``max_batch_size``.
        disable_safety_checker: Used in the image filename prefix.
        height: Image height; must be divisible by 8.
        width: Image width; must be divisible by 8.
        steps: Passed as ``denoising_steps``.
        num_prompts: Maximum number of example prompts to run.
        batch_count: Only recorded in the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        max_batch_size: Passed to the pipeline constructor.
        nvtx_profile: Passed to the pipeline constructor.
        use_cuda_graph: Passed to the pipeline constructor and recorded in the
            result.
        skip_warmup: When true, the warm-up callable does nothing.

    Returns:
        A dict with the model name, engine version, run settings, average and
        median end-to-end latency in seconds, and the memory measured for the
        first two warm-up runs.

    Raises:
        ValueError: If ``height`` or ``width`` is not divisible by 8.
    """
    print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")

    import tensorrt as trt  # noqa: PLC0415
    from cuda import cudart  # noqa: PLC0415
    from trt_utilities import init_trt_plugins  # noqa: PLC0415

    # Validate image dimensions
    image_height = height
    image_width = width
    if image_height % 8 or image_width % 8:
        raise ValueError(
            f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}."
        )

    # Register TensorRT plugins
    init_trt_plugins()

    assert batch_size <= max_batch_size

    from diffusion_models import PipelineInfo  # noqa: PLC0415
    from engine_builder import EngineType, get_engine_paths  # noqa: PLC0415

    def build_pipeline(pipeline_class, info):
        # Create the pipeline object and load its TensorRT engines.
        engine_type = EngineType.TRT
        engine_paths = get_engine_paths(work_dir, info, engine_type)
        onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = engine_paths

        created = pipeline_class(
            info,
            scheduler="DDIM",
            output_dir=output_dir,
            verbose=False,
            nvtx_profile=nvtx_profile,
            max_batch_size=max_batch_size,
            use_cuda_graph=use_cuda_graph,
            framework_model_dir=framework_model_dir,
            engine_type=engine_type,
        )

        created.backend.load_engines(
            engine_dir=engine_dir,
            framework_model_dir=framework_model_dir,
            onnx_dir=onnx_dir,
            onnx_opset=17,
            opt_batch_size=batch_size,
            opt_image_height=height,
            opt_image_width=width,
            static_batch=True,
            static_shape=True,
            enable_all_tactics=False,
            timing_cache=timing_cache,
        )
        return created

    from pipeline_stable_diffusion import StableDiffusionPipeline  # noqa: PLC0415

    pipeline_info = PipelineInfo(version)
    pipeline = build_pipeline(StableDiffusionPipeline, pipeline_info)

    # NOTE(review): both max() operands are the same call, so this is just the
    # backend's max_device_memory(); kept as two calls to preserve behavior.
    max_device_memory = max(pipeline.backend.max_device_memory(), pipeline.backend.max_device_memory())
    _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
    pipeline.backend.activate_engines(shared_device_memory)

    # Batch and image size are static here, so resources only need to be allocated once.
    # With dynamic batch and image size, some cost (like memory allocation) would be part of the latency.
    pipeline.load_resources(image_height, image_width, batch_size)

    def infer(prompt_batch, negative_batch, seed=None):
        # One pipeline call with the fixed image size, step count and guidance scale.
        return pipeline.run(
            prompt_batch,
            negative_batch,
            image_height,
            image_width,
            denoising_steps=steps,
            guidance=5.0,
            seed=seed,
        )

    def warmup():
        if not skip_warmup:
            warm_prompt, warm_negative = warmup_prompts()
            infer([warm_prompt] * batch_size, [warm_negative] * batch_size)

    # Measure GPU memory of two warm-up runs: the first run has algo search,
    # so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    model_name = pipeline_info.name()
    image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, steps, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for prompt_index, text in enumerate(prompts[: max(num_prompts, 0)]):
        started = time.time()
        images, pipeline_time = infer([text] * batch_size, [negative_prompt] * batch_size, seed=123)
        elapsed = time.time() - started

        latency_list.append(elapsed)
        print(f"End2End took {elapsed:.3f} seconds. Inference latency: {pipeline_time}")

        for image_index, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{prompt_index}_{image_index}.png")

    pipeline.teardown()

    average_latency = sum(latency_list) / len(latency_list)
    median_latency = statistics.median(latency_list)

    return {
        "model_name": model_name,
        "engine": "tensorrt",
        "version": trt.__version__,
        "provider": "default",
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": average_latency,
        "median_latency": median_latency,
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_ort_trt_xl(
    work_dir: str,
    version: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph=True,
    skip_warmup: bool = False,
):
    """Benchmark a pipeline created by ``initialize_pipeline`` with ``EngineType.ORT_TRT``.

    The pipeline is initialized once for the given static image size, warmed up
    (unless ``skip_warmup``) while GPU memory is measured, and then run once per
    example prompt with a fixed seed. Every generated image is saved as a PNG.

    Args:
        work_dir: Forwarded to ``initialize_pipeline``.
        version: Pipeline version string forwarded to ``initialize_pipeline``.
        batch_size: Number of copies of each prompt submitted per run; must not
            exceed ``max_batch_size``.
        disable_safety_checker: Only used to build the image file name prefix.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Number of denoising steps per run.
        num_prompts: Maximum number of example prompts to benchmark.
        batch_count: Not used for execution here; echoed into the result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.
        max_batch_size: Forwarded to ``initialize_pipeline``.
        nvtx_profile: Accepted for signature parity; not used in this function.
        use_cuda_graph: Forwarded to ``initialize_pipeline`` and echoed into the result.
        skip_warmup: When true, the warmup callable returns without running inference.

    Returns:
        A dict describing the benchmark configuration, latency statistics and
        the two measured memory values.

    Raises:
        ValueError: If ``batch_size`` is larger than ``max_batch_size``.
    """
    # Validate before any pipeline setup so a bad configuration fails fast,
    # and so the check is not stripped when Python runs with -O.
    if batch_size > max_batch_size:
        raise ValueError(f"batch_size ({batch_size}) must not exceed max_batch_size ({max_batch_size})")

    from demo_utils import initialize_pipeline  # noqa: PLC0415
    from engine_builder import EngineType  # noqa: PLC0415

    pipeline = initialize_pipeline(
        version=version,
        engine_type=EngineType.ORT_TRT,
        work_dir=work_dir,
        height=height,
        width=width,
        use_cuda_graph=use_cuda_graph,
        max_batch_size=max_batch_size,
        opt_batch_size=batch_size,
    )

    pipeline.load_resources(height, width, batch_size)

    def run_sd_xl_inference(prompt, negative_prompt, seed=None):
        return pipeline.run(
            prompt,
            negative_prompt,
            height,
            width,
            denoising_steps=steps,
            guidance=5.0,
            seed=seed,
        )

    def warmup():
        if skip_warmup:
            return
        prompt, negative = warmup_prompts()
        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)

    # Run warm up, and measure GPU memory of two runs.
    # The first run has algo search so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    model_name = pipeline.pipeline_info.name()
    image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, steps, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
        inference_start = time.time()
        # NOTE(review): an earlier comment here referred to a "warmup mode" that avoids
        # saving images; no such flag is passed in this call - confirm against pipeline.run.
        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
        inference_end = time.time()
        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
        for k, image in enumerate(images):
            filename = f"{image_filename_prefix}_{i}_{k}.png"
            image.save(filename)
            print("Image saved to", filename)

    pipeline.teardown()

    from tensorrt import __version__ as trt_version  # noqa: PLC0415

    from onnxruntime import __version__ as ort_version  # noqa: PLC0415

    return {
        "model_name": model_name,
        "engine": "onnxruntime",
        "version": ort_version,
        # Fixed: the label previously had an unbalanced closing parenthesis.
        "provider": f"tensorrt({trt_version})",
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_torch(
    model_name: str,
    batch_size: int,
    disable_safety_checker: bool,
    enable_torch_compile: bool,
    use_xformers: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    skip_warmup: bool = True,
):
    """Benchmark the PyTorch pipeline and return its result record.

    Enables cuDNN (with benchmark mode), disables gradient tracking, loads the
    pipeline via ``get_torch_pipeline`` and delegates the measurement to
    ``run_torch_pipeline``. When ``enable_torch_compile`` is false the
    measurement runs inside ``torch.inference_mode()``; otherwise it runs
    without that context. The returned record is extended with the model name,
    a provider label ("compile", "xformers" or "default"), the safety-checker
    setting, ``directory=None`` and ``enable_cuda_graph=False``.
    """
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.set_grad_enabled(False)

    load_start = time.time()
    pipe = get_torch_pipeline(model_name, disable_safety_checker, enable_torch_compile, use_xformers)
    load_seconds = time.time() - load_start
    print(f"Model loading took {load_seconds} seconds")

    image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, steps, disable_safety_checker)

    def benchmark():
        # Single place for the call that both execution modes share.
        return run_torch_pipeline(
            pipe,
            batch_size,
            image_filename_prefix,
            height,
            width,
            steps,
            num_prompts,
            batch_count,
            start_memory,
            memory_monitor_type,
            skip_warmup=skip_warmup,
        )

    if enable_torch_compile:
        result = benchmark()
    else:
        with torch.inference_mode():
            result = benchmark()

    if enable_torch_compile:
        provider_label = "compile"
    elif use_xformers:
        provider_label = "xformers"
    else:
        provider_label = "default"

    result.update(
        {
            "model_name": model_name,
            "directory": None,
            "provider": provider_label,
            "disable_safety_checker": disable_safety_checker,
            "enable_cuda_graph": False,
        }
    )
    return result
def parse_arguments():
    """Build the benchmark command-line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser()

    # Optional arguments are non-required by default and ``store_true`` flags
    # default to False, so neither is spelled out below.
    parser.add_argument(
        "-e",
        "--engine",
        type=str,
        default="onnxruntime",
        choices=["onnxruntime", "optimum", "torch", "tensorrt"],
        help="Engines to benchmark. Default is onnxruntime.",
    )
    parser.add_argument(
        "-r",
        "--provider",
        type=str,
        default="cuda",
        choices=list(PROVIDERS),
        help="Provider to benchmark. Default is CUDAExecutionProvider.",
    )
    parser.add_argument(
        "-t",
        "--tuning",
        action="store_true",
        help=(
            "Enable TunableOp and tuning. "
            "This will incur longer warmup latency, and is mandatory for some operators of ROCm EP."
        ),
    )
    parser.add_argument(
        "-v",
        "--version",
        type=str,
        choices=list(SD_MODELS),
        default="1.5",
        help="Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.",
    )
    parser.add_argument(
        "-p",
        "--pipeline",
        type=str,
        default=None,
        help="Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.",
    )
    parser.add_argument(
        "-w",
        "--work_dir",
        type=str,
        default=".",
        help="Root directory to save exported onnx models, built engines etc.",
    )

    # Plain on/off switches, registered in the same order as before.
    boolean_switches = (
        ("--enable_safety_checker", "Enable safety checker"),
        ("--enable_torch_compile", "Enable compile unet for PyTorch 2.0"),
        ("--use_xformers", "Use xformers for PyTorch"),
        ("--use_io_binding", "Use I/O Binding for Optimum."),
        ("--skip_warmup", "No warmup."),
    )
    for switch, description in boolean_switches:
        parser.add_argument(switch, action="store_true", help=description)

    parser.add_argument(
        "-b",
        "--batch_size",
        type=int,
        default=1,
        choices=[1, 2, 3, 4, 8, 10, 16, 32],
        help="Number of images per batch. Default is 1.",
    )
    parser.add_argument("--height", type=int, default=512, help="Output image height. Default is 512.")
    parser.add_argument("--width", type=int, default=512, help="Output image width. Default is 512.")
    parser.add_argument("-s", "--steps", type=int, default=50, help="Number of steps. Default is 50.")
    parser.add_argument("-n", "--num_prompts", type=int, default=10, help="Number of prompts. Default is 10.")
    parser.add_argument(
        "-c",
        "--batch_count",
        type=int,
        choices=range(1, 11),
        default=5,
        help="Number of batches to test. Default is 5.",
    )
    parser.add_argument(
        "-m",
        "--max_trt_batch_size",
        type=int,
        choices=range(1, 16),
        default=4,
        help="Maximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.",
    )
    parser.add_argument(
        "-g",
        "--enable_cuda_graph",
        action="store_true",
        help="Enable Cuda Graph. Requires onnxruntime >= 1.16",
    )

    return parser.parse_args()
def print_loaded_libraries(cuda_related_only=True):
    """Print the path of each memory-mapped file of the current process.

    Args:
        cuda_related_only: When true, only paths containing "libcu", "libnv"
            or "tensorrt" are printed; otherwise every mapped path is printed.
    """
    import psutil  # noqa: PLC0415

    markers = ("libcu", "libnv", "tensorrt")
    for mapping in psutil.Process(os.getpid()).memory_maps():
        if cuda_related_only and not any(marker in mapping.path for marker in markers):
            continue
        print(mapping.path)
def main():
    """Run the benchmark selected on the command line and record its result.

    Steps:
      1. Parse arguments and, for the onnxruntime engine, set the environment
         flags and validate the CUDA graph request.
      2. Measure GPU memory before any model is loaded.
      3. Dispatch to the runner matching ``--engine`` / ``--provider`` /
         ``--version``; any combination not matched explicitly falls through
         to the PyTorch runner.
      4. Print the result and append it as one row to ``benchmark_result.csv``
         (the header row is written only when the file is new or empty).
      5. When ``--steps`` is 1, print the loaded libraries for debugging.

    Raises:
        ValueError: If CUDA graph is requested with an unsupported
            configuration or ONNX Runtime version, or if the onnxruntime
            engine needs ``--pipeline`` and it is not an existing directory.
    """
    args = parse_arguments()
    print(args)

    if args.engine == "onnxruntime":
        if args.version in ["2.1"]:
            # Set a flag to avoid overflow in attention, which causes black image output in SD 2.1 model.
            # The environment variables shall be set before the first run of Attention or MultiHeadAttention operator.
            os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"

        from packaging import version  # noqa: PLC0415

        from onnxruntime import __version__ as ort_version  # noqa: PLC0415

        if version.parse(ort_version) == version.parse("1.16.0"):
            # ORT 1.16 has a bug that might trigger Attention RuntimeError when latest fusion script is applied
            # on clip model. The workaround is to enable fused causal attention, or disable Attention fusion
            # for clip model.
            os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"

        if args.enable_cuda_graph:
            if not (args.engine == "onnxruntime" and args.provider in ["cuda", "tensorrt"] and args.pipeline is None):
                raise ValueError("The stable diffusion pipeline does not support CUDA graph.")

            if version.parse(ort_version) < version.parse("1.16"):
                raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later")

    coloredlogs.install(fmt="%(funcName)20s: %(message)s")

    memory_monitor_type = "rocm" if args.provider == "rocm" else "cuda"

    start_memory = measure_gpu_memory(memory_monitor_type, None)
    print("GPU memory used before loading models:", start_memory)

    sd_model = SD_MODELS[args.version]
    provider = PROVIDERS[args.provider]
    disable_safety_checker = not args.enable_safety_checker

    # Keyword arguments accepted by every runner.
    common_options = {
        "batch_size": args.batch_size,
        "height": args.height,
        "width": args.width,
        "steps": args.steps,
        "num_prompts": args.num_prompts,
        "batch_count": args.batch_count,
        "start_memory": start_memory,
        "memory_monitor_type": memory_monitor_type,
        "skip_warmup": args.skip_warmup,
    }
    # Additional keyword arguments shared by the TensorRT-based runners.
    trt_options = {
        **common_options,
        "work_dir": args.work_dir,
        "version": args.version,
        "max_batch_size": args.max_trt_batch_size,
        "nvtx_profile": False,
        "use_cuda_graph": args.enable_cuda_graph,
    }

    if args.engine == "onnxruntime" and args.provider == "tensorrt":
        if "xl" in args.version:
            print("Testing Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.")
            result = run_ort_trt_xl(disable_safety_checker=True, **trt_options)
        else:
            print("Testing Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.")
            result = run_ort_trt_static(disable_safety_checker=disable_safety_checker, **trt_options)
    elif args.engine == "optimum" and provider == "CUDAExecutionProvider":
        if "xl" in args.version:
            os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"

        result = run_optimum_ort(
            model_name=sd_model,
            directory=args.pipeline,
            provider=provider,
            disable_safety_checker=disable_safety_checker,
            use_io_binding=args.use_io_binding,
            **common_options,
        )
    elif args.engine == "onnxruntime":
        # Explicit check instead of assert so it still runs under ``python -O``.
        if not (args.pipeline and os.path.isdir(args.pipeline)):
            raise ValueError("--pipeline should be specified for the directory of ONNX models")
        print(f"Testing diffusers StableDiffusionPipeline with {provider} provider and tuning={args.tuning}")
        result = run_ort(
            model_name=sd_model,
            directory=args.pipeline,
            provider=provider,
            disable_safety_checker=disable_safety_checker,
            tuning=args.tuning,
            **common_options,
        )
    elif args.engine == "tensorrt" and "xl" in args.version:
        print("Testing Txt2ImgXLPipeline with static input shape. Backend is TensorRT.")
        result = run_tensorrt_static_xl(disable_safety_checker=True, **trt_options)
    elif args.engine == "tensorrt":
        print("Testing Txt2ImgPipeline with static input shape. Backend is TensorRT.")
        result = run_tensorrt_static(model_name=sd_model, disable_safety_checker=True, **trt_options)
    else:
        print(
            f"Testing Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile={args.enable_torch_compile}, xformers={args.use_xformers}."
        )
        result = run_torch(
            model_name=sd_model,
            disable_safety_checker=disable_safety_checker,
            enable_torch_compile=args.enable_torch_compile,
            use_xformers=args.use_xformers,
            **common_options,
        )

    print(result)

    column_names = [
        "model_name",
        "directory",
        "engine",
        "version",
        "provider",
        "disable_safety_checker",
        "height",
        "width",
        "steps",
        "batch_size",
        "batch_count",
        "num_prompts",
        "average_latency",
        "median_latency",
        "first_run_memory_MB",
        "second_run_memory_MB",
        "enable_cuda_graph",
    ]
    csv_path = "benchmark_result.csv"
    # The file is opened in append mode, so emit the header only once;
    # otherwise every run would insert another header row between data rows.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, mode="a", newline="") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        if needs_header:
            csv_writer.writeheader()
        csv_writer.writerow(result)

    # Show loaded DLLs when steps == 1 for debugging purpose.
    if args.steps == 1:
        print_loaded_libraries(args.provider in ["cuda", "tensorrt"])
if __name__ == "__main__":
    import traceback

    # Script entry point: any exception raised by main() is printed as a
    # traceback and is not re-raised.
    try:
        main()
    except Exception:
        traceback.print_exc()
|