2222import torch
2323
2424from nemo .deploy .nlp .megatronllm_deployable import MegatronLLMDeployable
25- from tests .infer_data_path import get_infer_test_data
2625
2726run_export_tests = True
2827try :
@@ -353,88 +352,6 @@ def test_cpp_runtime(
353352 print ("" )
354353
355354
def run_existing_checkpoints(
    model_name,
    n_gpus,
    tp_size=None,
    pp_size=None,
    ptuning=False,
    lora=False,
    streaming=False,
    run_accuracy=False,
    test_deployment=False,
    stop_words_list=None,
    test_data_path=None,
    backend="tensorrt-llm",
    save_engine=False,
):
    """Run inference for a model registered in the known-checkpoint test table.

    Looks up ``model_name`` in ``get_infer_test_data()`` and dispatches to either
    the in-framework or the TensorRT-LLM inference runner with that model's
    recorded settings.

    Args:
        model_name: Key into the test-data table returned by ``get_infer_test_data()``.
        n_gpus: Number of GPUs requested for this run.
        tp_size / pp_size: Tensor-/pipeline-parallel sizes forwarded to the TRT-LLM runner.
        ptuning / lora: Enable p-tuning / LoRA; requires the matching checkpoint
            path to be defined in the test-data entry.
        streaming, run_accuracy, test_deployment, stop_words_list, test_data_path,
        save_engine: Forwarded to the TRT-LLM runner unchanged.
        backend: "in-framework" for the in-framework runner; anything else uses
            the TensorRT-LLM runner.

    Returns:
        The tuple returned by the selected runner, or ``(None,) * 5`` when the
        test is skipped because not enough GPUs are available.

    Raises:
        Exception: If the model is unknown, or a required p-tuning/LoRA
            checkpoint path is missing from the test-data entry.
    """
    # Skip (don't fail) when the machine simply lacks the requested GPUs.
    if n_gpus > torch.cuda.device_count():
        print("Skipping the test due to not enough number of GPUs")
        return None, None, None, None, None

    test_data = get_infer_test_data()
    if model_name not in test_data:
        raise Exception("Model {0} is not supported.".format(model_name))

    model_info = test_data[model_name]

    if n_gpus < model_info.min_gpus:
        # Bug fix: report the model's actual minimum, not the caller-supplied
        # n_gpus (the old message echoed the value that was already too small).
        print("Min n_gpus for this model is {0}".format(model_info.min_gpus))
        return None, None, None, None, None

    if ptuning and model_info.p_tuning_checkpoint is None:
        raise Exception("There is not ptuning checkpoint path defined.")

    if lora and model_info.lora_checkpoint is None:
        raise Exception("There is not lora checkpoint path defined.")

    # Gemma models share input/output embeddings; other model types do not.
    if model_info.model_type == "gemma":
        print("*********************")
        use_embedding_sharing = True
    else:
        use_embedding_sharing = False

    if backend == "in-framework":
        return run_in_framework_inference(
            model_name=model_name,
            prompt=model_info.prompt_template,
            checkpoint_path=model_info.checkpoint,
            max_batch_size=model_info.max_batch_size,
            max_input_len=None,
            max_output_len=model_info.max_output_len,
        )
    else:
        return run_trt_llm_inference(
            model_name=model_name,
            model_type=model_info.model_type,
            prompt=model_info.prompt_template,
            checkpoint_path=model_info.checkpoint,
            trt_llm_model_dir=model_info.trt_llm_model_dir,
            n_gpu=n_gpus,
            max_batch_size=model_info.max_batch_size,
            use_embedding_sharing=use_embedding_sharing,
            max_input_len=512,
            max_output_len=model_info.max_output_len,
            max_num_tokens=None,
            ptuning=ptuning,
            p_tuning_checkpoint=model_info.p_tuning_checkpoint,
            lora=lora,
            lora_checkpoint=model_info.lora_checkpoint,
            tp_size=tp_size,
            pp_size=pp_size,
            # Greedy decoding so accuracy checks are deterministic.
            top_k=1,
            top_p=0.0,
            temperature=1.0,
            run_accuracy=run_accuracy,
            debug=True,
            streaming=streaming,
            stop_words_list=stop_words_list,
            test_deployment=test_deployment,
            test_data_path=test_data_path,
            save_engine=save_engine,
        )
438355def get_args ():
439356 parser = argparse .ArgumentParser (
440357 formatter_class = argparse .ArgumentDefaultsHelpFormatter ,
@@ -445,11 +362,6 @@ def get_args():
445362 type = str ,
446363 required = True ,
447364 )
448- parser .add_argument (
449- "--existing_test_models" ,
450- default = False ,
451- action = 'store_true' ,
452- )
453365 parser .add_argument (
454366 "--model_type" ,
455367 type = str ,
@@ -602,75 +514,52 @@ def run_inference_tests(args):
602514
603515 result_dic = {}
604516
605- if args . existing_test_models :
606- n_gpus = args .min_gpus
607- if args .max_gpus is None :
608- args .max_gpus = args .min_gpus
517+ prompt_template = [ "The capital of France is" , "Largest animal in the sea is" ]
518+ n_gpus = args .min_gpus
519+ if args .max_gpus is None :
520+ args .max_gpus = args .min_gpus
609521
610- while n_gpus <= args .max_gpus :
611- result_dic [n_gpus ] = run_existing_checkpoints (
522+ while n_gpus <= args .max_gpus :
523+ if args .backend .lower () == "tensorrt-llm" :
524+ result_dic [n_gpus ] = run_trt_llm_inference (
612525 model_name = args .model_name ,
613- n_gpus = n_gpus ,
526+ model_type = args .model_type ,
527+ prompt = prompt_template ,
528+ checkpoint_path = args .checkpoint_dir ,
529+ trt_llm_model_dir = args .trt_llm_model_dir ,
530+ n_gpu = n_gpus ,
531+ max_batch_size = args .max_batch_size ,
532+ max_input_len = args .max_input_len ,
533+ max_output_len = args .max_output_len ,
534+ max_num_tokens = args .max_num_tokens ,
614535 ptuning = args .ptuning ,
536+ p_tuning_checkpoint = args .p_tuning_checkpoint ,
615537 lora = args .lora ,
538+ lora_checkpoint = args .lora_checkpoint ,
616539 tp_size = args .tp_size ,
617540 pp_size = args .pp_size ,
541+ top_k = args .top_k ,
542+ top_p = args .top_p ,
543+ temperature = args .temperature ,
544+ run_accuracy = args .run_accuracy ,
545+ debug = args .debug ,
618546 streaming = args .streaming ,
619547 test_deployment = args .test_deployment ,
620- run_accuracy = args .run_accuracy ,
621548 test_data_path = args .test_data_path ,
622- backend = args .backend .lower (),
623549 save_engine = args .save_engine ,
624550 )
551+ else :
552+ result_dic [n_gpus ] = run_in_framework_inference (
553+ model_name = args .model_name ,
554+ prompt = prompt_template ,
555+ checkpoint_path = args .checkpoint_dir ,
556+ n_gpu = n_gpus ,
557+ max_batch_size = args .max_batch_size ,
558+ max_input_len = args .max_input_len ,
559+ max_output_len = args .max_output_len ,
560+ )
625561
626- n_gpus = n_gpus * 2
627- else :
628- prompt_template = ["The capital of France is" , "Largest animal in the sea is" ]
629- n_gpus = args .min_gpus
630- if args .max_gpus is None :
631- args .max_gpus = args .min_gpus
632-
633- while n_gpus <= args .max_gpus :
634- if args .backend .lower () == "tensorrt-llm" :
635- result_dic [n_gpus ] = run_trt_llm_inference (
636- model_name = args .model_name ,
637- model_type = args .model_type ,
638- prompt = prompt_template ,
639- checkpoint_path = args .checkpoint_dir ,
640- trt_llm_model_dir = args .trt_llm_model_dir ,
641- n_gpu = n_gpus ,
642- max_batch_size = args .max_batch_size ,
643- max_input_len = args .max_input_len ,
644- max_output_len = args .max_output_len ,
645- max_num_tokens = args .max_num_tokens ,
646- ptuning = args .ptuning ,
647- p_tuning_checkpoint = args .p_tuning_checkpoint ,
648- lora = args .lora ,
649- lora_checkpoint = args .lora_checkpoint ,
650- tp_size = args .tp_size ,
651- pp_size = args .pp_size ,
652- top_k = args .top_k ,
653- top_p = args .top_p ,
654- temperature = args .temperature ,
655- run_accuracy = args .run_accuracy ,
656- debug = args .debug ,
657- streaming = args .streaming ,
658- test_deployment = args .test_deployment ,
659- test_data_path = args .test_data_path ,
660- save_engine = args .save_engine ,
661- )
662- else :
663- result_dic [n_gpus ] = run_in_framework_inference (
664- model_name = args .model_name ,
665- prompt = prompt_template ,
666- checkpoint_path = args .checkpoint_dir ,
667- n_gpu = n_gpus ,
668- max_batch_size = args .max_batch_size ,
669- max_input_len = args .max_input_len ,
670- max_output_len = args .max_output_len ,
671- )
672-
673- n_gpus = n_gpus * 2
562+ n_gpus = n_gpus * 2
674563
675564 test_result = "PASS"
676565 print_separator = False
0 commit comments