import os
import sys
import time

import torch
import numpy as np
import deepspeed
from transformers import AutoTokenizer, GPTJForCausalLM
from transformers.models.gptj.modeling_gptj import GPTJBlock

sys.path.append('/root/mono/services/model-servers')

# Get this process's GPU rank and world size from the torch.distributed /
# DeepSpeed launcher environment.
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
print(f"***************** Creating model in RANK ({local_rank}) "
      f"with WORLD_SIZE = {world_size} *****************")

# Load the fp16 revision of GPT-J-6B to halve download size and host memory use.
# (torch_dtype is a model argument, not a tokenizer argument, so it is only
# passed to from_pretrained on the model.)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B",
                                          revision='float16')
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B",
                                        revision='float16',
                                        torch_dtype=torch.float16)

inp_tokens = tokenizer("DeepSpeed is", return_tensors="pt")

# Wrap the model with DeepSpeed-Inference. The injection policy names the
# output projections in each GPTJBlock ('attn.out_proj' and 'mlp.fc_out') so
# DeepSpeed knows where to place the tensor-parallel all-reduce boundaries;
# fused kernel injection is disabled here.
model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    dtype=torch.half,
    injection_policy={GPTJBlock: ('attn.out_proj', 'mlp.fc_out')},
    replace_with_kernel_inject=False,
)

# Move the input tensors to this rank's GPU.
for token in inp_tokens:
    if torch.is_tensor(inp_tokens[token]):
        inp_tokens[token] = inp_tokens[token].to(f'cuda:{local_rank}')

# A single .to() is enough; .cuda().to(...) would move the model twice.
model.to(f'cuda:{local_rank}')

# Warm-up / sanity-check generation.
string = tokenizer.batch_decode(model.generate(**inp_tokens, min_length=50))[0]
print(string)

# Benchmark: time 10 greedy generations, synchronizing around each call so the
# wall-clock interval covers all queued GPU work.
latencies = []
for _ in range(10):
    torch.cuda.synchronize()
    t0 = time.time()
    output_sequences = model.generate(
        **inp_tokens,
        do_sample=False,
        max_length=50,
    )
    torch.cuda.synchronize()
    latencies.append(time.time() - t0)
print(latencies)
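
# Optional addition, not part of the original benchmark: summarize the raw
# latency list with mean/percentile figures using the numpy import above.
latencies_ms = np.array(latencies) * 1000.0
print(f"mean latency: {latencies_ms.mean():.1f} ms, "
      f"p50: {np.percentile(latencies_ms, 50):.1f} ms, "
      f"p90: {np.percentile(latencies_ms, 90):.1f} ms")

# LOCAL_RANK and WORLD_SIZE are set by the DeepSpeed launcher, so run this
# script through it, e.g. (the script filename here is illustrative):
#   deepspeed --num_gpus 2 gptj_inference.py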