Lack of speed improvement when using custom GPT model with ORT
Hey guys! In my investigation to try to figure out why there is a speed regression for #56, I created a simple minimal script to benchmark ORT vs. no ORT.
With the script I’m seeing basically the same time with and without ORT. Any ideas on what is causing the performance issue? I’m also seeing a few warnings, which I’ve included below!
No ORT: Time taken 85.2842013835907 seconds
ORT: Time taken 85.33545899391174 seconds
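In case it helps narrow things down, here is a per-step timing variant I can also run. It's a rough sketch that reuses model, train_loader, optimizer and device from the script below and assumes a CUDA device; it should separate GPU compute time per step from data loading and other host-side work:

# Sketch: time only forward/backward/step of each iteration with CUDA events.
# Reuses `model`, `train_loader`, `optimizer`, `device` from the benchmark script below.
import torch
from torch.cuda.amp import autocast
from torch.nn import functional as F

step_ms = []
for idx, targets in train_loader:
    idx, targets = idx.to(device), targets.to(device)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    optimizer.zero_grad()
    with autocast():
        logits = model(idx)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
    loss.backward()
    optimizer.step()
    end.record()
    torch.cuda.synchronize()
    step_ms.append(start.elapsed_time(end))  # elapsed_time returns milliseconds

print("median step time (ms):", sorted(step_ms)[len(step_ms) // 2])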
Warnings:
/usr/local/lib/python3.6/dist-packages/onnxruntime/training/ortmodule/_logger.py:52: UserWarning: There were one or more warnings or errors raised while exporting the PyTorch model. Please enable INFO level logging to view all warnings and errors.
"model. Please enable INFO level logging to view all warnings and errors.", UserWarning)
Warning: Unsupported operator ATenOp. No schema registered for this operator.
Warning: Unsupported operator ATenOp. No schema registered for this operator.
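The first warning suggests enabling INFO-level logging to see the full export log. A minimal way to do that, assuming the DebugOptions/LogLevel API exposed by onnxruntime-training is available in the installed version, would be to pass debug options when wrapping the model:

# Sketch: surface the full ORTModule export log (API may differ by onnxruntime-training version).
from onnxruntime.training.ortmodule import DebugOptions, LogLevel
from torch_ort import ORTModule

model = ORTModule(model, DebugOptions(log_level=LogLevel.INFO))

With that in place, the export log should give more detail about which operators triggered the warnings.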
Script:
import math
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch_ort import ORTModule
class GPTConfig:
    """ base GPT config, params common to all GPT versions """
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    def __init__(self, vocab_size, block_size, **kwargs):
        self.vocab_size = vocab_size
        self.block_size = block_size
        for k, v in kwargs.items():
            setattr(self, k, v)
class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    I believe I could have just used torch.nn.MultiheadAttention but their documentation
    is all but absent and code ugly so I don't trust it, rolling my own here.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.resid_drop(self.proj(y))
        return y
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
class GPT(torch.nn.Module):
    def __init__(self, vocab_size, n_embd, block_size, embd_pdrop, n_layer, config):
        # input embedding stem
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))
        self.drop = nn.Dropout(embd_pdrop)
        self.config = config
        # decoder head
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)
        self.block_size = block_size
        blocks = []
        for x in range(n_layer):
            layer = Block(self.config)
            blocks.append(layer)
        self.blocks = nn.Sequential(*blocks)

    def forward(self, idx):
        b, t = idx.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."
        # forward the GPT model
        token_embeddings = self.tok_emb(idx)  # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :]  # each position maps to a (learnable) vector
        x = self.drop(token_embeddings + position_embeddings)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits
class CharDataset(Dataset):
    def __init__(self, data, block_size):
        chars = list(set(data))
        data_size, vocab_size = len(data), len(chars)
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        return math.ceil(len(self.data) / (self.block_size + 1))

    def __getitem__(self, idx):
        # we're actually going to "cheat" and pick a spot in the dataset at random
        i = np.random.randint(0, len(self.data) - (self.block_size + 1))
        chunk = self.data[i:i + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y
if __name__ == '__main__':
    n_embd = 2048
    block_size = 128
    n_layer = 6
    batch_size = 8
    num_workers = 0
    n_head = 16
    n_warmup = 20
    enable_ort = True
    device = torch.device("cuda:0")

    if not os.path.exists("input.txt"):
        os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    file = 'input.txt'
    text = open(file, 'r').read()
    train_dataset = CharDataset(text, block_size)  # one line of poem is roughly 50 characters
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)
    vocab_size = train_dataset.vocab_size

    model = GPT(
        vocab_size=vocab_size,
        n_embd=n_embd,
        embd_pdrop=0.1,
        block_size=block_size,
        n_layer=n_layer,
        config=GPTConfig(
            vocab_size=vocab_size,
            block_size=block_size,
            n_layer=n_layer,
            n_head=n_head,
            n_embd=n_embd,
        )
    )

    if enable_ort:
        model = ORTModule(model)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    torch.cuda.synchronize()

    # warmup before measuring
    for x, (idx, targets) in tqdm(enumerate(train_loader), total=len(train_loader)):
        if x == n_warmup:
            break
        idx = idx.to(device)
        targets = targets.to(device)
        with autocast():
            logits = model(idx)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

    torch.cuda.synchronize()
    start_time = time.time()
    for idx, targets in tqdm(train_loader, total=len(train_loader)):
        idx = idx.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()  # clear gradients from the previous step
        with autocast():
            logits = model(idx)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
    torch.cuda.synchronize()
    print("Time taken", time.time() - start_time)
Top GitHub Comments
Can confirm that the above also works for Lightning, thanks so much @ashbhandare!!
For your reference, this is partial code, with changes from yours, that I used to get the above numbers: