Lack of speed improvement when using custom GPT model with ORT
Hey guys! In my investigation to try to figure out why there is a speed regression for #56, I created a simple minimal script to benchmark ORT vs. no ORT.
With the script I’m seeing basically the same time with and without ORT. Any ideas on what is causing the performance issue? I’m also seeing a few warnings, which I’ve included below!
No ORT: Time taken 85.2842013835907 seconds
ORT: Time taken 85.33545899391174 seconds
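In case it helps narrow things down, here is a per-step timing variant I can also run. It's a rough sketch that reuses model, train_loader, optimizer and device from the script below and assumes a CUDA device; it should separate GPU compute time per step from data loading and other host-side work:

# Sketch: time only forward/backward/step of each iteration with CUDA events.
# Reuses `model`, `train_loader`, `optimizer`, `device` from the benchmark script below.
import torch
from torch.cuda.amp import autocast
from torch.nn import functional as F

step_ms = []
for idx, targets in train_loader:
    idx, targets = idx.to(device), targets.to(device)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    optimizer.zero_grad()
    with autocast():
        logits = model(idx)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
    loss.backward()
    optimizer.step()
    end.record()
    torch.cuda.synchronize()
    step_ms.append(start.elapsed_time(end))  # elapsed_time returns milliseconds

print("median step time (ms):", sorted(step_ms)[len(step_ms) // 2])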
Warnings:
/usr/local/lib/python3.6/dist-packages/onnxruntime/training/ortmodule/_logger.py:52: UserWarning: There were one or more warnings or errors raised while exporting the PyTorch model. Please enable INFO level logging to view all warnings and errors.
"model. Please enable INFO level logging to view all warnings and errors.", UserWarning)
Warning: Unsupported operator ATenOp. No schema registered for this operator.
Warning: Unsupported operator ATenOp. No schema registered for this operator.
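The first warning suggests enabling INFO-level logging to see the full export log. A minimal way to do that, assuming the DebugOptions/LogLevel API exposed by onnxruntime-training is available in the installed version, would be to pass debug options when wrapping the model:

# Sketch: surface the full ORTModule export log (API may differ by onnxruntime-training version).
from onnxruntime.training.ortmodule import DebugOptions, LogLevel
from torch_ort import ORTModule

model = ORTModule(model, DebugOptions(log_level=LogLevel.INFO))

With that in place, the export log should give more detail about which operators triggered the warnings.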
Script:
import math
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch_ort import ORTModule
class GPTConfig:
    """ base GPT config, params common to all GPT versions """
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    def __init__(self, vocab_size, block_size, **kwargs):
        self.vocab_size = vocab_size
        self.block_size = block_size
        for k, v in kwargs.items():
            setattr(self, k, v)
class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    I believe I could have just used torch.nn.MultiheadAttention but their documentation
    is all but absent and code ugly so I don't trust it, rolling my own here.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.resid_drop(self.proj(y))
        return y
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
class GPT(torch.nn.Module):
    def __init__(self, vocab_size, n_embd, block_size, embd_pdrop, n_layer, config):
        # input embedding stem
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))
        self.drop = nn.Dropout(embd_pdrop)
        self.config = config
        # decoder head
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)
        self.block_size = block_size
        blocks = []
        for x in range(n_layer):
            layer = Block(self.config)
            blocks.append(layer)
        self.blocks = nn.Sequential(*blocks)

    def forward(self, idx):
        b, t = idx.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."
        # forward the GPT model
        token_embeddings = self.tok_emb(idx)  # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :]  # each position maps to a (learnable) vector
        x = self.drop(token_embeddings + position_embeddings)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits
class CharDataset(Dataset):
    def __init__(self, data, block_size):
        chars = list(set(data))
        data_size, vocab_size = len(data), len(chars)
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        return math.ceil(len(self.data) / (self.block_size + 1))

    def __getitem__(self, idx):
        # we're actually going to "cheat" and pick a spot in the dataset at random
        i = np.random.randint(0, len(self.data) - (self.block_size + 1))
        chunk = self.data[i:i + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y
if __name__ == '__main__':
    n_embd = 2048
    block_size = 128
    n_layer = 6
    batch_size = 8
    num_workers = 0
    n_head = 16
    n_warmup = 20
    enable_ort = True
    device = torch.device("cuda:0")

    if not os.path.exists("input.txt"):
        os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    file = 'input.txt'
    text = open(file, 'r').read()
    train_dataset = CharDataset(text, block_size)  # one line of poem is roughly 50 characters
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)
    vocab_size = train_dataset.vocab_size

    model = GPT(
        vocab_size=vocab_size,
        n_embd=n_embd,
        embd_pdrop=0.1,
        block_size=block_size,
        n_layer=n_layer,
        config=GPTConfig(
            vocab_size=vocab_size,
            block_size=block_size,
            n_layer=n_layer,
            n_head=n_head,
            n_embd=n_embd,
        )
    )

    if enable_ort:
        model = ORTModule(model)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    torch.cuda.synchronize()

    # warmup before measuring
    for x, (idx, targets) in tqdm(enumerate(train_loader), total=len(train_loader)):
        if x == n_warmup:
            break
        idx = idx.to(device)
        targets = targets.to(device)
        with autocast():
            logits = model(idx)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

    torch.cuda.synchronize()
    start_time = time.time()
    for idx, targets in tqdm(train_loader, total=len(train_loader)):
        idx = idx.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()  # clear gradients from the previous step
        with autocast():
            logits = model(idx)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
    torch.cuda.synchronize()
    print("Time taken", time.time() - start_time)
Top GitHub Comments
Can confirm that the above also works for Lightning, thanks so much @ashbhandare!!
For your reference, this is partial code, with changes from yours, that I used to get the above numbers: