Spatial gradient speed test (to update the implementation)
```python
import torch
import random
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import timeit

def benchmark_conv2d_cat(input, device, sh):
    # one depthwise conv2d per gradient direction, results concatenated on a new dim
    # note: `kernel` is a module-level global set in the driver loop below
    torch.cuda.synchronize()
    b, c, h, w = input.size()
    spatial_pad = [kernel.size(1) // 2,
                   kernel.size(1) // 2,
                   kernel.size(2) // 2,
                   kernel.size(2) // 2]
    inp = F.pad(input, spatial_pad, 'replicate')
    out_channels: int = 3 if kernel.size(0) == 3 else 2
    x = torch.cat([F.conv2d(inp, kernel[i:i + 1].expand(c, -1, -1, -1),
                            groups=c).unsqueeze(2) for i in range(out_channels)],
                  dim=2)
    # loss = x.sum()
    # loss.backward()
    torch.cuda.synchronize()
    return
def benchmark_conv3d_reshape(input, device, sh):
    # flatten batch and channel dims, then a single conv3d with a depth-1 kernel
    b1, c1, h1, w1 = input.size()
    torch.cuda.synchronize()
    kernel1: torch.Tensor = kernel.unsqueeze(1).unsqueeze(1)
    # convolve input tensor with the sobel kernel
    kernel_flip: torch.Tensor = kernel1.flip(-3)
    # pad spatial dims with 'replicate'
    spatial_pad = [kernel.size(1) // 2,
                   kernel.size(1) // 2,
                   kernel.size(2) // 2,
                   kernel.size(2) // 2]
    out_channels: int = 3 if kernel.size(0) == 3 else 2
    padded_inp: torch.Tensor = F.pad(input.reshape(b1 * c1, 1, h1, w1), spatial_pad, 'replicate')[:, :, None]
    x = F.conv3d(padded_inp, kernel_flip, padding=0).view(b1, c1, out_channels, h1, w1)
    # loss = x.sum()
    # loss.backward()
    torch.cuda.synchronize()
    return
def benchmark_conv3d_groups(input, device, sh):
    # keep the batch layout and use a grouped conv3d (one group per input channel)
    b1, c1, h1, w1 = input.size()
    torch.cuda.synchronize()
    kernel1: torch.Tensor = kernel.unsqueeze(1).unsqueeze(1)
    # convolve input tensor with the sobel kernel
    kernel_flip: torch.Tensor = kernel1.flip(-3)
    # pad spatial dims with 'replicate'
    spatial_pad = [kernel.size(1) // 2,
                   kernel.size(1) // 2,
                   kernel.size(2) // 2,
                   kernel.size(2) // 2]
    out_channels: int = 3 if kernel.size(0) == 3 else 2
    padded_inp: torch.Tensor = F.pad(input, spatial_pad, 'replicate')[:, :, None]
    x = F.conv3d(padded_inp, kernel_flip.repeat(c1, 1, 1, 1, 1),
                 groups=c1,
                 padding=0).view(b1, c1, out_channels, h1, w1)
    # loss = x.sum()
    # loss.backward()
    torch.cuda.synchronize()
    return
# For square kernels
b, c, h, w = 4, 8, 512, 512
sh = [b, c, h, w]
for ks in [3, 5]:
    for device in [torch.device('cuda:0'), torch.device('cpu')]:
        data = torch.rand(b, c, h, w, device=device).float()
        b1, c1, h1, w1 = data.size()
        # kernel = kernel.to(device)
        # warmup
        # benchmark_conv3d_groups(data, device, sh)
        kernel = torch.rand(ks // 2 + 1, ks, ks, device=device, requires_grad=False).float()
        benchmark_conv3d_reshape(data, device, sh)
        benchmark_conv2d_cat(data, device, sh)
        benchmark_conv3d_groups(data, device, sh)
        torch.cuda.synchronize()
        print('benchmark_conv3d_groups', str(device))
        %timeit benchmark_conv3d_groups(data, device, sh)
        # re-create the kernel with requires_grad=True (hard-coded 3x3 here, regardless of ks);
        # used for the backward-pass runs when loss.backward() is uncommented
        kernel = torch.rand(2, 3, 3, device=device, requires_grad=True).float()
        torch.cuda.synchronize()
        print(ks, 'benchmark_conv3d_reshape', str(device))
        kernel = torch.rand(2, 3, 3, device=device, requires_grad=True).float()
        %timeit benchmark_conv3d_reshape(data, device, sh)
        kernel = torch.rand(2, 3, 3, device=device, requires_grad=True).float()
        print(ks, 'benchmark_conv2d_cat', str(device))
        %timeit benchmark_conv2d_cat(data, device, sh)
        print(" ")
```
Without the backward pass, conv2d is faster, but not dramatically (values are mean ± std. dev. of 7 `%timeit` runs):

| ks | device | benchmark_conv3d_groups | benchmark_conv3d_reshape | benchmark_conv2d_cat |
|---:|:-------|:------------------------|:-------------------------|:---------------------|
| 3  | cuda:0 | 3.91 ms ± 5.24 µs | 3.93 ms ± 1.53 µs | 3.83 ms ± 13 µs |
| 3  | cpu    | 105 ms ± 3.65 ms  | 130 ms ± 112 µs   | 70.1 ms ± 81.6 µs |
| 5  | cuda:0 | 6.73 ms ± 24.5 µs | 3.98 ms ± 1.24 µs | 3.86 ms ± 653 ns |
| 5  | cpu    | 266 ms ± 789 µs   | 130 ms ± 536 µs   | 70.3 ms ± 118 µs |
With the backward pass included, conv2d is much faster than conv3d, especially on GPU (mean ± std. dev. of 7 `%timeit` runs):

| ks | device | benchmark_conv3d_groups | benchmark_conv3d_reshape | benchmark_conv2d_cat |
|---:|:-------|:------------------------|:-------------------------|:---------------------|
| 3  | cuda:0 | 84 ms ± 392 µs   | 81.9 ms ± 234 µs | 10.2 ms ± 36.8 µs |
| 3  | cpu    | 162 ms ± 3.02 ms | 160 ms ± 774 µs  | 104 ms ± 452 µs |
| 5  | cuda:0 | 87.1 ms ± 234 µs | 82.4 ms ± 129 µs | 10.2 ms ± 20.5 µs |
| 5  | cpu    | 346 ms ± 5.67 ms | 158 ms ± 342 µs  | 104 ms ± 859 µs |
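The backward-pass numbers were presumably obtained by uncommenting the `loss = x.sum()` / `loss.backward()` lines and timing with a `requires_grad=True` kernel; a minimal sketch of that variant for the conv2d path (the function name and the hard-coded CUDA device are illustrative, not from the original script):

```python
# Sketch of the backward-pass variant (assumes a CUDA device is available);
# mirrors benchmark_conv2d_cat with the commented-out backward enabled.
import torch
import torch.nn.functional as F

def benchmark_conv2d_cat_backward(input: torch.Tensor, kernel: torch.Tensor) -> None:
    b, c, h, w = input.size()
    pad = [kernel.size(2) // 2] * 2 + [kernel.size(1) // 2] * 2
    inp = F.pad(input, pad, 'replicate')
    x = torch.cat([F.conv2d(inp, kernel[i:i + 1].expand(c, -1, -1, -1),
                            groups=c).unsqueeze(2) for i in range(kernel.size(0))],
                  dim=2)
    x.sum().backward()                      # include the backward pass in the timing
    if input.is_cuda:
        torch.cuda.synchronize()

device = torch.device('cuda:0')
data = torch.rand(4, 8, 512, 512, device=device)
kern = torch.rand(2, 3, 3, device=device, requires_grad=True)  # grad-tracked kernel
# %timeit benchmark_conv2d_cat_backward(data, kern)
```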
PyTorch 1.4.0, GTX 1080 Ti, 12 GB.

P.S. The current implementation corresponds to `benchmark_conv3d_reshape`.
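Given these numbers, a conv2d-based replacement for the spatial gradient could look roughly like the sketch below. It follows the `benchmark_conv2d_cat` approach above; the function name and signature are illustrative, not the library's actual API.

```python
# A minimal sketch (not the library's actual code) of a conv2d-based spatial gradient.
import torch
import torch.nn.functional as F

def spatial_gradient_conv2d(input: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """input: (B, C, H, W); kernel: (out_channels, kH, kW), e.g. a (2, 3, 3) Sobel pair.
    Returns: (B, C, out_channels, H, W)."""
    b, c, h, w = input.shape
    oc, kh, kw = kernel.shape
    pad = [kw // 2, kw // 2, kh // 2, kh // 2]          # F.pad pads the last dim first
    inp = F.pad(input, pad, 'replicate')
    # one depthwise conv2d per gradient direction (dx, dy, ...), stacked on dim=2
    return torch.stack([F.conv2d(inp, kernel[i:i + 1].expand(c, -1, -1, -1), groups=c)
                        for i in range(oc)], dim=2)

# Usage: dx/dy of a random image batch with a Sobel kernel pair
sobel = torch.tensor([[[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]],
                      [[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]])
grads = spatial_gradient_conv2d(torch.rand(4, 8, 64, 64), sobel)
print(grads.shape)  # torch.Size([4, 8, 2, 64, 64])
```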
Top GitHub comments:

- I think not.
- @ducha-aiki is this relevant anymore?