InMemoryDataset Slicing: Data Splitting
See original GitHub issue

🐛 Describe the bug
Hi, I’m trying to split an InMemoryDataset loaded from a single file into three subsets, training/validation/test set, by slicing the loaded dataset. This works well when using the DataLoader but when applying operations on the sub-datasets, e.g., standardizing the target values “y”, the slicing is ignored. I have tried different options for a standardization example (and also found some strange behavior, see 4. Option). The correct example requires slicing the dataset repeatedly before every operation is applied to the sliced dataset. Is this behavior intended or am I doing something wrong?
Note also: In the example, transform does not change data.y to only include the target values. Rather, data.y still contains all y target values. With the DataLoader it works well again.
Thank you very much for your help!
import os.path as osp
from copy import deepcopy
import torch
from torch_geometric.datasets import QM9
from torch_geometric.loader import DataLoader
TARGET = 0  # column index of `data.y` that is kept as the regression target
BATCH_SIZE = 4  # batch size handed to every DataLoader below


class MyTransform:
    """Callable transform that reduces ``data.y`` to the ``TARGET`` column."""

    def __call__(self, data):
        """Replace ``data.y`` with its ``TARGET`` column and return ``data``.

        The passed object is modified in place and the same object is
        returned. ``data.y`` must support two-dimensional indexing of the
        form ``y[:, TARGET]``.
        """
        # Specify target.
        data.y = data.y[:, TARGET]
        return data
#########################################################
### 1. Option ###
# Split by plain slicing, then standardize y separately through each subset's `.data` attribute.
# Load data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform()) # dataset still contains all y targets not only the target value specified in MyTransform
# Calculate mean and std for testing
mean_all = dataset.data.y.mean(dim=0, keepdim=True)[:,TARGET]
std_all = dataset.data.y.std(dim=0, keepdim=True)[:,TARGET]
# Split dataset directly by slicing
train_set = dataset[:110000]
val_set = dataset[110000:120000]
test_set = dataset[120000:]
# Calculate mean and std for training data
mean_train = train_set.data.y.mean(dim=0, keepdim=True)[:,TARGET] # calculates mean of whole dataset not only train_set
std_train = train_set.data.y.std(dim=0, keepdim=True)[:,TARGET] # calculates std of whole dataset not only train_set
print(mean_train == mean_all) # True
print(std_train == std_all) # True
# Standardize data -> does not work because train/val/test set point to same dataset object, standardization executed 3x to same dataset object
train_set.data.y = (train_set.data.y - mean_train) / std_train
val_set.data.y = (val_set.data.y - mean_train) / std_train
test_set.data.y = (test_set.data.y - mean_train) / std_train
# correct would be following, but may be confusing: dataset.data.y = (dataset.data.y - mean_train) / std_train
# Load dataset
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
# Print only the first training batch to inspect the resulting y values.
for data in train_loader:
print(data.y) # [-3.7468, -3.2685, -3.2022, -3.7468] -> data.y is normalized (with wrong calculations)
break
#########################################################
### 2. Option ###
# Give every subset its own deep copy of the dataset before indexing, then standardize y per subset.
# Load data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform()) # dataset still contains all y targets not only the target value specified in MyTransform
# Calculate mean and std for testing
mean_all = dataset.data.y.mean(dim=0, keepdim=True)[:,TARGET]
std_all = dataset.data.y.std(dim=0, keepdim=True)[:,TARGET]
# Split dataset by indexing a separate deep copy with an index list for each subset
idxs = list(range(len(dataset)))
train_set = deepcopy(dataset)[idxs[:110000]]
val_set = deepcopy(dataset)[idxs[110000:120000]]
test_set = deepcopy(dataset)[idxs[120000:]]
# Calculate mean and std for training data
mean_train = train_set.data.y.mean(dim=0, keepdim=True)[:,TARGET] # calculates mean of whole dataset not only train_set
std_train = train_set.data.y.std(dim=0, keepdim=True)[:,TARGET] # calculates std of whole dataset not only train_set
print(mean_train == mean_all) # True
print(std_train == std_all) # True
# Standardize data -> works correctly now (still points to dataset but now different objects of the dataset) but mean used is still wrong (see Dataset preprocessing)
train_set.data.y = (train_set.data.y - mean_train) / std_train
val_set.data.y = (val_set.data.y - mean_train) / std_train
test_set.data.y = (test_set.data.y - mean_train) / std_train
# Load dataset
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
# Print only the first training batch to inspect the resulting y values.
for data in train_loader:
print(data.y) # [-1.7778, -0.6966, -0.5466, -1.7778] -> data.y is normalized (with wrong calculations)
break
#########################################################
### 3. Option ###
# Build the subsets with dataset.copy(idx), then standardize y per subset.
# Load data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform())
# Calculate mean and std for testing
mean_all = dataset.data.y.mean(dim=0, keepdim=True)[:,TARGET]
std_all = dataset.data.y.std(dim=0, keepdim=True)[:,TARGET]
# Split dataset with copy and slicing
idxs = list(range(len(dataset)))
train_set = dataset.copy(idxs[:110000])
val_set = dataset.copy(idxs[110000:120000])
test_set = dataset.copy(idxs[120000:])
# Calculate mean and std for training data
# NOTE(review): unlike the other options, no [:,TARGET] selection is applied here and the results are named mean/std.
mean = train_set.data.y.mean(dim=0, keepdim=True) # calculates mean of whole dataset not only train_set
std = train_set.data.y.std(dim=0, keepdim=True) # calculates std of whole dataset not only train_set
# NOTE(review): mean_train/std_train are not assigned in this option (it defines mean/std), so these
# prints compare whatever values an earlier option left behind -- confirm whether mean/std were intended.
print(mean_train == mean_all) # True
print(std_train == std_all) # True
# Standardize data -> works correctly now (still points to dataset but now different objects of the dataset) but mean used is still wrong (see Dataset preprocessing)
train_set.data.y = (train_set.data.y - mean) / std
val_set.data.y = (val_set.data.y - mean) / std
test_set.data.y = (test_set.data.y - mean) / std
# Load dataset
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
# Print only the first training batch to inspect the resulting y values.
for data in train_loader:
print(data.y) # [0.0000, 1.6256, 1.8511, 0.0000] -> data.y is not normalized
break
#########################################################
### 4. Option (exactly matches 1. Option with two additional lines for printing -> print(train_set[0].y)) ###
# Same steps as 1. Option; the only difference is the two print(train_set[0].y) calls around the standardization.
# Load data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform()) # dataset still contains all y targets not only the target value specified in MyTransform
# calculate mean and std for testing
mean_all = dataset.data.y.mean(dim=0, keepdim=True)[:,TARGET]
std_all = dataset.data.y.std(dim=0, keepdim=True)[:,TARGET]
# Split dataset directly by slicing
train_set = dataset[:110000]
val_set = dataset[110000:120000]
test_set = dataset[120000:]
# Calculate mean and std for training data
mean_train = train_set.data.y.mean(dim=0, keepdim=True)[:,TARGET] # calculates mean of whole dataset not only train_set
std_train = train_set.data.y.std(dim=0, keepdim=True)[:,TARGET] # calculates std of whole dataset not only train_set
print(mean_train == mean_all) # True
print(std_train == std_all) # True
# Standardize data -> does not work because train/val/test set point to same dataset object, standardization executed 3x to same dataset object
print(train_set[0].y) # [0]
train_set.data.y = (train_set.data.y - mean_train) / std_train
val_set.data.y = (val_set.data.y - mean_train) / std_train
test_set.data.y = (test_set.data.y - mean_train) / std_train
print(train_set[0].y) # [0]
# correct would be following, but may be confusing: dataset.data.y = (dataset.data.y - mean_train) / std_train
# Load dataset
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
# Print only the first training batch to inspect the resulting y values.
for data in train_loader:
print(data.y) # [ 0.0000, -3.2685, -3.2022, -3.7468] -> data.y is not normalized -> why does it produce different results when only have two additional print statements?
break
#########################################################
### Correct results ###
# Compute the training statistics from the first 110000 entries of dataset.data.y and standardize dataset.data.y once.
# Load data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
dataset = QM9(path, transform=MyTransform()) # dataset still contains all y targets not only the target value specified in MyTransform
# Calculate mean and std for testing
mean_all = dataset.data.y.mean(dim=0, keepdim=True)[:,TARGET]
std_all = dataset.data.y.std(dim=0, keepdim=True)[:,TARGET]
# Split dataset directly by slicing
train_set = dataset[:110000]
val_set = dataset[110000:120000]
test_set = dataset[120000:]
# Calculate mean and std for training data (slices dataset.data.y itself instead of going through train_set.data)
mean_train = dataset.data.y[:110000].mean(dim=0, keepdim=True)[:,TARGET]
std_train = dataset.data.y[:110000].std(dim=0, keepdim=True)[:,TARGET]
print(mean_train == mean_all) # False
print(std_train == std_all) # False
# Standardize data -> done once on dataset.data.y instead of once per subset (train/val/test set point to same dataset object)
dataset.data.y = (dataset.data.y - mean_train) / std_train
# Load dataset
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
# Print only the first training batch to inspect the resulting y values.
for data in train_loader:
print(data.y) # [-1.7866, -0.6881, -0.5358, -1.7866] -> data.y is normalized
break
Environment
- PyG version: 2.0.3
- PyTorch version: 1.9.0+cu102
- OS: CentOS Linux 7 (Core)
- Python version: 3.7.11
- CUDA/cuDNN version: 10.2
- How you installed PyTorch and PyG (`conda`, `pip`, source): source
- Any other relevant information (e.g., version of `torch-scatter`): torch-cluster 1.5.9, torch-scatter 2.0.9, torch-sparse 0.6.12, torch-spline-conv 1.2.1
Issue Analytics
- State:
- Created 2 years ago
- Comments:6 (3 by maintainers)
Top GitHub Comments
Thanks for re-opening. This should be also fixed now, see https://github.com/pyg-team/pytorch_geometric/commit/2cb5ad2ee0d05e8828cf62c39908e8224a164de0.
Woho, you are absolutely right. This is on me, sorry. Fixed in master, see https://github.com/pyg-team/pytorch_geometric/commit/d478dcba608542e2398f64ce21b350f76aefb538.