Stuck on an issue?

Lightrun Answers was designed to reduce the constant googling that comes with debugging 3rd party libraries. It collects links to all the places you might be looking at while hunting down a tough bug.

And, if you’re still stuck at the end, we’re happy to hop on a call to see how we can help out.

data dimension was changed and peculiar value added

See original GitHub issue

🐛 Describe the bug

My code is as follows:

class TOX21(InMemoryDataset):
    """Tox21 conformer dataset with spherical-harmonic attributes for an SEGNN.

    Every sample is built from one pickled conformer record (exposing
    ``rdmol``, ``pos`` and ``edge_index``) paired with one row of labels read
    from the Tox21 CSV. Records are assigned to train/val/test by position.

    Args:
        root: Dataset root directory, forwarded to ``InMemoryDataset``.
        partition: Split to load; one of ``"train"``, ``"val"`` or ``"test"``.
        lmax_attr: Maximum spherical-harmonic degree used for the edge and
            node attributes.
        transform: Forwarded to ``InMemoryDataset``.
        pre_transform: Forwarded to ``InMemoryDataset``.
    """

    # Positional split boundaries: [0, _TRAIN_END) is train,
    # [_TRAIN_END, _VALID_END) is validation and the remainder is test.
    _TRAIN_END = 11704
    _VALID_END = 11999

    # Element symbol -> column of the one-hot atom-type feature.
    _ATOM_TYPES = {
        'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4, 'P': 5, 'S': 6, 'Ag': 7,
        'Al': 8, 'As': 9, 'Au': 10, 'B': 11, 'Ba': 12, 'Be': 13, 'Bi': 14,
        'Br': 15, 'Ca': 16, 'Cd': 17, 'Cl': 18, 'Co': 19, 'Cr': 20, 'Cu': 21,
        'Dy': 22, 'Eu': 23, 'Fe': 24, 'Gd': 25, 'Ge': 26, 'Hg': 27, 'I': 28,
        'In': 29, 'K': 30, 'Li': 31, 'Mg': 32, 'Mn': 33, 'Mo': 34, 'Na': 35,
        'Nd': 36, 'Ni': 37, 'Pb': 38, 'Pd': 39, 'Pt': 40, 'Ru': 41, 'Sb': 42,
        'Sc': 43, 'Se': 44, 'Si': 45, 'Sn': 46, 'Ti': 47, 'Tl': 48, 'V': 49,
        'Yb': 50, 'Zn': 51, 'Zr': 52,
    }

    def __init__(self, root, partition, lmax_attr, transform=None, pre_transform=None):
        assert partition in ["train", "test", "val"], (
            "partition must be 'train', 'val' or 'test'")
        self.root = root
        self.partition = partition
        self.lmax_attr = lmax_attr
        self.attr_irreps = Irreps.spherical_harmonics(lmax_attr)

        # The attributes above must exist before the base constructor runs,
        # because processed_file_names and process() read them.
        super().__init__(root, transform, pre_transform)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        """Return the raw inputs: the conformer pickle, then the label CSV."""
        return ["/system/user/publicwork/confdata.pkl","/system/user/publicwork/PracticalAI/tox21.csv"]

    @property
    def processed_file_names(self) -> List[str]:
        """Return the processed file name, unique per partition and lmax."""
        return [f"{self.partition}_l={self.lmax_attr}.pt"]

    def download(self):
        """Do nothing; the raw files are expected to exist already."""
        pass

    def _load_targets(self):
        """Read the label columns of the CSV and encode them as a float tensor.

        Missing labels (NaN) become 0, negatives (0) become -1, and every
        other value (including 1) is kept unchanged.
        """
        tox21 = pd.read_csv(self.raw_paths[1])
        # NOTE(review): every column from index 7 onward is treated as a label.
        # If the CSV carries an extra leading column the offset must be
        # adjusted, otherwise a non-label column ends up in y.
        raw = tox21.iloc[:, 7:].values
        encoded = np.where(np.isnan(raw), 0, np.where(raw == 0, -1, raw))
        return torch.tensor(encoded, dtype=torch.float)

    def process(self):
        """Build the ``Data`` objects of ``self.partition`` and save them.

        Raises:
            ImportError: If rdkit is not installed.
        """
        try:
            from rdkit import RDLogger
            from rdkit.Chem.rdchem import HybridizationType
        except ImportError as err:
            # Fail here with a clear message instead of returning silently and
            # letting the later torch.load hit a missing processed file.
            raise ImportError("Please install rdkit") from err
        RDLogger.DisableLog('rdApp.*')

        labels = self._load_targets()

        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(self.raw_paths[0], 'rb') as f:
            conf = pickle.load(f)

        # Keys match the partition names accepted by __init__; the validation
        # split is requested as "val".
        splits = {
            "train": slice(0, self._TRAIN_END),
            "val": slice(self._TRAIN_END, self._VALID_END),
            "test": slice(self._VALID_END, None),
        }
        chosen = splits[self.partition]
        records = list(conf.items())[chosen]

        # Kept from the original code: seeds NumPy's global RNG, although no
        # permutation is currently drawn.
        np.random.seed(0)

        atom_types = self._ATOM_TYPES
        data_list = []
        for i, ((name, confdata), label) in enumerate(zip(records, labels[chosen])):
            mol = confdata.rdmol
            num_atoms = mol.GetNumAtoms()
            pos = confdata.pos
            edge_index = confdata.edge_index

            type_idx = []
            atomic_number = []
            aromatic = []
            sp = []
            sp2 = []
            sp3 = []
            for atom in mol.GetAtoms():
                type_idx.append(atom_types[atom.GetSymbol()])
                atomic_number.append(atom.GetAtomicNum())
                aromatic.append(1 if atom.GetIsAromatic() else 0)
                hybridization = atom.GetHybridization()
                sp.append(1 if hybridization == HybridizationType.SP else 0)
                sp2.append(1 if hybridization == HybridizationType.SP2 else 0)
                sp3.append(1 if hybridization == HybridizationType.SP3 else 0)

            # For every atom, count the hydrogens among the source nodes of
            # its incoming edges.
            z = torch.tensor(atomic_number, dtype=torch.long)
            row, col = edge_index
            hs = (z == 1).to(torch.float)
            num_hs = scatter(hs[row], col, dim_size=num_atoms).tolist()

            # Node features: one-hot atom type followed by six scalar features.
            x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(atom_types))
            x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs],
                              dtype=torch.float).t().contiguous()
            x = torch.cat([x1.to(torch.float), x2], dim=-1)
            # Leading dimension of 1 so each sample contributes one label row.
            y = label.unsqueeze(0)

            edge_attr, node_attr, edge_dist = self.get_O3_attr(edge_index, pos, self.attr_irreps)

            data = Data(x=x, pos=pos, edge_index=edge_index, edge_attr=edge_attr,
                        node_attr=node_attr, additional_message_features=edge_dist, y=y, name=name, index=i)
            data_list.append(data)

        torch.save(self.collate(data_list), self.processed_paths[0])

    def get_O3_attr(self, edge_index, pos, attr_irreps):
        """Create spherical-harmonic edge and node attributes for the SEGNN.

        Args:
            edge_index: Edge indices; row 0 and row 1 index into ``pos``.
            pos: Node positions.
            attr_irreps: Irreps of the spherical harmonics to evaluate.

        Returns:
            Tuple ``(edge_attr, node_attr, edge_dist)``: the spherical
            harmonics of each edge's relative position, their mean over the
            edges grouped by ``edge_index[1]``, and the squared length of each
            relative position.
        """
        rel_pos = pos[edge_index[0]] - pos[edge_index[1]]  # pos_j - pos_i (note in edge_index stores tuples like (j,i))
        edge_dist = rel_pos.pow(2).sum(-1, keepdims=True)
        # Evaluated on normalised direction vectors, 'component' normalisation.
        edge_attr = spherical_harmonics(attr_irreps, rel_pos, normalize=True,
                                        normalization='component')
        # dim_size pins one row per node, so nodes without incoming edges
        # (including the highest-index ones) still get a zero row.
        node_attr = scatter(edge_attr, edge_index[1], dim=0,
                            dim_size=pos.size(0), reduce="mean")
        return edge_attr, node_attr, edge_dist

I found that after loading this dataset, the dimension of the target y changed from 12 to 13: one peculiar value was automatically added to the y data, which is strange, because I never intended to do so.

The first value, 2, is the peculiar value that was added, and I am not sure whether the same happened to the other samples.

If I remove the unsqueeze(0) in y = target.unsqueeze(0), I get "IndexError: index 13 is out of bounds for dimension 0 with size 13". What could be the problem?

Environment

PyG version: conda forge
PyTorch version: 1.9
OS: Linux
Python version: 3.9
CUDA/cuDNN version: 1.0
How you installed PyTorch and PyG (conda, pip, source):
Any other relevant information (e.g., version of torch-scatter):

Issue Analytics

State:
Created 2 years ago
Comments:9 (4 by maintainers)

Top GitHub Comments

1reaction

CaiYitaocommented, Mar 20, 2022

Ah, yeah, I loaded a different file when testing; that tox21.csv had been changed. Sorry about this! Thank you for your hard work making this masterpiece app!
I wish Julia community also can have similar one 😉

0reactions

rusty1scommented, Mar 20, 2022

Then you need to do target = tox21.iloc[:, 8:].values 😃