util.py 8.58 KB
Newer Older
1
2
3
4
5
"""
rdkit/openbabel utility scripts
"""
# __pragma__ ('skip')
from rdkit import Chem
Jacob Durrant's avatar
Jacob Durrant committed
6

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# __pragma__ ('noskip')

"""?
from .gridder.fake_rdkit import Chem
?"""

import math


def get_coords(mol):
    """Returns an array of atom coordinates from an rdkit mol."""
    conf = mol.GetConformer()
    coords = [conf.GetAtomPosition(i) for i in range(conf.GetNumAtoms())]
    return coords


Jacob Durrant's avatar
Jacob Durrant committed
23
def get_atomic_nums(mol):
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
    """Returns an array of atomic numbers from an rdkit mol."""
    return [mol.GetAtomWithIdx(i).GetAtomicNum() for i in range(mol.GetNumAtoms())]


def generate_fragments(mol, max_heavy_atoms, only_single_bonds):
    """Takes an rdkit molecule and returns a list of (parent, fragment) tuples.

    Args:
        mol: The molecule to fragment.
        max_heavy_atoms: The maximum number of heavy atoms to include
            in generated fragments.
        nly_single_bonds: If set to true, this method will only return
            fragments generated by breaking single bonds.

    Returns:
        A list of (parent, fragment) tuples where mol is larger than fragment.
    """

    max_heavy_atoms = 0 if max_heavy_atoms is None else max_heavy_atoms
    only_single_bonds = True if only_single_bonds is None else only_single_bonds

    # list of (parent, fragment) tuples
    splits = []

    # __pragma__ ('skip')
    try:
        # if we have multiple ligands already, split into pieces and then iterate
        ligands = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)

        for i in range(len(ligands)):
            lig = ligands[i]
            other = list(ligands[:i] + ligands[i + 1 :])

            # iterate over bonds
            for i in range(lig.GetNumBonds()):
                # (optional) filter base on single bonds
                if (
                    only_single_bonds
                    and lig.GetBondWithIdx(i).GetBondType()
                    != Chem.rdchem.BondType.SINGLE
                ):
                    continue

                # split the molecule
                split_mol = Chem.rdmolops.FragmentOnBonds(lig, [i])

                # obtain fragments
                fragments = Chem.GetMolFrags(
                    split_mol, asMols=True, sanitizeFrags=False
                )

                # skip if this did not break the molecule into two pieces
                if len(fragments) != 2:
                    continue

                # otherwise make sure the first fragment is larger
                if fragments[0].GetNumAtoms() < fragments[1].GetNumAtoms():
                    fragments = fragments[::-1]

                # make sure the fragment has at least one heavy atom
                if fragments[1].GetNumHeavyAtoms() == 0:
                    continue

                # (optional) filter based on number of heavy atoms in the fragment
                if (
                    max_heavy_atoms > 0
                    and fragments[1].GetNumHeavyAtoms() > max_heavy_atoms
                ):
                    continue

                # if we have other ligands present, merge them with the parent
                parent = fragments[0]

                if len(other) > 0:
                    parent = combine_all([parent] + other)

                # add this pair
                splits.append((parent, fragments[1]))
    except:
        # When running under python, but with fake_rdkit. Don't fragment. Just
        # returm parent.
        splits = [(mol, None)]  # fragment is None
    # __pragma__ ('noskip')

    """?
    splits = [(mol, None)]
    ?"""

    return splits


def load_receptor(rec_path):
    """Loads a receptor from a pdb file and retrieves atomic information.

    Args:
        rec_path: Path to a pdb file.
    """
    rec = Chem.MolFromPDBFile(rec_path, sanitize=False)
    rec = remove_water(rec)
    rec = remove_hydrogens(rec)

    return rec


def remove_hydrogens(m):
Jacob Durrant's avatar
Jacob Durrant committed
129
130
131
132
133
134
135
136
137
    # __pragma__ ('skip')
    for atom in m.GetAtoms():
        atom.SetFormalCharge(0)
    m = Chem.RemoveHs(m)
    # __pragma__ ('noskip')

    """?
    m.atoms = [a for a in m.atoms if a.element != "H"]
    ?"""
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

    return m


def remove_water(m):
    """Removes water molecules from an rdkit mol."""
    # __pragma__ ('skip')
    try:
        parts = Chem.GetMolFrags(m, asMols=True, sanitizeFrags=False)
        valid = [
            k for k in parts if not Chem.MolToSmiles(k, allHsExplicit=True) == "[OH2]"
        ]

        assert len(valid) > 0, "error: molecule contains only water"

        merged = valid[0]
        for part in valid[1:]:
            merged = Chem.CombineMols(merged, part)
    except:
        m.atoms = [
            a for a in m.atoms if a.resname not in ["WAT", "HOH", "TIP", "TIP3", "OH2"]
        ]
        merged = m
    # __pragma__ ('noskip')

    """?
    m.atoms = [
        a for a in m.atoms if a.resname not in ["WAT", "HOH", "TIP", "TIP3", "OH2"]
    ]
    merged = m
    ?"""

    return merged


def combine_all(frags):
    """Combines a list of rdkit mols."""
    if len(frags) == 0:
        return None

    c = frags[0]
    for f in frags[1:]:
        c = Chem.CombineMols(c, f)

    return c


def load_ligand(sdf):
    """Loads a ligand from an sdf file and fragments it.

    Args:
        sdf: Path to sdf file containing a ligand.
    """
    lig = next(Chem.SDMolSupplier(sdf, sanitize=False))
    lig = remove_water(lig)
    lig = remove_hydrogens(lig)

    frags = generate_fragments(lig, None, None)

    return lig, frags


Jacob Durrant's avatar
Jacob Durrant committed
200
def mol_to_points(mol, atom_types, note_sulfur=True):
201
202
    """convert an rdkit mol to an array of coordinates and layers"""

Jacob Durrant's avatar
Jacob Durrant committed
203
204
205
206
207
    atom_types = (
        [6, 7, 8, 16,]  # carbon  # nitrogen  # oxygen  # sulfur
        if atom_types is None
        else atom_types
    )
208
209

    coords = get_coords(mol)
Jacob Durrant's avatar
Jacob Durrant committed
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
    atomic_nums = get_atomic_nums(mol)

    layers = []
    for t in atomic_nums:
        if t == 1:
            # Always ignore hydrogen. Should have already been ignored.
            layers.append(-1)
        elif t == 6:
            # Carbon
            layers.append(0)
        elif t == 7:
            # Nitrogen
            layers.append(1)
        elif t == 8:
            # Oxygen
            layers.append(2)
jdurrant's avatar
jdurrant committed
226
227

        # Below differs depending on protein or not.
Jacob Durrant's avatar
Jacob Durrant committed
228
229
230
        elif not note_sulfur:
            # Not noting sulfur (e.g., ligand), but some other atom.
            layers.append(3)
jdurrant's avatar
jdurrant committed
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
        else:
            # So supposed to note sulfur (protein)
            if t == 16:
                # Noting sulfur (e.g., protein) and sulfur found.
                layers.append(3)
            else:
                # Noting sulfur (e.g., protein) but some other atom.
                layers.append(4)


        # elif not note_sulfur:
        #     # Not noting sulfur (e.g., ligand), but some other atom.
        #     layers.append(3)
        # elif note_sulfur and t == 16:
        #     # Noting sulfur (e.g., protein) and sulfur found.
        #     layers.append(3)
        # elif note_sulfur:
        #     # Noting sulfur (e.g., protein) but some other atom.
        #     layers.append(4)
Jacob Durrant's avatar
Jacob Durrant committed
250
251

    # layers = [(atom_types.index(k) if k in atom_types else -1) for k in types]
252
253
254
255
256
257
258
259
260
261
262
263

    # filter by existing layer
    # coords = coords[layers != -1]
    coords = [c for i, c in enumerate(coords) if layers[i] != -1]
    # layers = layers[layers != -1].reshape(-1, 1)  # JDD NOTE: Like .T
    layers = [l for l in layers if l != -1]

    return coords, layers


def get_connection_point(frag):
    """return the coordinates of the dummy atom as a numpy array [x,y,z]"""
Jacob Durrant's avatar
Jacob Durrant committed
264
    dummy_idx = get_atomic_nums(frag).index(0)
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
    coords = get_coords(frag)[dummy_idx]

    return coords


def frag_dist_to_receptor_raw(coords, frag):
    """compute the minimum distance between the fragment connection point any receptor atom"""
    conn = get_connection_point(frag)
    # rec_coords = np.array(coords)
    # dist = np.sum((rec_coords - conn) ** 2, axis=1)
    # min_dist = np.sqrt(np.min(dist))
    # print(min_dist)
    # return min_dist

    # non-numpy version
    dists = []
    for i in range(len(coords)):
        coord = coords[i]
        tmp = [coord[0] - conn.x, coord[1] - conn.y, coord[2] - conn.z]
        tmp = [tmp[0] ** 2, tmp[1] ** 2, tmp[2] ** 2]
        s = sum(tmp)
        dists.append(s)
    min_dist = math.sqrt(min(dists))
    return min_dist


def mol_array(mol):
    """convert an rdkit mol to an array of coordinates and atom types"""
    coords = get_coords(mol)
Jacob Durrant's avatar
Jacob Durrant committed
294
295
    types = get_atomic_nums(mol)
    # types = np.array(get_atomic_nums(mol)).reshape(-1, 1)
296
297
298
299
300
301
302
303
304
    # arr = np.concatenate([coords, types], axis=1)
    # import pdb; pdb.set_trace()

    arr = []
    for i, coor in enumerate(coords):
        arr.append([coor.x, coor.y, coor.z, types[i]])

    return arr
    # return coords, types