Commit d958ceca authored by hgarrereyn's avatar hgarrereyn
Browse files

add moad parsing scripts

parent 53832013
......@@ -140,9 +140,10 @@ LIG_TYPER = {
class FragmentDataset(Dataset):
"""Utility class to work with the packed fragments.h5 format."""
def __init__(self, fragment_file, rec_typer, lig_typer, filter_rec=None,
fdist_min=None, fdist_max=None, fmass_min=None,
fmass_max=None, verbose=False, lazy_loading=True):
def __init__(self, fragment_file, rec_typer=REC_TYPER['simple'],
lig_typer=LIG_TYPER['simple'], filter_rec=None, filter_smi=None,
fdist_min=None, fdist_max=None, fmass_min=None, fmass_max=None,
verbose=False, lazy_loading=True):
"""Initializes the fragment dataset.
Args:
......@@ -168,7 +169,7 @@ class FragmentDataset(Dataset):
self.frag = self._load_fragments(fragment_file, lig_typer)
self.valid_idx = self._get_valid_examples(
filter_rec, fdist_min, fdist_max, fmass_min, fmass_max)
filter_rec, filter_smi, fdist_min, fdist_max, fmass_min, fmass_max, verbose)
def _load_rec(self, fragment_file, rec_typer):
"""Loads receptor information."""
......@@ -216,7 +217,13 @@ class FragmentDataset(Dataset):
frag_smiles = f['frag_smiles'][()]
frag_mass = f['frag_mass'][()]
frag_dist = f['frag_dist'][()]
frag_lig_smi = None
frag_lig_idx = None
if 'frag_lig_smi' in f.keys():
frag_lig_smi = f['frag_lig_smi'][()]
frag_lig_idx = f['frag_lig_idx'][()]
# unpack frag data into separate structures
frag_coords = frag_data[:,:3].astype(np.float32)
frag_types = frag_data[:,3].astype(np.int32)
......@@ -256,6 +263,8 @@ class FragmentDataset(Dataset):
'frag_smiles': frag_smiles, # f_idx -> smiles
'frag_mass': frag_mass, # f_idx -> mass
'frag_dist': frag_dist, # f_idx -> dist
'frag_lig_smi': frag_lig_smi,
'frag_lig_idx': frag_lig_idx,
'frag_loaded': frag_loaded
}
......@@ -263,8 +272,8 @@ class FragmentDataset(Dataset):
return frag
def _get_valid_examples(self, filter_rec, fdist_min, fdist_max, fmass_min,
fmass_max):
def _get_valid_examples(self, filter_rec, filter_smi, fdist_min, fdist_max, fmass_min,
fmass_max, verbose):
"""Returns an array of valid fragment indexes.
"Valid" in this context means the fragment belongs to a receptor in
......@@ -274,11 +283,38 @@ class FragmentDataset(Dataset):
# keep track of valid examples
valid_mask = np.ones(self.frag['frag_lookup'].shape[0]).astype(np.bool)
num_frags = self.frag['frag_lookup'].shape[0]
# filter by receptor id
if filter_rec is not None:
valid_rec = np.vectorize(lambda k: k[0].decode('ascii') in filter_rec)(self.frag['frag_lookup'])
valid_rec = np.zeros(num_frags, dtype=np.bool)
r = range(num_frags)
if verbose:
r = tqdm.tqdm(r, desc='filter rec')
for i in r:
rec = self.frag['frag_lookup'][i][0].decode('ascii')
if rec in filter_rec:
valid_rec[i] = 1
valid_mask *= valid_rec
# filter by ligand smiles string
if filter_smi is not None:
valid_lig = np.zeros(num_frags, dtype=np.bool)
r = range(num_frags)
if verbose:
r = tqdm.tqdm(r, desc='filter lig')
for i in r:
smi = self.frag['frag_lig_smi'][self.frag['frag_lig_idx'][i]]
smi = smi.decode('ascii')
if smi in filter_smi:
valid_lig[i] = 1
valid_mask *= valid_lig
# filter by fragment distance
if fdist_min is not None:
valid_mask[self.frag['frag_dist'] < fdist_min] = 0
......
......@@ -16,7 +16,14 @@ from leadopt.grid_util import get_batch
from leadopt.metrics import mse, bce, tanimoto, cos, top_k_acc,\
average_support, inside_support
from config import partitions
from config import partitions, moad_partitions
def get_bios(p, num_bios=20):
    """Expand pdb ids into their per-biounit receptor names.

    MOAD receptors are stored per biological unit as '<pdbid>_bio<n>'.

    Args:
        p: Iterable of pdb ids.
        num_bios: Number of biounit copies generated per id (default 20,
            matching the original hard-coded behavior).

    Returns:
        Flat list ordered biounit-major: every id for bio0, then every id
        for bio1, and so on. Ids are lower-cased.
    """
    part = []
    for n in range(num_bios):
        part += [x.lower() + '_bio%d' % n for x in p]
    return part
def _do_mean(fn):
......@@ -293,8 +300,10 @@ class VoxelNet(LeadoptModel):
self._args['fragments'],
rec_typer=REC_TYPER[self._args['rec_typer']],
lig_typer=LIG_TYPER[self._args['lig_typer']],
filter_rec=(
partitions.TRAIN if not self._args['no_partitions'] else None),
# filter_rec=(
# partitions.TRAIN if not self._args['no_partitions'] else None),
filter_rec=set(get_bios(moad_partitions.TRAIN)),
filter_smi=set(moad_partitions.TRAIN_SMI),
fdist_min=self._args['fdist_min'],
fdist_max=self._args['fdist_max'],
fmass_min=self._args['fmass_min'],
......@@ -306,8 +315,10 @@ class VoxelNet(LeadoptModel):
self._args['fragments'],
rec_typer=REC_TYPER[self._args['rec_typer']],
lig_typer=LIG_TYPER[self._args['lig_typer']],
filter_rec=(
partitions.VAL if not self._args['no_partitions'] else None),
# filter_rec=(
# partitions.VAL if not self._args['no_partitions'] else None),
filter_rec=set(get_bios(moad_partitions.VAL)),
filter_smi=set(moad_partitions.VAL_SMI),
fdist_min=self._args['fdist_min'],
fdist_max=self._args['fdist_max'],
fmass_min=self._args['fmass_min'],
......@@ -446,7 +457,9 @@ class VoxelNet(LeadoptModel):
self._args['fragments'],
rec_typer=REC_TYPER[self._args['rec_typer']],
lig_typer=LIG_TYPER[self._args['lig_typer']],
filter_rec=partitions.TEST,
# filter_rec=partitions.TEST,
filter_rec=set(get_bios(moad_partitions.TEST)),
filter_smi=set(moad_partitions.TEST_SMI),
fdist_min=self._args['fdist_min'],
fdist_max=self._args['fdist_max'],
fmass_min=self._args['fmass_min'],
......
......@@ -108,6 +108,18 @@ def load_ligand(sdf):
return lig, frags
def load_ligands_pdb(pdb):
    """Load multiple ligands from a pdb file.

    Args:
        pdb: Path to a pdb file containing one or more ligands.

    Returns:
        A tuple of rdkit mols, one per connected fragment.

    Raises:
        ValueError: If rdkit could not parse the pdb file. (Previously this
            surfaced as an opaque TypeError from GetMolFrags on None.)
    """
    lig_mult = Chem.MolFromPDBFile(pdb)
    if lig_mult is None:
        raise ValueError('could not parse ligand pdb: %s' % pdb)
    ligands = Chem.GetMolFrags(lig_mult, asMols=True, sanitizeFrags=True)
    return ligands
def remove_water(m):
"""Removes water molecules from an rdkit mol."""
parts = Chem.GetMolFrags(m, asMols=True, sanitizeFrags=False)
......
# 1. Download MOAD datasets
Go to https://bindingmoad.org/Home/download and download `every_part_a.zip`, `every_part_b.zip`, and the "Binding data" csv (`every.csv`).
This readme assumes these files are stored in `$MOAD_DIR`.
# 2. Unpack MOAD datasets
Run (from inside `$MOAD_DIR`):
```sh
$ unzip every_part_a.zip
...
$ unzip every_part_b.zip
...
```
# 3. Process MOAD pdb files
The MOAD dataset contains ligand/receptor structures combined in a single pdb file (named with a `.bio<x>` extension). In this step, we will separate the receptor and each ligand into individual files.
Run:
```sh
$ cd $MOAD_DIR && mkdir split
$ python3 scripts/split_moad.py \
-d $MOAD_DIR/BindingMOAD_2020 \
-c $MOAD_DIR/every.csv \
-o $MOAD_DIR/split \
-n <number of cores>
```
# 4. Generate packed data files
For training purposes, we pack all of the relevant information into an h5 file so we can load it entirely in memory during training.
This step will produce several similar `.h5` files that can be combined later.
Run:
```sh
$ cd $MOAD_DIR && mkdir packed
$ python3 scripts/process_moad.py \
-d $MOAD_DIR/split \
-c $MOAD_DIR/every.csv \
-o $MOAD_DIR/packed/moad.h5 \
-n <number of cores> \
-s <examples per file (e.g. 500)>
```
# MOAD csv parsing utility
class Family(object):
    """A group of related protein targets from the MOAD class hierarchy."""

    def __init__(self):
        # Protein objects belonging to this family.
        self.targets = []

    def __repr__(self):
        return 'F(%d)' % len(self.targets)
class Protein(object):
    """A single MOAD target, identified by its upper-cased pdb id."""

    def __init__(self, pdb_id):
        self.pdb_id = pdb_id.upper()
        # List of (chain, smi) ligand tuples.
        self.ligands = []

    def __repr__(self):
        return '%s(%d)' % (self.pdb_id, len(self.ligands))
def parse_moad(csv):
    """Parse a MOAD "every.csv" binding-data file.

    The csv encodes a hierarchy by which column is non-empty per row:
    column 0 = protein class, column 1 = family (column 2 holds its first
    target's pdb id), column 2 alone = new target, column 3 = ligand row
    (column 4 is the validity flag, column 9 the smiles string).

    Args:
        csv: Path to the csv file.

    Returns:
        A (families, by_target) tuple: families is a list of Family objects,
        by_target maps an upper-case pdb id to its Protein.
    """
    # Read and close the file promptly (the original leaked the handle).
    with open(csv, 'r') as fh:
        csv_dat = fh.read().strip().split('\n')
    csv_dat = [x.split(',') for x in csv_dat]

    families = []
    curr_f = None
    curr_t = None

    for line in csv_dat:
        if line[0] != '':
            # New protein class: carries no structural info we track.
            continue
        elif line[1] != '':
            # New family: flush the previous target and family first.
            if curr_t is not None:
                curr_f.targets.append(curr_t)
            if curr_f is not None:
                families.append(curr_f)
            curr_f = Family()
            curr_t = Protein(line[2])
        elif line[2] != '':
            # New target within the current family.
            if curr_t is not None:
                curr_f.targets.append(curr_t)
            curr_t = Protein(line[2])
        elif line[3] != '':
            # Ligand row: only keep entries explicitly marked valid.
            if line[4] != 'valid':
                continue
            curr_t.ligands.append((line[3], line[9]))

    # Flush the trailing target/family. Guarded so an empty or header-only
    # file returns empty results instead of raising AttributeError.
    if curr_t is not None:
        curr_f.targets.append(curr_t)
    if curr_f is not None:
        families.append(curr_f)

    by_target = {}
    for f in families:
        for t in f.targets:
            by_target[t.pdb_id] = t

    return families, by_target
'''
Utility script to convert the MOAD dataset into a packed format
'''
import sys
import argparse
import os
import re
import multiprocessing
import threading
from moad_util import parse_moad
import rdkit.Chem.AllChem as Chem
from rdkit.Chem.Descriptors import ExactMolWt
import molvs
import h5py
import tqdm
import numpy as np
# add leadopt to path
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
from leadopt import util
# Data Format
#
# Receptor data:
# - rec_lookup: [id][start][end]
# - rec_coords: [x][y][z]
# - rec_types: [num][is_hacc][is_hdon][is_aro][pcharge]
#
# Fragment data:
# - frag_lookup: [id][fstart][fend][pstart][pend]
# - frag_lig_id: [lig_id]
# - frag_coords: [x][y][z]
# - frag_types: [num]
# - frag_smiles: [smiles]
# - frag_mass: [mass]
# - frag_dist: [dist]

# Seconds multi_load waits for one example to load before reporting timeout.
LOAD_TIMEOUT = 60

# Shared molvs helpers used by neutralize_smiles.
u = molvs.charge.Uncharger()
t = molvs.tautomer.TautomerCanonicalizer()

# (pattern, replacement) SMILES substructure rewrites applied by
# basic_replace — presumably to normalize charged-group representations
# that molvs uncharging leaves behind; TODO confirm intent.
REPLACE = [
    ('C(=O)[O-]', 'C(=O)O'),
    ('N=[N+]=N', 'N=[N+]=[N-]'),
    ('[N+](=O)O', '[N+](=O)[O-]'),
    ('S(O)(O)O', '[S+2](O)(O)O'),
]
def basic_replace(sm):
    """Apply the fixed REPLACE substructure rewrites to a SMILES string.

    Args:
        sm: Input SMILES string (parsed without sanitization).

    Returns:
        The rewritten canonical SMILES string.
    """
    mol = Chem.MolFromSmiles(sm, False)
    for pattern, replacement in REPLACE:
        mol = Chem.ReplaceSubstructs(
            mol,
            Chem.MolFromSmiles(pattern, sanitize=False),
            Chem.MolFromSmiles(replacement, sanitize=False),
            replaceAll=True
        )[0]
    return Chem.MolToSmiles(mol)
def neutralize_smiles(sm):
    """Return a neutralized, canonicalized form of a SMILES string.

    Uncharges the molecule, canonicalizes its tautomer, applies the
    basic_replace rewrites, then attempts a final molvs standardization.

    Args:
        sm: Input SMILES string.

    Returns:
        The standardized SMILES, or the pre-standardization SMILES if
        molvs standardization fails (best-effort fallback).
    """
    m = Chem.MolFromSmiles(sm)
    m = u.uncharge(m)
    m = t.canonicalize(m)
    sm = Chem.MolToSmiles(m)
    sm = basic_replace(sm)
    try:
        return molvs.standardize_smiles(sm)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # propagate; standardization failures fall back to the input.
        print(sm)
        return sm
def load_example(base, rec_id, target):
    """Load a receptor and its fragmented ligands for one MOAD target.

    Args:
        base: Directory containing the split pdb files.
        rec_id: Receptor id; '<rec_id>_rec.pdb' must exist in base.
        target: Protein-like object whose .ligands holds (name, smiles)
            pairs.

    Returns:
        (rec_coords, rec_types, fragments, lig_smiles) where fragments is a
        list of (frag_data, parent_data, smiles, mass, dist, lig_off)
        tuples and lig_off indexes into lig_smiles.
    """
    rec_path = os.path.join(base, '%s_rec.pdb' % rec_id)

    # Load receptor data.
    rec_coords, rec_types = util.load_receptor_ob(rec_path)

    # (frag_data, parent_data, smiles, mass, dist, lig_off)
    fragments = []
    # (smi)
    lig_smiles = []

    lig_off = 0
    for lig in target.ligands:
        # Ligand files are named '<rec_id>_<name>.pdb' with spaces -> '_'.
        lig_path = os.path.join(base, '%s_%s.pdb' % (rec_id, lig[0].replace(' ','_')))

        try:
            lig_mol = Chem.MolFromPDBFile(lig_path, True)
        except Exception:
            # Unparseable ligand file: skip it. Narrowed from a bare except
            # so KeyboardInterrupt/SystemExit are not swallowed.
            continue
        if lig_mol is None:
            continue

        # lig_off stays aligned with lig_smiles because both are only
        # advanced once the ligand successfully parses.
        lig_smi = lig[1]
        lig_smiles.append(lig_smi)

        # Transfer bond orders from the reference smiles onto the pdb mol.
        ref = Chem.MolFromSmiles(lig_smi)
        lig_fixed = Chem.AssignBondOrdersFromTemplate(ref, lig_mol)

        splits = util.generate_fragments(lig_fixed)

        for parent, frag in splits:
            frag_data = util.mol_array(frag)
            parent_data = util.mol_array(parent)

            frag_smi = Chem.MolToSmiles(
                frag,
                isomericSmiles=False,
                kekuleSmiles=False,
                canonical=True,
                allHsExplicit=False
            )
            frag_smi = neutralize_smiles(frag_smi)

            # UpdatePropertyCache is needed before computing the exact mass.
            frag.UpdatePropertyCache(strict=False)
            mass = ExactMolWt(frag)

            dist = util.frag_dist_to_receptor_raw(rec_coords, frag)

            fragments.append((frag_data, parent_data, frag_smi, mass, dist, lig_off))

        lig_off += 1

    return (rec_coords, rec_types, fragments, lig_smiles)
def do_thread(out, args):
    """Thread target: run load_example(*args) and store the result in out[0].

    Args:
        out: Single-element list used as an output slot.
        args: Argument tuple forwarded to load_example.
    """
    try:
        out[0] = load_example(*args)
    except Exception:
        # Best-effort: a failed example is reported as None to the caller.
        # Narrowed from a bare except so interrupts still propagate.
        out[0] = None
def multi_load(packed):
    """Run load_example in a worker thread, bounded by LOAD_TIMEOUT.

    Args:
        packed: Argument tuple forwarded to load_example (packed[1] is the
            receptor id used in the timeout message).

    Returns:
        A (packed, result) tuple; result is None on failure or timeout.
    """
    result = [None]
    # Renamed from 't' to avoid shadowing the module-level tautomer helper.
    worker = threading.Thread(target=do_thread, args=(result, packed))
    worker.start()
    worker.join(timeout=LOAD_TIMEOUT)
    if worker.is_alive():
        print('timeout', packed[1])
    return (packed, result[0])
def process(work, processed, moad_csv, out_path='moad.h5', num_cores=1):
    '''Process MOAD data and save to a packed format.
    Loads each work item in a worker pool, accumulates receptor and
    fragment arrays, then writes one packed .h5 file.
    Args:
    - work: list of argument tuples forwarded to multi_load; each w's w[1]
      is the receptor id used for lookups and log messages.
    - processed: not referenced in this function — kept for interface
      compatibility (TODO confirm).
    - moad_csv: not referenced in this function (TODO confirm).
    - out_path: where to save the .h5 packed data
    - num_cores: number of worker processes.
    '''
    # Accumulators for the packed arrays (stacked/concatenated at the end).
    rec_lookup = [] # (id, start, end)
    rec_coords = [] # (x,y,z)
    rec_types = [] # (num, aro, hdon, hacc, pcharge)
    frag_lookup = [] # (id, f_start, f_end, p_start, p_end)
    frag_lig_idx = [] # (lig_idx)
    frag_lig_smi = [] # (lig_smi)
    frag_data = [] # (x,y,z,type)
    frag_smiles = [] # (frag_smi)
    frag_mass = [] # (mass)
    frag_dist = [] # (dist)
    # Data pointers: running row offsets into the receptor and fragment
    # coordinate arrays, used to build the lookup index ranges.
    rec_i = 0
    frag_i = 0
    # Multiprocess.
    with multiprocessing.Pool(num_cores) as p:
        with tqdm.tqdm(total=len(work)) as pbar:
            for w, res in p.imap_unordered(multi_load, work):
                pbar.update()
                if res == None:
                    print('[!] Failed: %s' % w[1])
                    continue
                rcoords, rtypes, fragments, ex_lig_smiles = res
                if len(fragments) == 0:
                    print('Empty', w[1])
                    continue
                rec_id = w[1]
                # Add receptor info.
                rec_start = rec_i
                rec_end = rec_i + rcoords.shape[0]
                rec_i += rcoords.shape[0]
                rec_coords.append(rcoords)
                rec_types.append(rtypes)
                rec_lookup.append((rec_id.encode('ascii'), rec_start, rec_end))
                # Offset of this example's ligand smiles in the global list.
                lig_idx = len(frag_lig_smi)
                # Add fragment info. Fragment and parent coordinates are
                # interleaved in frag_data; the lookup stores both spans.
                for fdat, pdat, frag_smi, mass, dist, lig_off in fragments:
                    frag_start = frag_i
                    frag_end = frag_i + fdat.shape[0]
                    frag_i += fdat.shape[0]
                    parent_start = frag_i
                    parent_end = frag_i + pdat.shape[0]
                    frag_i += pdat.shape[0]
                    frag_data.append(fdat)
                    frag_data.append(pdat)
                    frag_lookup.append((rec_id.encode('ascii'), frag_start, frag_end, parent_start, parent_end))
                    frag_lig_idx.append(lig_idx+lig_off)
                    frag_smiles.append(frag_smi)
                    frag_mass.append(mass)
                    frag_dist.append(dist)
                # Add ligand smiles.
                frag_lig_smi += ex_lig_smiles
    # Convert to numpy format.
    print('Convert numpy...', flush=True)
    n_rec_lookup = np.array(rec_lookup, dtype='<S16,<i4,<i4')
    n_rec_coords = np.concatenate(rec_coords, axis=0).astype(np.float32)
    n_rec_types = np.concatenate(rec_types, axis=0).astype(np.int32)
    n_frag_data = np.concatenate(frag_data, axis=0)
    n_frag_lookup = np.array(frag_lookup, dtype='<S16,<i4,<i4,<i4,<i4')
    n_frag_lig_idx = np.array(frag_lig_idx)
    n_frag_lig_smi = np.array(frag_lig_smi, dtype='<S')
    n_frag_smiles = np.array(frag_smiles, dtype='<S')
    n_frag_mass = np.array(frag_mass)
    n_frag_dist = np.array(frag_dist)
    # Save.
    with h5py.File(os.path.join(out_path), 'w') as f:
        f['rec_lookup'] = n_rec_lookup
        f['rec_coords'] = n_rec_coords
        f['rec_types'] = n_rec_types
        f['frag_data'] = n_frag_data
        f['frag_lookup'] = n_frag_lookup
        f['frag_lig_idx'] = n_frag_lig_idx
        f['frag_lig_smi'] = n_frag_lig_smi
        f['frag_smiles'] = n_frag_smiles
        f['frag_mass'] = n_frag_mass
        f['frag_dist'] = n_frag_dist
    print('Done!')
def build_multiple(processed, moad_csv, out_path='moad.h5', num_cores=1, size=5000):
    """Pack processed MOAD data into multiple .h5 files of `size` targets.

    Args:
        processed: Folder of split receptor/ligand pdb files.
        moad_csv: Path to the MOAD "every.csv" file.
        out_path: Base output path; '_<split>' is substituted before '.h5'.
        num_cores: Worker processes used per split.
        size: Number of targets per output file.
    """
    # Load MOAD metadata (only the by-target index is needed here).
    _, moad_targets = parse_moad(moad_csv)

    # Collect receptor files, skipping hidden entries.
    entries = [x for x in os.listdir(processed) if not x.startswith('.')]
    rec = [x for x in entries if '_rec' in x]
    print('[*] Found %d receptors' % len(rec))

    # Build the work queue of (base_dir, rec_id, target) tuples.
    work = []
    for rec_name in sorted(rec):
        rec_id = rec_name.split('_rec')[0]
        pdb_id = rec_id.split('_')[0].upper()
        if pdb_id not in moad_targets:
            print('missing pdb', pdb_id)
            continue
        work.append((processed, rec_id, moad_targets[pdb_id]))

    # Emit one packed file per size-target chunk.
    for split, start in enumerate(range(0, len(work), size)):
        print('Building split %d' % split)
        out_name = out_path.replace('.h5', '_%d.h5' % split)
        process(work[start:start + size], processed, moad_csv, out_name, num_cores)

    print('Done all!')
def main():
    """Command-line entry point: parse arguments and build the datasets."""
    ap = argparse.ArgumentParser()
    ap.add_argument('-d', '--datasets', required=True, help='Path to MOAD /processed folder')
    ap.add_argument('-c', '--csv', required=True, help='Path to MOAD "every.csv"')
    ap.add_argument('-o', '--output', default='fragments.h5', help='Output file path (.h5)')
    ap.add_argument('-n', '--num_cores', default=1, type=int, help='Number of cores')
    ap.add_argument('-s', '--size', default=500, type=int, help='Number of targets per output dataset')
    opts = ap.parse_args()
    build_multiple(opts.datasets, opts.csv, opts.output, opts.num_cores, opts.size)


if __name__ == '__main__':
    main()
import prody
import tqdm
import numpy as np
import argparse
import os
import multiprocessing
from moad_util import parse_moad
prody.utilities.logger.LOGGER.verbosity = 'none'
def load_example(path, target):
m = prody.parsePDB(path)
rec = m.select('not (nucleic or hetatm) and not water')
# (lig_name, atoms)