Last active
November 21, 2022 20:01
-
-
Save PatWalters/beb437da364a2c4bf8de724a2039b903 to your computer and use it in GitHub Desktop.
Calculate RDKit descriptors with Dask
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import sys | |
| import pandas as pd | |
| import dask.dataframe as dd | |
| from rdkit import Chem | |
| from rdkit.Chem import Descriptors | |
| import numpy as np | |
| import time | |
| import multiprocessing | |
| # I borrowed a bunch of ideas from https://github.com/rdkit/rdkit/issues/2529 | |
def smi2props(smi):
    """Calculate every descriptor in ``Descriptors.descList`` for one SMILES.

    Args:
        smi: SMILES string describing a single molecule.

    Returns:
        A list with one value per entry of ``Descriptors.descList``, in the
        same order. When ``smi`` is not a string (for example a NaN coming
        from a blank CSV cell) or RDKit cannot parse it, every position holds
        ``None`` so that all rows have the same length.
    """
    # Anything that is not a string cannot be parsed; treat it like an
    # invalid SMILES instead of handing it to MolFromSmiles.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        return [None] * len(Descriptors.descList)
    # descList is a sequence of (name, function) pairs; call each function
    # directly rather than re-filtering the list by name on every molecule.
    return [calc(mol) for _name, calc in Descriptors.descList]
def df_props(df_in):
    """Apply ``smi2props`` to each value of the ``SMILES`` column of ``df_in``.

    Args:
        df_in: DataFrame (or partition of one) that has a ``SMILES`` column.

    Returns:
        The result of ``Series.apply(smi2props)`` on that column: one
        descriptor list per input row.
    """
    smiles_column = df_in["SMILES"]
    return smiles_column.apply(smi2props)
def main():
    """Compute RDKit descriptors for a SMILES file in parallel and write a CSV.

    Command line: ``infile outfile``. ``infile`` is read as a space-separated
    file without a header row; its two columns are named ``SMILES`` and
    ``Name``. ``outfile`` receives those two columns followed by one column
    per descriptor name taken from ``Descriptors.descList``.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} infile.smi outfile.csv")

    start = time.time()
    num_cores = multiprocessing.cpu_count()
    desc_names = [name for name, _calc in Descriptors.descList]

    df = pd.read_csv(sys.argv[1], names=["SMILES", "Name"], sep=" ")

    # One partition per core. The descriptors are computed only here; doing a
    # serial df.SMILES.apply(smi2props) first would repeat all of the work.
    ddf = dd.from_pandas(df, npartitions=num_cores)
    # df_props returns an object Series (a list per row) named after the
    # SMILES column, so describe it to Dask as (name, dtype).
    props = ddf.map_partitions(
        df_props, meta=("SMILES", "object")
    ).compute(scheduler="processes")

    # Expand the per-row lists into descriptor columns and attach them in a
    # single concat instead of inserting a couple of hundred columns in place.
    prop_df = pd.DataFrame(
        props.to_list(), columns=desc_names, index=props.index
    )
    df = pd.concat([df, prop_df], axis=1)

    df.to_csv(sys.argv[2], index=False)
    elapsed = time.time() - start
    print(f"{len(df)} molecules processed in {elapsed:0.2f} sec")


# Keep the entry point behind the guard so that importing this module (as
# process-based workers may do) does not start the pipeline again.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment