Skip to content

Instantly share code, notes, and snippets.

@rnaimehaom
Forked from PatWalters/parallel_descriptors.py
Created November 21, 2022 20:01
Show Gist options
  • Select an option

  • Save rnaimehaom/cf457acb74ff67fb9740b77963132aba to your computer and use it in GitHub Desktop.

Select an option

Save rnaimehaom/cf457acb74ff67fb9740b77963132aba to your computer and use it in GitHub Desktop.
Calculate RDKit descriptors with Dask
#!/usr/bin/env python
import sys
import pandas as pd
import dask.dataframe as dd
from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np
import time
import multiprocessing
# I borrowed a bunch of ideas from https://github.com/rdkit/rdkit/issues/2529
def smi2props(smi):
    """Compute every RDKit descriptor for one SMILES string.

    ``Descriptors.descList`` is iterated as ``(name, function)`` pairs and
    each function is called on the parsed molecule.

    Args:
        smi: SMILES string describing the molecule.

    Returns:
        A list with one value per entry of ``Descriptors.descList``, in the
        same order.  If ``smi`` is not a string or cannot be parsed, a list
        of ``None`` of that same length is returned so every row keeps the
        same width.
    """
    # Non-string input (e.g. the NaN pandas produces for a blank field) is
    # treated as unparseable instead of being handed to RDKit.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        return [None] * len(Descriptors.descList)
    # Iterate the descriptor table directly; no per-call name list or
    # membership filter is needed because every entry is used.
    return [calc(mol) for _, calc in Descriptors.descList]
def df_props(df_in):
    """Run ``smi2props`` on each value of the ``SMILES`` column of ``df_in``.

    Args:
        df_in: DataFrame (here, one Dask partition) with a ``SMILES`` column.

    Returns:
        A Series holding, for every row, the list returned by ``smi2props``.
    """
    smiles_column = df_in.SMILES
    return smiles_column.apply(smi2props)
def main(argv):
    """Read a SMILES file, compute RDKit descriptors in parallel, write a CSV.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the space-separated
            input file with SMILES and Name columns (no header) and
            ``argv[2]`` is the CSV file to write.
    """
    if len(argv) != 3:
        sys.exit("usage: parallel_descriptors.py <input.smi> <output.csv>")

    start = time.time()
    num_cores = multiprocessing.cpu_count()
    desc_names = [name for name, _ in Descriptors.descList]
    df = pd.read_csv(argv[1], names=["SMILES", "Name"], sep=" ")

    # The descriptors are computed once, in parallel.  Computing them
    # serially first would double the run time and ship the bulky result
    # column to every worker process for nothing.
    ddf = dd.from_pandas(df, npartitions=num_cores)
    # df_props returns the SMILES column mapped to Python lists, so the
    # result is described as an object Series named "SMILES".
    df["prop"] = ddf.map_partitions(
        df_props, meta=("SMILES", "object")
    ).compute(scheduler="processes")

    # Expand the per-row descriptor lists into one column per descriptor.
    df[desc_names] = df.prop.to_list()
    df.drop("prop", axis=1, inplace=True)
    df.to_csv(argv[2], index=False)

    elapsed = time.time() - start
    print(f"{len(df)} molecules processed in {elapsed:0.2f} sec")


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment