Today I needed to retrieve the SMILES for a list of ChEMBL identifiers.
I had to refresh my memory on how to retrieve data from ChEMBL using their web interface.
I wrote a little Python script (see below) that takes as input a file listing ChEMBL ids, one per line, e.g.:
CHEMBL608855
CHEMBL609156
CHEMBL592105
CHEMBL592123
CHEMBL592125
CHEMBL592332
CHEMBL592344
CHEMBL1197993
CHEMBL596643
CHEMBL596852
To run it you can type e.g.:
% python3 retrieve_smiles_from_chembl_for_compoundlist.py input_list output_file
It then writes a tab-separated output file with the SMILES for those ids, e.g.:
molecule_chembl_id canonical_smiles
CHEMBL596643 O=c1nc(C=Cc2ccc(Cl)cc2)oc2ccccc12
CHEMBL596852 COc1ccc(-c2nc3cc(Cc4ccc5[nH]c(-c6ccc(OC)cc6)nc5c4)ccc3[nH]2)cc1
CHEMBL608855 CC(C)(C)c1ccc(C2CC3=Nc4ccccc4N(C(=O)c4ccccc4Cl)C(c4ccc(F)cc4)C3=C(O)C2)cc1
CHEMBL609156 CCOC(=O)c1c[nH]c2c(CC)cccc2c1=O
CHEMBL592105 CN(C)c1ccc(C(O)(c2ccc(N(C)C)cc2)c2ccc(N(C)C)cc2)cc1
CHEMBL592344 CCOc1ccccc1CNC(=O)C1c2ccccc2C(=O)N(CC(C)C)C1c1cccs1
CHEMBL592332 CCOc1ccc2c(c1)CN(Cc1ccc(Cl)cc1)CO2
CHEMBL592123 CCCOCc1cc(CN2CCN(c3cccc(Cl)c3)CC2)c(O)c2ncccc12
CHEMBL592125 O=C(Cc1ccccc1)NC(c1ccc(Cl)cc1)c1c(O)ccc2ccccc12
My Python script
import os
import sys
import pandas as pd # uses pandas python module to view and analyse data
import requests # this is used to access json files
#====================================================================#
# call the 'molecule' API to find the molecular properties of our list of compounds:
def find_properties_of_compounds(cmpd_chembl_ids):
    """Fetch the canonical SMILES for a batch of ChEMBL compounds.

    Calls the ChEMBL 'molecule' web service for the given identifiers and
    follows the paginated response until every page has been collected.

    Args:
        cmpd_chembl_ids: Iterable of ChEMBL compound identifiers (strings),
            e.g. ["CHEMBL608855", "CHEMBL609156"].

    Returns:
        A pandas DataFrame with the columns 'molecule_chembl_id' and
        'canonical_smiles', one row per molecule record returned by the
        service. 'canonical_smiles' is missing (NaN) for records whose
        'molecule_structures' entry is null. The frame is empty, but still
        has both columns, when no identifiers are given or when the service
        returns no records.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
    """
    output_columns = ['molecule_chembl_id', 'canonical_smiles']

    # Nothing to look up: skip the web call rather than send an empty filter.
    ids = list(cmpd_chembl_ids)
    if not ids:
        return pd.DataFrame(columns=output_columns)

    # Records requested per page. The API returns 20 records per call by
    # default and at most 1000, so several pages may have to be gathered.
    limit = 100
    # Seconds to wait for the server; without this a stalled connection
    # would block the script indefinitely.
    timeout = 60

    url_stem = "https://www.ebi.ac.uk"  # 'next' links from the API are relative to this
    url_full_string = url_stem + "/chembl/api/data/molecule.json?molecule_chembl_id__in={}&limit={}".format(",".join(ids), limit)

    # Walk through the pages, collecting the 'molecules' list from each one.
    molecules = []
    next_url = url_full_string
    while next_url:
        response = requests.get(next_url, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        page = response.json()
        molecules.extend(page['molecules'])
        next_path = page['page_meta']['next']
        next_url = url_stem + next_path if next_path else None

    # No matching records: return an empty, correctly-shaped frame instead
    # of failing on the column selection below.
    if not molecules:
        return pd.DataFrame(columns=output_columns)

    # Keep only the columns of interest; building the frame this way also
    # avoids assigning a new column onto a slice of a larger frame.
    mol_df = pd.DataFrame(molecules, columns=['molecule_chembl_id', 'molecule_structures'])

    # 'molecule_structures' holds a dictionary per compound; pull the SMILES
    # out of it, skipping compounds for which the cell is null.
    has_structure = mol_df['molecule_structures'].notnull()
    mol_df['canonical_smiles'] = mol_df.loc[has_structure, 'molecule_structures'].apply(lambda x: x['canonical_smiles'])

    return mol_df[output_columns]
#====================================================================#
def read_input_list_of_compounds(input_compoundlist_file, output_file):
    """Look up SMILES for every compound in a list file and write them out.

    The input file is read line by line; the first whitespace-separated
    token of each line is taken as a ChEMBL compound id, and blank lines
    are skipped. Ids are sent to find_properties_of_compounds() in batches
    of 10, and each batch's result is appended to the output file as
    tab-separated text. The column header is written once, before the first
    batch, even when the file holds fewer than 10 ids.

    Args:
        input_compoundlist_file: Path of the text file listing ChEMBL ids.
        output_file: Path of the tab-separated file to create or overwrite.
    """
    batch_size = 10  # number of compounds looked up per web-service call
    cnt = 0  # running count of compound ids read so far
    header_pending = True  # True until the first batch has been written
    compounds = []  # ids waiting to be looked up

    with open(input_compoundlist_file, "r") as in_handle:
        # newline="" is what pandas asks for when to_csv is given an open
        # handle, so that line endings are not translated a second time.
        with open(output_file, "w", newline="") as out_handle:
            for line in in_handle:
                fields = line.split()
                if not fields:
                    # Blank or whitespace-only line: nothing to look up.
                    continue
                cnt += 1
                compounds.append(fields[0])  # e.g. CHEMBL10
                if len(compounds) < batch_size:
                    continue

                # A full batch has been collected: fetch and write it.
                print(cnt,"Finding compound info. for compounds",compounds)
                mol_df = find_properties_of_compounds(compounds)
                mol_df.to_csv(out_handle, sep="\t", index=False, header=header_pending)
                header_pending = False
                compounds = []

            # Look up whatever is left over after the last full batch.
            if compounds:
                print(cnt,"Finding compound info. for compounds",compounds)
                mol_df = find_properties_of_compounds(compounds)
                mol_df.to_csv(out_handle, sep="\t", index=False, header=header_pending)
#====================================================================#
def main():
    """Command-line entry point.

    Expects exactly two arguments: the path of an existing file listing
    ChEMBL compound ids, and the path of the output file to write. Prints
    a usage message and exits with status 1 if the arguments are wrong.
    """
    args = sys.argv

    # Validate the command line: two arguments, and the input file must exist.
    if len(args) != 3 or not os.path.exists(args[1]):
        print("Usage: %s input_compoundlist_file output_file" % args[0])
        sys.exit(1)

    # args[1] is the list of ChEMBL compounds of interest, args[2] the output file.
    input_compoundlist_file, output_file = args[1], args[2]

    # Read the compound list and write the SMILES for each compound.
    print("Reading in compound list...")
    read_input_list_of_compounds(input_compoundlist_file, output_file)

    print("FINISHED\n")
#====================================================================#
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
#====================================================================#