Tuesday 22 June 2021

Retrieving the SMILES for a list of ChEMBL identifiers

Today I needed to retrieve the SMILES for a list of ChEMBL identifiers.

I had to refresh my memory on how to retrieve data from ChEMBL using their web interface.

I wrote a little Python script (see below) that takes a file with a list of ChEMBL ids as input, e.g. :

CHEMBL608855
CHEMBL609156
CHEMBL592105
CHEMBL592123
CHEMBL592125
CHEMBL592332
CHEMBL592344
CHEMBL1197993
CHEMBL596643
CHEMBL596852

To run it you can type e.g.:

% python3 retrieve_smiles_from_chembl_for_compoundlist.py input_list output_file

It then makes an output file with the SMILES for those ids (see below), e.g.

molecule_chembl_id      canonical_smiles
CHEMBL596643    O=c1nc(C=Cc2ccc(Cl)cc2)oc2ccccc12
CHEMBL596852    COc1ccc(-c2nc3cc(Cc4ccc5[nH]c(-c6ccc(OC)cc6)nc5c4)ccc3[nH]2)cc1
CHEMBL608855    CC(C)(C)c1ccc(C2CC3=Nc4ccccc4N(C(=O)c4ccccc4Cl)C(c4ccc(F)cc4)C3=C(O)C2)cc1
CHEMBL609156    CCOC(=O)c1c[nH]c2c(CC)cccc2c1=O
CHEMBL592105    CN(C)c1ccc(C(O)(c2ccc(N(C)C)cc2)c2ccc(N(C)C)cc2)cc1
CHEMBL592344    CCOc1ccccc1CNC(=O)C1c2ccccc2C(=O)N(CC(C)C)C1c1cccs1
CHEMBL592332    CCOc1ccc2c(c1)CN(Cc1ccc(Cl)cc1)CO2
CHEMBL592123    CCCOCc1cc(CN2CCN(c3cccc(Cl)c3)CC2)c(O)c2ncccc12
CHEMBL592125    O=C(Cc1ccccc1)NC(c1ccc(Cl)cc1)c1c(O)ccc2ccccc12

My Python script

import os
import sys
import pandas as pd # uses pandas python module to view and analyse data
import requests # this is used to access json files

#====================================================================#

# call the 'molecule' API to find the molecular properties of our list of compounds:

def find_properties_of_compounds(cmpd_chembl_ids):

    #For the identified compounds, extract their molecular properties and other information from the 'molecule' ChEMBL API
    #Specify the input parameters:
    cmpd_chembl_ids = ",".join(cmpd_chembl_ids[0:]) #Amend the format of the text string of compounds so that it is suitable for the API call
    limit = 100 #Limit the number of records pulled back for each url call

    # Set up the call to the ChEMBL 'molecule' API
    # Remember that there is a limit to the number of records returned in any one API call (default is 20 records, maximum is 1000 records)
    # So need to iterate over several pages of records to gather all relevant information together!
    url_stem = "https://www.ebi.ac.uk" #This is the stem of the url
    url_full_string = url_stem + "/chembl/api/data/molecule.json?molecule_chembl_id__in={}&limit={}".format(cmpd_chembl_ids, limit) #This is the full url with the specified input parameters
    url_full = requests.get( url_full_string ).json() #This calls the information back from the API using the 'requests' module, and converts it to json format
    url_molecules = url_full['molecules'] #This is a list of the results for activities

    # This 'while' loop iterates over several pages of records (if required), and collates the list of results
    while url_full['page_meta']['next']:
        url_full = requests.get(url_stem + url_full['page_meta']['next']).json()
        url_molecules = url_molecules + url_full['molecules'] #Add result (as a list) to previous list of results

    #Convert the list of results into a Pandas dataframe:
    mol_df = pd.DataFrame(url_molecules)

    #Print out some useful information:
    #print("This is the url string that calls the 'Molecule' API with the specified query\n{}".format(url_full_string) )
    #Print("\nThese are the available columns for the Molecule API:\n{}".format(mol_df.columns))

    # Select only relevant columns:
    mol_df = mol_df[[ 'molecule_chembl_id','molecule_structures']]

    # And convert cells containing a dictionary to individual columns in the dataframe so that is it easier to filter!
    # Molecule hierarchy:
    # mol_df['parent_chembl_id'] = mol_df['molecule_hierarchy'].apply(lambda x: x['parent_chembl_id'])
    # Note that the above line gives an error message for some compounds e.g. CHEMBL1088885 that do not seem to have parent stored. However it should get printed anyway with molecule_hierarchy.

    #Physicochemical properties (only report if cells are not null)
    mol_df['canonical_smiles'] = mol_df.loc[ mol_df['molecule_structures'].notnull(), 'molecule_structures'].apply(lambda x: x['canonical_smiles'])
    mol_df = mol_df[[ 'molecule_chembl_id', 'canonical_smiles']]

    return mol_df

#====================================================================#

def read_input_list_of_compounds(input_compoundlist_file, output_file):

    cnt = 0
    # open the output file:
    with open(output_file, 'w') as f:

        # read in the list of oompounds:
        compounds = list() # create an empty list to store the compounds in
        inputfileObj = open(input_compoundlist_file, "r")
        compound_set_count = 0 # we will retrieve data for 10 compounds at a time
        for line in inputfileObj:
            line = line.rstrip()
            temp = line.split()
            # CHEMBL10
            compound = temp[0] # e.g. CHEMBL10  
            cnt += 1
            compounds.append(compound)  
            # if the list of compounds has 10 compounds, find the compound info. for these compounds:       
            if len(compounds) == 10:
                compound_set_count += 1
                # using a list of known compounds, find compound info. for those compounds:       
                print(cnt,"Finding compound info. for compounds",compounds)
                mol_df = find_properties_of_compounds(compounds)

                #Export the data frame to a csv file:
                #Followed expamples from https://stackoverflow.com/questions/37357727/pandas-write-tab-separated-dataframe-with-literal-tabs-with-no-quotes
                # and https://datatofish.com/export-dataframe-to-csv and https://stackoverflow.com/questions/17530542/how-to-add-pandas-data-to-an-existing-csv-file
                if compound_set_count == 1:
                    mol_df.to_csv(f, sep="\t", index=None, header=True) # only write a header for the first set of 10 targets
                else:
                    mol_df.to_csv(f, sep="\t", index=None, header=False)
                # empty the list of compounds:
                compounds.clear() # from https://www.geeksforgeeks.org/different-ways-to-clear-a-list-in-python/
        inputfileObj.close()
        # if there are some compounds left in the compound list, find their properties:
        if len(compounds) > 0:
            # find the compound info for these targets:
            print(cnt,"Finding compound info. for compounds",compounds)
            mol_df = find_properties_of_compounds(compounds)
            mol_df.to_csv(f, sep="\t", index=None, header=False)

#====================================================================#

def main():

    # check the command-line arguments:
    if len(sys.argv) != 3 or os.path.exists(sys.argv[1]) == False:
        print("Usage: %s input_compoundlist_file output_file" % sys.argv[0])
        sys.exit(1)
    input_compoundlist_file = sys.argv[1] # input file with a list of ChEMBL compounds of interest
    output_file = sys.argv[2]

    # read in the input list of compounds of interest:
    print("Reading in compound list...")
    read_input_list_of_compounds(input_compoundlist_file, output_file)

    print("FINISHED\n")

#====================================================================#

if __name__=="__main__":
    main()

#====================================================================#