Thursday 16 September 2021

A Python script to submit lots of jobs to the farm

I often need to split up an input file because it's huge, and submit lots of jobs to the Sanger compute farm on all the little chunks. 

 For example, I had a file of BLAT results, and wanted to run a script on these results, but the file was too big. 

Anyway, the BLAT file was enormous, so I split it up into smaller files of 10,000 lines each, using:

% split -l 10000 enormous_blat.txt fblat 

This made files fblataa, fblatab... (47 files) 

On each of these files I wanted to run my script (which is called 'strip_off_adaptors.py') on each of these small chunks: ie. 

% python3 strip_off_adaptors.py fblataa 

% python3 strip_off_adaptors.py fblataa 

etc. 

But that was going to take me ages to submit 47 jobs to the farm, typing all those 'bsub' commands. Well at least 10 minutes! 

 So I decided to write a Python script to submit the jobs (see my script below). 

It takes a file with a list of the fblat* files as its input. 

Then it makes a subdirectory for each of each fblat* file (e.g. fblataa), e.g. fblataadir. 

Then it submits the job for fblataa in the directory fblataadir. And so on, for fblatab, fblatac, etc. 

 It can be run using: 

% python3 submit_water_jobs.py fblat_file_list lib_all_R1_001.fa linker.fa 

(where lib_all_R1_001.fa and linker.fa are just some other input files required by my script 'strip_off_adaptors.py'.) 

Easy-peasy! 

 

Here's my script submit_water_jobs.py, you can alter it to submit jobs for lots of chunks of any type of file to a compute farm using bsub: 

 

import os
import sys
from collections import defaultdict

#====================================================================#

def read_input_file_list(input_file):
    """read in the input file with the list of input BLAT files"""
   
    # define a list to contain the names of the input BLAT files:
    input_file_list = list()
  
    # read in the input file:
    fileObj = open(input_file, "r")
    for line in fileObj:
        line = line.rstrip()
        temp = line.split()
        input_file_name = temp[0]
        input_file_list.append(input_file_name)
    fileObj.close()

    return input_file_list     

#====================================================================#

def main():

    # check the command-line arguments:         
    if len(sys.argv) != 4 or os.path.exists(sys.argv[1]) == False or os.path.exists(sys.argv[2]) == False or os.path.exists(sys.argv[3]) == False:
        print("Usage: %s input_list_file input_reads_fasta input_linker_fasta" % sys.argv[0])
        sys.exit(1)
    input_file = sys.argv[1] # input file with list of input BLAT files  
    input_reads_fasta = sys.argv[2] # input fasta file of reads
    input_linker_fasta = sys.argv[3] # input fasta file with the linker sequence

    # read in the input file with list of input BLAT files
    input_file_list = read_input_file_list(input_file)

    # get the current directory:
    current_dir = os.getcwd()

    # for each input BLAT file, submit the 'water' job:
 
    for blat_file in input_file_list:
        # make a directory for running this job
        newdir = '%sdir' % blat_file # e.g. fblataadir
        newdir2 = os.path.join(current_dir,newdir)
        os.mkdir(newdir2)
        os.chdir(newdir2)
        # make a soft-link to the input BLAT file:
        blat_file2 = os.path.join(current_dir,blat_file)
        blat_file3 = os.path.join(newdir2,blat_file)
        command0 = "ln -s %s %s" % (blat_file2, blat_file3) # blat_file3 is in the new directory
        os.system(command0) 

        # make a soft-link to the input fasta file of reads:
        input_reads_fasta2 = os.path.join(current_dir,input_reads_fasta)
        input_reads_fasta3 = os.path.join(newdir2, input_reads_fasta)
        command1 = "ln -s %s %s" % (input_reads_fasta2, input_reads_fasta3) # input_reads_fasta3 is in the new directory
        os.system(command1)
        # make a soft-link to the input file with the linker sequence:
        input_linker_fasta2 = os.path.join(current_dir, input_linker_fasta)
        input_linker_fasta3 = os.path.join(newdir2, input_linker_fasta)
        command2 = "ln -s %s %s" % (input_linker_fasta2, input_linker_fasta3) # input_linker_fasta3 is in the new directory
        os.system(command2)
        # define the name of the output file:
        output_file = "%s2" % blat_file3 # output_file is in the new directory
        # submit the job to run 'water' between the reads and the linker:
        command3 = "python3 ~alc/Documents/git/Python/strip_off_adaptors.py %s %s %s %s 0.5" % (blat_file3, input_reads_fasta3, input_linker_fasta3, output_file)
        # specify the bsub output and error file names:
        bsub_out = "myscript.o"
        bsub_err = "myscript.e"
        bsub_out2 = os.path.join(newdir2,bsub_out) # bsub_out2 is in the new directory
        bsub_err2 = os.path.join(newdir2,bsub_err) # bsub_err2 is in the new directory
        # submit farm job:
        jobname = "%s" % blat_file
        # request 5000 Mbyte of RAM for the job:
        command4 = 'bsub -o %s -e %s -R "select[mem>5000] rusage[mem=5000]" -M5000 -J%s "%s"' % (bsub_out2, bsub_err2, jobname, command3)
        print(command4)
        os.system(command4)
        os.chdir(current_dir)

#====================================================================#

if __name__=="__main__":
    main()

#====================================================================#