I often need to split up a huge input file and submit lots of jobs to the Sanger compute farm, one for each of the little chunks.
For example, I had an enormous file of BLAT results that I wanted to run a script on, but it was far too big to process in one go. So I split it up into smaller files of 10,000 lines each, using:
% split -l 10000 enormous_blat.txt fblat
This made files fblataa, fblatab... (47 files)
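(As an aside: if you ever find yourself without 'split' to hand, a few lines of Python will do the same job. Here's a rough sketch, not the exact command I used; note that it names the chunks fblat000, fblat001, ... rather than using split's aa, ab, ... suffixes:)

def split_file(input_file, lines_per_chunk=10000, prefix="fblat"):
    """Split input_file into chunks of lines_per_chunk lines each."""
    out = None
    with open(input_file) as infile:
        for count, line in enumerate(infile):
            # start a new chunk every lines_per_chunk lines:
            if count % lines_per_chunk == 0:
                if out is not None:
                    out.close()
                out = open("%s%03d" % (prefix, count // lines_per_chunk), "w")
            out.write(line)
    if out is not None:
        out.close()

split_file("enormous_blat.txt")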
I wanted to run my script (which is called 'strip_off_adaptors.py') on each of these small chunks, i.e.
% python3 strip_off_adaptors.py fblataa
% python3 strip_off_adaptors.py fblatab
etc.
But that was going to take me ages: submitting 47 jobs to the farm, typing out all those 'bsub' commands. Well, at least 10 minutes!
So I decided to write a Python script to submit the jobs (see my script below).
It takes a file with a list of the fblat* files as its input.
Then it makes a subdirectory for each fblat* file, e.g. fblataadir for fblataa.
Then it submits the job for fblataa in the directory fblataadir. And so on, for fblatab, fblatac, etc.
It can be run using:
% python3 submit_water_jobs.py fblat_file_list lib_all_R1_001.fa linker.fa
(where lib_all_R1_001.fa and linker.fa are just some other input files required by my script 'strip_off_adaptors.py').
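(By the way, fblat_file_list is just a plain text file with one fblat* file name per line; something like this will make it:)
% ls fblat?? > fblat_file_list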
Easy-peasy!
Here's my script submit_water_jobs.py; you can alter it to submit jobs for lots of chunks of any type of file to a compute farm using bsub:
import os
import sys
#====================================================================#
def read_input_file_list(input_file):
    """read in the input file with the list of input BLAT files"""

    # define a list to contain the names of the input BLAT files:
    input_file_list = list()

    # read in the input file:
    fileObj = open(input_file, "r")
    for line in fileObj:
        line = line.rstrip()
        temp = line.split()
        input_file_name = temp[0]
        input_file_list.append(input_file_name)
    fileObj.close()

    return input_file_list
#====================================================================#
def main():
    # check the command-line arguments:
    if len(sys.argv) != 4 or not os.path.exists(sys.argv[1]) or not os.path.exists(sys.argv[2]) or not os.path.exists(sys.argv[3]):
        print("Usage: %s input_list_file input_reads_fasta input_linker_fasta" % sys.argv[0])
        sys.exit(1)
    input_file = sys.argv[1] # input file with the list of input BLAT files
    input_reads_fasta = sys.argv[2] # input fasta file of reads
    input_linker_fasta = sys.argv[3] # input fasta file with the linker sequence

    # read in the input file with the list of input BLAT files:
    input_file_list = read_input_file_list(input_file)

    # get the current directory:
    current_dir = os.getcwd()

    # for each input BLAT file, submit a job to the farm:
    for blat_file in input_file_list:
        # make a directory for running this job:
        newdir = '%sdir' % blat_file # e.g. fblataadir
        newdir2 = os.path.join(current_dir, newdir)
        os.mkdir(newdir2)
        os.chdir(newdir2)

        # make a soft-link to the input BLAT file:
        blat_file2 = os.path.join(current_dir, blat_file)
        blat_file3 = os.path.join(newdir2, blat_file) # blat_file3 is in the new directory
        command0 = "ln -s %s %s" % (blat_file2, blat_file3)
        os.system(command0)

        # make a soft-link to the input fasta file of reads:
        input_reads_fasta2 = os.path.join(current_dir, input_reads_fasta)
        input_reads_fasta3 = os.path.join(newdir2, input_reads_fasta) # input_reads_fasta3 is in the new directory
        command1 = "ln -s %s %s" % (input_reads_fasta2, input_reads_fasta3)
        os.system(command1)

        # make a soft-link to the input file with the linker sequence:
        input_linker_fasta2 = os.path.join(current_dir, input_linker_fasta)
        input_linker_fasta3 = os.path.join(newdir2, input_linker_fasta) # input_linker_fasta3 is in the new directory
        command2 = "ln -s %s %s" % (input_linker_fasta2, input_linker_fasta3)
        os.system(command2)

        # define the name of the output file:
        output_file = "%s2" % blat_file3 # output_file is in the new directory

        # the command to run 'strip_off_adaptors.py' (which runs 'water' between the reads and the linker):
        command3 = "python3 ~alc/Documents/git/Python/strip_off_adaptors.py %s %s %s %s 0.5" % (blat_file3, input_reads_fasta3, input_linker_fasta3, output_file)

        # specify the bsub output and error file names:
        bsub_out = "myscript.o"
        bsub_err = "myscript.e"
        bsub_out2 = os.path.join(newdir2, bsub_out) # bsub_out2 is in the new directory
        bsub_err2 = os.path.join(newdir2, bsub_err) # bsub_err2 is in the new directory

        # submit the farm job, requesting 5000 Mbyte of RAM:
        jobname = blat_file
        command4 = 'bsub -o %s -e %s -R "select[mem>5000] rusage[mem=5000]" -M5000 -J%s "%s"' % (bsub_out2, bsub_err2, jobname, command3)
        print(command4)
        os.system(command4)

        os.chdir(current_dir)
#====================================================================#
if __name__ == "__main__":
    main()
#====================================================================#
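One thing you might want to change: I'm building each bsub command as one big string and passing it to os.system, which is fine here but can get fiddly if your file names ever contain spaces or other characters that the shell cares about. Here's a rough sketch of an alternative using the subprocess module (submit_bsub is just a made-up helper name; the flags are the same ones as in command4 above):

import subprocess

def submit_bsub(jobname, out_file, err_file, command, mem_mb=5000):
    """Submit 'command' (a shell command string) to LSF via bsub,
    requesting mem_mb Mbyte of RAM."""
    bsub_args = ["bsub",
                 "-o", out_file,
                 "-e", err_file,
                 "-R", "select[mem>%d] rusage[mem=%d]" % (mem_mb, mem_mb),
                 "-M", str(mem_mb),
                 "-J", jobname,
                 command]
    subprocess.run(bsub_args, check=True)

Then the print(command4)/os.system(command4) lines in the loop above could be replaced with a call like submit_bsub(jobname, bsub_out2, bsub_err2, command3).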