Monday, 13 July 2015

Parsing a phylogenetic tree in Python using ETE2

I've just started learning about ETE2 for parsing and plotting phylogenetic trees in Python, on the recommendation of my colleague Diogo Ribeiro (thanks, Diogo!).

ETE2 is currently in Python2, but apparently a Python3 version is coming soon (I hope so!)

For example, here's a simple script that reads in a tree, traverses all of its nodes, and for each node prints its children, its parent, all the leaves under it (its descendants), and its ancestors back to the root:

import os
import sys
from ete2 import Tree

# note: this script is in python2, as ete2 is in python2
#====================================================================#

def _get_ancestor_names(node):
    """Return the names of the ancestors of `node`, nearest first, up to the root.

    Walks the `.up` links until a node reporting `is_root()` is reached.
    Returns an empty list when `node` is itself the root.
    """
    names = []
    current = node
    while not current.is_root():
        current = current.up
        names.append(current.name)
    return names


def main():
    """Read a Newick tree and print the relationships of every node.

    Expects exactly one command-line argument: the path to an existing
    Newick file. Otherwise a usage message is written to stderr and the
    script exits with status 1.

    Four postorder passes are made over the tree, printing in turn:
    each node's children, each non-root node's parent, each internal
    node's descendant leaves, and each non-root node's ancestors back to
    the root. "FINISHED" is printed at the end.

    All output uses single-argument print(...) calls, which behave the
    same under Python 2 (no __future__ import needed) and Python 3.
    """
    # check the command-line arguments:
    if len(sys.argv) != 2 or not os.path.exists(sys.argv[1]):
        # usage errors go to stderr so they do not pollute piped output
        sys.stderr.write("Usage: %s newick_file\n" % sys.argv[0])
        sys.exit(1)
    newick_file = sys.argv[1]

    # the code below is based on examples on
    # http://pythonhosted.org/ete2/tutorial/tutorial_trees.html
    # load a tree from a file. The root node is put into 't'
    t = Tree(newick_file, format=1)  # format=1 means internal node names are loaded

    # pass 1: the children of each node
    for node in t.traverse("postorder"):
        for child in node.get_children():
            print("node %s has child %s" % (node.name, child.name))

    # pass 2: the parent of each node (the root has none)
    for node in t.traverse("postorder"):
        if not node.is_root():
            print("node %s has parent %s" % (node.name, node.up.name))

    # pass 3: the descendant species (leaves) of each internal node
    for node in t.traverse("postorder"):
        if not node.is_leaf():
            leaf_names = [leaf.name for leaf in node.get_leaves()]
            print("node %s has descendants %s" % (node.name, ','.join(leaf_names)))

    # pass 4: the ancestors of each node, back to the root
    for node in t.traverse("postorder"):
        if not node.is_root():
            ancestor_names = _get_ancestor_names(node)
            print("node %s has ancestors %s" % (node.name, ','.join(ancestor_names)))

    print("FINISHED")

#====================================================================#

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

#====================================================================#


1 comment:

flashton said...

this helped me, thanks for posting!