As a part of coursework, I had to build an UPGMA tree from a given distance matrix and a list of species names. As a beginner/intermediate in python I saw this as a good opportunity to learn about classes. The code is as follows:
"""
UPGMA - Unweighted pair group method using arithmetic averages
Takes as input the distance matrix of species as a numpy array
Returns tree either as a dendrogram or in Newick format
"""
#%% import modules
import numpy as np
import itertools
#%% class for nodes
class Node:
"""
Data structure to store node of a UPGMA tree
"""
def __init__(self, left=None, right=None, up_height=0.0, down_height=0.0):
"""
Creating a node.
For a single taxon, set taxon name as self.left, leave right as none.
For an operational taxonomic unit(OTU) set left and right to child nodes.
Parameters
----------
left : default = none, taxon label
right : default = none, taxon label
up_height : float, default = 0.0, dist to parent node, if any
down_height : float, default = 0.0, dist to child node, if any
"""
self.left = left
self.right = right
self.uh = up_height
self.dh = down_height
def leaves(self) -> list:
"""
Method to find the taxa under any given node, effectively equivalent to
finding leaves of a binary tree. Only lists original taxa and not OTUs.
Returns a list of node names, not nodes themselves.
"""
if self == None:
return []
if self.right == None:
return [self.left]
leaves = self.left.leaves() + self.right.leaves()
return leaves
def __len__(self) -> int:
"""
Method to define len() of a node.
Returns the number of original taxa under any given node.
"""
return sum(1 for taxa in self.leaves())
def __repr__(self) -> str:
"""
Method to give readable print output
"""
return "-".join(self.leaves())
#%% class for UPGMA
class UPGMA:
def __init__(self, dist_matrix: np.ndarray, taxa: list):
"""
Initialize an UPGMA class.
Takes a nxn distance matrix as input. A list of n taxon id is required
in the same order as the distance matrix row/column
Parameters
----------
dist_matrix : numpy array, distance matrix of species
taxa : list of int or str to identify taxa
"""
self.distances = dist_matrix
self.taxa = taxa
self.build_tree(self.distances, self.taxa)
def build_tree(self, dist_matrix: np.ndarray, taxa: list) -> Node:
"""
Method to construct a tree from a given distance matrix and taxa list.
Parameters
----------
dist_matrix : np.ndarray of pairwise distances
taxa : list of taxa id. Elements of lists have to be unique
Returns the root node for constructed tree.
"""
# creating node for each taxa
nodes = list(Node(taxon) for taxon in taxa)
# dictionary row/column id -> node
rc_to_node = dict([i, j] for i, j in enumerate(nodes))
# dictionary taxa -> row/column id
taxa_to_rc = dict([i, j] for j, i in enumerate(taxa))
# make copy of dist matrix to work on
work_matrix = dist_matrix
# set all diagonal elements to infinity for ease of finding least distance
work_matrix = np.array(work_matrix, dtype=float)
np.fill_diagonal(work_matrix, np.inf)
# loop
while len(nodes) > 1:
# finding (row, col) of least dist
least_id = np.unravel_index(work_matrix.argmin(), work_matrix.shape, "C")
least_dist = work_matrix[least_id[0], least_id[1]]
# nodes corresponding to (row, col)
node1, node2 = rc_to_node[least_id[0]], rc_to_node[least_id[1]]
# add OTU with node1 and node2 as children. set heights of nodes
new_node = Node(node2, node1)
nodes.append(new_node)
node1.uh = least_dist / 2 - node1.dh
node2.uh = least_dist / 2 - node2.dh
new_node.dh = least_dist / 2
nodes.remove(node1)
nodes.remove(node2)
# create new working distance matrix
work_matrix = self.update_distance(dist_matrix, nodes, taxa_to_rc)
# update row/col id -> node dictionary
rc_to_node = dict([i, j] for i, j in enumerate(nodes))
# set tree to root
self.tree = nodes[0]
def update_distance(
self, dist_matrix: np.ndarray, nodes: list, taxa_to_rc: dict
) -> np.ndarray:
"""
Method to make a new distance matrix with newer node list.
Parameters
----------
dist_matrix : np.ndarray of pairwise distances for all taxa
nodes : list of updated nodes
taxa_to_rc : dict for taxa -> row/col id
Returns np.ndarray of pairwise distances for updated nodes
"""
# dictionary for node -> row/col id
node_to_rc = dict([i, j] for j, i in enumerate(nodes))
rc = len(nodes)
new_dist_matrix = np.zeros((rc, rc), dtype=float)
for node1 in nodes:
row = node_to_rc[node1]
for node2 in nodes:
node_pairs = list(itertools.product(node1.leaves(), node2.leaves()))
col = node_to_rc[node2]
new_dist_matrix[row, col] = sum(
dist_matrix[taxa_to_rc[i], taxa_to_rc[j]] for i, j in node_pairs
) / len(node_pairs)
np.fill_diagonal(new_dist_matrix, np.inf)
return new_dist_matrix
def tree_to_newick(t) -> str:
"""
Function to convert tree to Newick, slightly modified form of the tree.py version.
Takes the root node of an UPGMA tree as input
"""
if t.right == None:
return t.left + ":" + str(t.uh)
else:
return (
"("
+ ",".join([tree_to_newick(x) for x in [t.left, t.right]])
+ "):"
+ str(t.uh)
)
#%% main
if __name__ == "__main__":
# data from table 3 of Fitch and Margoliash, Construction of Phylogenetic trees
taxa = ["Turtle", "Human", "Tuna", "Chicken", "Moth", "Monkey", "Dog"]
distances = np.array(
[
[0, 19, 27, 8, 33, 18, 13],
[19, 0, 31, 18, 36, 1, 13],
[27, 31, 0, 26, 41, 32, 29],
[8, 18, 26, 0, 31, 17, 14],
[33, 36, 41, 31, 0, 35, 28],
[18, 1, 32, 17, 35, 0, 12],
[13, 13, 29, 14, 28, 12, 0],
]
)
x = UPGMA(distances, taxa).tree
print(tree_to_newick(x))
Please suggest improvements where possible.