UPGMA tree building in python

Question

As a part of coursework, I had to build an UPGMA tree from a given distance matrix and a list of species names. As a beginner/intermediate in python I saw this as a good opportunity to learn about classes. The code is as follows:

"""
UPGMA - Unweighted pair group method using arithmetic averages
Takes as input the distance matrix of species as a numpy array
Returns tree either as a dendrogram or in Newick format
"""

#%% import modules
import numpy as np
import itertools

#%% class for nodes
class Node:
    """
    Data structure to store node of a UPGMA tree
    """

    def __init__(self, left=None, right=None, up_height=0.0, down_height=0.0):
        """
        Creating a node.
        For a single taxon, set taxon name as self.left, leave right as none.
        For an operational taxonomic unit(OTU) set left and right to child nodes.

        Parameters
        ----------
        left : default = none, taxon label
        right : default = none, taxon label
        up_height : float, default = 0.0, dist to parent node, if any
        down_height : float, default = 0.0, dist to child node, if any
        """
        self.left = left
        self.right = right
        self.uh = up_height
        self.dh = down_height

    def leaves(self) -> list:
        """
        Method to find the taxa under any given node, effectively equivalent to
        finding leaves of a binary tree. Only lists original taxa and not OTUs.

        Returns a list of node names, not nodes themselves.
        """
        if self == None:
            return []
        if self.right == None:
            return [self.left]
        leaves = self.left.leaves() + self.right.leaves()
        return leaves

    def __len__(self) -> int:
        """
        Method to define len() of a node.

        Returns the number of original taxa under any given node.
        """
        return sum(1 for taxa in self.leaves())

    def __repr__(self) -> str:
        """
        Method to give readable print output
        """
        return "-".join(self.leaves())


#%% class for UPGMA
class UPGMA:
    def __init__(self, dist_matrix: np.ndarray, taxa: list):
        """
        Initialize an UPGMA class.
        Takes a nxn distance matrix as input. A list of n taxon id is required
        in the same order as the distance matrix row/column

        Parameters
        ----------
        dist_matrix : numpy array, distance matrix of species
        taxa : list of int or str to identify taxa
        """
        self.distances = dist_matrix
        self.taxa = taxa
        self.build_tree(self.distances, self.taxa)

    def build_tree(self, dist_matrix: np.ndarray, taxa: list) -> Node:
        """
        Method to construct a tree from a given distance matrix and taxa list.

        Parameters
        ----------
        dist_matrix : np.ndarray of pairwise distances
        taxa : list of taxa id. Elements of lists have to be unique

        Returns the root node for constructed tree.
        """
        # creating node for each taxa
        nodes = list(Node(taxon) for taxon in taxa)

        # dictionary row/column id -> node
        rc_to_node = dict([i, j] for i, j in enumerate(nodes))
        
        # dictionary taxa -> row/column id
        taxa_to_rc = dict([i, j] for j, i in enumerate(taxa))
        
        # make copy of dist matrix to work on
        work_matrix = dist_matrix
        # set all diagonal elements to infinity for ease of finding least distance
        work_matrix = np.array(work_matrix, dtype=float)
        np.fill_diagonal(work_matrix, np.inf)

        # loop
        while len(nodes) > 1:
            # finding (row, col) of least dist
            least_id = np.unravel_index(work_matrix.argmin(), work_matrix.shape, "C")
            least_dist = work_matrix[least_id[0], least_id[1]]
            # nodes corresponding to (row, col)
            node1, node2 = rc_to_node[least_id[0]], rc_to_node[least_id[1]]
            
            # add OTU with node1 and node2 as children. set heights of nodes
            new_node = Node(node2, node1)
            nodes.append(new_node)
            node1.uh = least_dist / 2 - node1.dh
            node2.uh = least_dist / 2 - node2.dh
            new_node.dh = least_dist / 2
            nodes.remove(node1)
            nodes.remove(node2)
           
            # create new working distance matrix
            work_matrix = self.update_distance(dist_matrix, nodes, taxa_to_rc)
            
            # update row/col id -> node dictionary
            rc_to_node = dict([i, j] for i, j in enumerate(nodes))
        # set tree to root
        self.tree = nodes[0]

    def update_distance(
        self, dist_matrix: np.ndarray, nodes: list, taxa_to_rc: dict
    ) -> np.ndarray:
        """
        Method to make a new distance matrix with newer node list.

        Parameters
        ----------
        dist_matrix : np.ndarray of pairwise distances for all taxa
        nodes : list of updated nodes
        taxa_to_rc : dict for taxa -> row/col id

        Returns np.ndarray of pairwise distances for updated nodes
        """
        # dictionary for node -> row/col id
        node_to_rc = dict([i, j] for j, i in enumerate(nodes))
        
        rc = len(nodes)
        new_dist_matrix = np.zeros((rc, rc), dtype=float)
        for node1 in nodes:
            row = node_to_rc[node1]
            for node2 in nodes:
                node_pairs = list(itertools.product(node1.leaves(), node2.leaves()))
                col = node_to_rc[node2]
                new_dist_matrix[row, col] = sum(
                    dist_matrix[taxa_to_rc[i], taxa_to_rc[j]] for i, j in node_pairs
                ) / len(node_pairs)
        np.fill_diagonal(new_dist_matrix, np.inf)
        return new_dist_matrix


def tree_to_newick(t) -> str:
    """
    Function to convert tree to Newick, slightly modified form of the tree.py version.
    Takes the root node of an UPGMA tree as input
    """
    if t.right == None:
        return t.left + ":" + str(t.uh)
    else:
        return (
            "("
            + ",".join([tree_to_newick(x) for x in [t.left, t.right]])
            + "):"
            + str(t.uh)
        )


#%% main
if __name__ == "__main__":
    # data from table 3 of Fitch and Margoliash, Construction of Phylogenetic trees
    taxa = ["Turtle", "Human", "Tuna", "Chicken", "Moth", "Monkey", "Dog"]
    distances = np.array(
        [
            [0, 19, 27, 8, 33, 18, 13],
            [19, 0, 31, 18, 36, 1, 13],
            [27, 31, 0, 26, 41, 32, 29],
            [8, 18, 26, 0, 31, 17, 14],
            [33, 36, 41, 31, 0, 35, 28],
            [18, 1, 32, 17, 35, 0, 12],
            [13, 13, 29, 14, 28, 12, 0],
        ]
    )
    x = UPGMA(distances, taxa).tree
    print(tree_to_newick(x))

Please suggest improvements where possible.

J_H · Accepted Answer · 2024-04-29 05:25:30Z

docstring

Thank you for this beautiful module-level docstring.

"""
UPGMA - Unweighted pair group method using arithmetic averages
Takes as input the distance matrix of species as a numpy array
Returns tree either as a dendrogram or in Newick format
"""

Maybe the second, and especially the third line are a bit premature. You might defer the details until we drill down to the class UPGMA docstring.

I think of module- and class- level docstrings as laying down the ground rules for what is "in scope". And by extension, if some maintenance engineer proposes to add a self.kitchen_sink attribute, the docstring should offer guidance that unrelated items are clearly "out of scope". We may also see concepts introduced and defined, such as the current pair group method.

For details of how to call and what to expect back, the class- and especially the method- level docstrings tend to be the more appropriate place for such discussion.

That said, I am always happy to see any sort of module-level docstring. They're not as rare as hen's teeth, but they are certainly less prevalent than function documentation.

BTW, in the Node __init__ ctor docstring, prefer "None" over "none" when describing that python singleton.

kwarg defaults

class Node: ...
    def __init__(self, left=None, right=None, up_height=...):

There is clearly a use case for "no right node", so that None default makes sense.

But I am skeptical that inviting caller to omit left is a sensible thing to do. For example, sometimes .leaves() will return [None]:

        if self == None:
            return []
        if self.right == None:
            return [self.left]

Worse, we risk an ugly de-reference exception:

        leaves = self.left.leaves() + ...

As a separate item, and, sorry, kind of a weird python community thing, please don't use == for a None check. It relates to None being a singleton; checking equality is different from checking an id() address. Prefer if self is None:.

length

    def __len__(self) -> int: ...        
        return sum(1 for taxa in self.leaves())

I'm sure that's correct. But it's just weird. It looks a bit like some SQL queries.

I happen to know that leaves is just a list; your annotation told me that. So return len(self.leaves()) and be done with it.

Also, I really like that you support repr(). But consider renaming to __str__, so you'll also support str(), which e.g. is used by print() and by f-string formatting.

type stability

class UPGMA:
    def __init__(self, dist_matrix: np.ndarray, taxa: list):
        ...
        taxa : list of int or str to identify taxa

Ugghhh, int or str, not sure how that will go. Oh, well, forewarned is forearmed. Consider putting def ..., taxa: list[int|str]): in the signature, so mypy will be on the same page. (I will just point out that it's easy for caller to turn integers into strings...)

conventional loop names

        rc_to_node = dict([i, j] for i, j in enumerate(nodes))        
        ...
        taxa_to_rc = dict([i, j] for j, i in enumerate(taxa))

It appears some copy-n-pasting was happening here.

Please don't state the second one in that way. Rather, assign ... = dict([j, i] for i, j in .... It will have the same effect.

For extra credit, use descriptive names like for i, node and for i, taxon.

cite a reference

Thank you so much for citing "Fitch and Margoliash [1967]". I appreciate it.

301_Moved_Permanently · Accepted Answer · 2024-04-30 13:37:34Z

Datastructure

        """
        Creating a node.
        For a single taxon, set taxon name as self.left, leave right as none.
        For an operational taxonomic unit(OTU) set left and right to child nodes.
        …
        """

You seem to try and hold either a leaf (taxon) or a node (OTU) in your node class, this is making your logic a bit more convoluted and confusing than it needs to be. Please, separate your concerns using several classes. Also your comment

    """
    Data structure to store node of a UPGMA tree
    """

and your init call for dataclasses here.

Considering how you need to build an OTU from 2 down nodes in your loop, I recommend an alternate Node constructor (@classmethod) to handle height management for you.

Algorithm

Your UPGMA class is only an __init__ calling a method. This is not a class, this is a function that has been artificially upgraded into a class, confusing the reader about why a state would be necessary to maintain here (hint, there isn't).

Just unfold the whole UPGMA body into a single function for ease of use and reasoning about.

You also use various dictionaries to map indexes (rc) to nodes or vice-versa. The first is superfluous as you can already index your list of Nodes using indexes, the need for the second can be toned down by a better use of enumerate into your working matrix update (update_distance).

Lastly, you can use np.mean to better indicate what you’re processing when updating the working matrix.

Proposed improvements

"""
UPGMA - Unweighted pair group method using arithmetic averages
Takes as input the distance matrix of species as a numpy array
Returns tree either as a dendrogram or in Newick format
"""

import itertools
from dataclasses import dataclass
from typing import Self, Generator

import numpy as np


@dataclass
class Leaf:
    """Data structure to store leaf of a UPGMA tree"""

    name: str
    up_height: float = 0.0

    def leaves(self) -> Generator[Self, None, None]:
        yield self

    def __str__(self):
        return f'{self.name}:{self.up_height}'


@dataclass
class Node:
    """
    Data structure to store OTU node of a UPGMA tree
    """

    left: Self | Leaf
    right: Self | Leaf
    up_height: float = 0.0
    down_height: float = 0.0

    @classmethod
    def from_height(cls, left: Self | Leaf, right: Self | Leaf, height: float):
        left.up_height = height if isinstance(left, Leaf) else height - left.down_height
        right.up_height = height if isinstance(right, Leaf) else height - right.down_height
        return cls(left, right, down_height=height)

    def leaves(self) -> Generator[Leaf, None, None]:
        """
        Method to find the taxa under any given node, effectively equivalent to
        finding leaves of a binary tree. Only lists original taxa and not OTUs.
        """
        yield from self.left.leaves()
        yield from self.right.leaves()

    def __len__(self) -> int:
        """
        Method to define len() of a node.

        Returns the number of original taxa under any given node.
        """
        return sum(1 for taxa in self.leaves())

    def __str__(self) -> str:
        """
        Method to give readable print output
        """
        return f"({self.left},{self.right}):{self.up_height}"


def upgma_tree(dist_matrix: np.ndarray, taxa: list[str]) -> Leaf | Node:
    if not taxa:
        raise ValueError('need at least one leaf (taxon)')

    size = len(taxa)
    work_matrix = np.array(dist_matrix, dtype=float)

    if work_matrix.shape != (size, size):
        raise ValueError('distance matrix should be squared in the number of taxa')

    # creating node for each taxa
    nodes = list(map(Leaf, taxa))
    taxa_to_rc = {j: i for i, j in enumerate(taxa)}

    while size > 1:
        # set all diagonal elements to infinity for ease of finding least distance
        np.fill_diagonal(work_matrix, np.inf)

        # finding (row, col) of least dist
        least_id = np.unravel_index(work_matrix.argmin(), work_matrix.shape, "C")
        least_dist = work_matrix[least_id]

        # nodes corresponding to (row, col)
        node1, node2 = (nodes[i] for i in least_id)
        
        # add OTU with node1 and node2 as children. set heights of nodes
        new_node = Node.from_height(node2, node1, least_dist / 2)
        nodes.remove(node1)
        nodes.remove(node2)
        nodes.append(new_node)
       
        # create new working distance matrix
        size -= 1  # Removing 2 nodes but adding only one new
        work_matrix = np.zeros((size, size), dtype=float)
        for row, node1 in enumerate(nodes):
            for col, node2 in enumerate(nodes):
                work_matrix[row, col] = np.mean([
                    dist_matrix[taxa_to_rc[i.name], taxa_to_rc[j.name]]
                    for i, j in itertools.product(node1.leaves(), node2.leaves())
                ])

    return nodes[-1]


if __name__ == "__main__":
    # data from table 3 of Fitch and Margoliash, Construction of Phylogenetic trees
    taxa = ["Turtle", "Human", "Tuna", "Chicken", "Moth", "Monkey", "Dog"]
    distances = np.array([
        [0, 19, 27, 8, 33, 18, 13],
        [19, 0, 31, 18, 36, 1, 13],
        [27, 31, 0, 26, 41, 32, 29],
        [8, 18, 26, 0, 31, 17, 14],
        [33, 36, 41, 31, 0, 35, 28],
        [18, 1, 32, 17, 35, 0, 12],
        [13, 13, 29, 14, 28, 12, 0],
    ])
    print(upgma_tree(distances, taxa))

Numpy

Now looking at your working matrix update algorithm and the one presented in the Wikipedia page you linked to, there seem to be quite a difference in complexity. Plus, your update doesn't seem to lean well into vectorization while adding two rows or two columns together using numpy is the core of its features.

And frankly, I didn't quite get why you'd want __len__ on your nodes until I realized the algorithm on Wikipedia needed it. So here is a version that follows the algorithm more closely and make use of vectorized numpy operations, the idea it to stack the weighted average distance of the current nodes at the bottom and right of the working matrix and then remove the rows and columns of the current nodes, in order to mimic what is happening in the nodes list:

"""
UPGMA - Unweighted pair group method using arithmetic averages
Takes as input the distance matrix of species as a numpy array
Returns tree either as a dendrogram or in Newick format
"""

from dataclasses import dataclass
from typing import Self, Generator

import numpy as np


@dataclass
class Leaf:
    """Data structure to store leaf of a UPGMA tree"""

    name: str
    up_height: float = 0.0

    def leaves(self) -> Generator[Self, None, None]:
        yield self

    def __len__(self):
        return 1

    def __str__(self):
        return f'{self.name}:{self.up_height}'


@dataclass
class Node:
    """
    Data structure to store OTU of a UPGMA tree
    """

    left: Self | Leaf
    right: Self | Leaf
    up_height: float = 0.0
    down_height: float = 0.0

    @classmethod
    def from_height(cls, left: Self | Leaf, right: Self | Leaf, height: float):
        left.up_height = height if isinstance(left, Leaf) else height - left.down_height
        right.up_height = height if isinstance(right, Leaf) else height - right.down_height
        return cls(left, right, down_height=height)

    def leaves(self) -> Generator[Leaf, None, None]:
        """
        Method to find the taxa under any given node, effectively equivalent to
        finding leaves of a binary tree. Only lists original taxa and not OTUs.
        """
        yield from self.left.leaves()
        yield from self.right.leaves()

    def __len__(self) -> int:
        """
        Method to define len() of a node.

        Returns the number of original taxa under any given node.
        """
        return len(self.left) + len(self.right)

    def __str__(self) -> str:
        """
        Method to give readable print output
        """
        return f"({self.left},{self.right}):{self.up_height}"


def upgma_tree(dist_matrix: np.ndarray, taxa: list[str]) -> Leaf | Node:
    if not taxa:
        raise ValueError('need at least one leaf (taxon)')

    size = len(taxa)
    work_matrix = np.array(dist_matrix, dtype=float)
    np.fill_diagonal(work_matrix, np.inf)

    if work_matrix.shape != (size, size):
        raise ValueError('distance matrix should be squared in the number of taxa')

    # creating node for each taxa
    nodes = list(map(Leaf, taxa))

    while len(nodes) > 1:
        # finding (row, col) of least dist
        least_id = np.unravel_index(work_matrix.argmin(), work_matrix.shape, "C")
        least_dist = work_matrix[least_id]

        # nodes corresponding to (row, col)
        node1, node2 = (nodes[i] for i in least_id)
        
        # add OTU with node1 and node2 as children. set heights of nodes
        new_node = Node.from_height(node2, node1, least_dist / 2)
        nodes.remove(node1)
        nodes.remove(node2)
        nodes.append(new_node)
       
        # create new working distance matrix:
        # 1) adding mean of distances as distance to the new node at the bottom row
        row1, row2 = work_matrix[least_id, :]
        len1 = len(node1)
        len2 = len(node2)
        distances_update = ((row1 * len1) + (row2 * len2)) / (len1 + len2)
        work_matrix = np.vstack((work_matrix, distances_update.reshape(1, -1)))
        # 2) adding mean of distances as distance to the new node at the last column
        distances_update = np.vstack((distances_update.reshape(-1, 1), [np.inf]))
        work_matrix = np.hstack((work_matrix, distances_update))
        # 3) removing rows and columns of the selected nodes
        work_matrix = np.delete(np.delete(work_matrix, least_id, axis=0), least_id, axis=1)

    return nodes[-1]


if __name__ == "__main__":
    # data from table 3 of Fitch and Margoliash, Construction of Phylogenetic trees
    taxa = ["Turtle", "Human", "Tuna", "Chicken", "Moth", "Monkey", "Dog"]
    distances = np.array([
        [0, 19, 27, 8, 33, 18, 13],
        [19, 0, 31, 18, 36, 1, 13],
        [27, 31, 0, 26, 41, 32, 29],
        [8, 18, 26, 0, 31, 17, 14],
        [33, 36, 41, 31, 0, 35, 28],
        [18, 1, 32, 17, 35, 0, 12],
        [13, 13, 29, 14, 28, 12, 0],
    ])
    print(upgma_tree(distances, taxa))

`numpy.ma`

Masked array may be a viable alternative to writing over existing data when you need to ignore part of your data (such as the diagonal of the distance matrix in your case). I chose not to use it here as adding data for new OTU over and over may lead to a very large matrix eventually; but they're an honorable mention nonetheless.

Stack Exchange Network

UPGMA tree building in python

2 Answers 2

docstring

kwarg defaults

length

type stability

conventional loop names

cite a reference

Datastructure

Algorithm

Proposed improvements

Numpy

`numpy.ma`

You must log in to answer this question.

Hot Network Questions

2 Answers 2

docstring

kwarg defaults

length

type stability

conventional loop names

cite a reference

Datastructure

Algorithm

Proposed improvements

Numpy

You must log in to answer this question.

Related