Manual Regression Tree using Python

Question

I wrote a code to create a regression tree for a synthetic train data of size Np. The idea is, first I have the source node (which consists of all set of points) represented as a dictionary {'points':..., 'avg':..., 'left_node':..., 'right_node', 'split_point': }. The left and right nodes are the leafs after the splitting process of the whole data (source). split_point is for information about the best split. Then I loop to get deeper tree with maximum number of nodes specified before, also I set that a node must have more than 5 points in order it can be split.

This way, If I want to predict a point (x',y'), I can just start from source node source and check which region the point lies (left_node or right_node), ..and then continuing down the tree. Because all left_nodes and right_nodes values have the same structure as source....

Also, the form function is used to find the best split, the best split is the one with the smallest form(reg_1, avg1, reg_2, avg2). This is a greedy algorithm to find the best split.

I would like to know better ways to perform it..without external modules. But this is intended to be taught to high school students.

Full code:

import random
import matplotlib.pyplot as plt


def form(region_1, av1, region_2, av2):
    return sum([(i[1]-av1)**2 for i in region_1]) \
           + sum([(i[1]-av2)**2 for i in region_2])

Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
value = [abs(random.gauss(4, 0.5)) for i in range(Np)]

data = [((i,j), k) for i,j,k in zip(x_data, y_data, value)]

fig, ax = plt.subplots()

ax.plot(x_data, y_data, 'o')

fig.show()


###### Splitting from the source node (all data)

source = {'points': data, 'avg': sum([i[1] for i in data])/Np, \
          'split_point': None, 'left_node': None, 'right_node': None}
forms = []

for x in x_data:
    var = x
    region_1 = [j for j in data if j[0][0] <= var]
    region_2 = [j for j in data if j not in region_1]

    if len(region_1) > 0 and len(region_2) > 0:

        av1 = sum([i[1] for i in region_1])/len(region_1)
        av2 = sum([i[1] for i in region_2])/len(region_2)

        f = form(region_1, av1, region_2, av2)
        leaf_1 = {'points': region_1, 'avg': av1}
        leaf_2 = {'points': region_2, 'avg': av2}
        forms.append( (leaf_1, leaf_2, ('x', var), f) )

for y in y_data:
    var = y
    region_1 = [j for j in data if j[0][1] <= var]
    region_2 = [j for j in data if j not in region_1]

    if len(region_1) > 0 and len(region_2) > 0:

        av1 = sum([i[1] for i in region_1])/len(region_1)
        av2 = sum([i[1] for i in region_2])/len(region_2)

        f = form(region_1, av1, region_2, av2)
        leaf_1 = {'points': region_1, 'avg': av1}
        leaf_2 = {'points': region_2, 'avg': av2}
        forms.append( (leaf_1, leaf_2, ('y', var), f) )


sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
source['split_point'] = best_split[2]
source['left_node'] = best_split[0]
source['right_node'] = best_split[1]


##### Splitting from the 2 leafs and so on..

leafs = [source['left_node'], source['right_node']]

all_nodes = [leafs[0], leafs[1]]

max_nodes = 1000

while len(all_nodes) <= max_nodes:
    next_leafs = []
    for leaf in leafs:
        if (len(leaf['points']) > 5):

            xx = [i[0][0] for i in leaf['points']]
            yy = [i[0][1] for i in leaf['points']]
            rr = [i[1] for i in leaf['points']]
            vv = [((i,j), k) for i,j,k in zip(xx, yy, rr)]
            forms = []

            for x in xx:
                var = x
                region_1 = [j for j in vv if j[0][0] <= var]
                region_2 = [j for j in vv if j not in region_1]

                if len(region_1) > 0 and len(region_2) > 0:

                    av1 = sum([i[1] for i in region_1])/len(region_1)
                    av2 = sum([i[1] for i in region_2])/len(region_2)

                    f = form(region_1, av1, region_2, av2)
                    leaf_1 = {'points': region_1, 'avg': av1}
                    leaf_2 = {'points': region_2, 'avg': av2}
                    forms.append( (leaf_1, leaf_2, ('x', var), f) )

            for y in yy:
                var = y
                region_1 = [j for j in vv if j[0][1] <= var]
                region_2 = [j for j in vv if j not in region_1]

                if len(region_1) > 0 and len(region_2) > 0:

                    av1 = sum([i[1] for i in region_1])/len(region_1)
                    av2 = sum([i[1] for i in region_2])/len(region_2)

                    f = form(region_1, av1, region_2, av2)
                    leaf_1 = {'points': region_1, 'avg': av1}
                    leaf_2 = {'points': region_2, 'avg': av2}
                    forms.append( (leaf_1, leaf_2, ('y', var), f) )

            sorted_f = sorted(forms, key = lambda x: x[3])
            best_split = sorted_f[0]
            leaf['split_point'] = best_split[2]
            leaf['left_node'] = best_split[0]
            leaf['right_node'] = best_split[1]

            print(leaf['split_point'])

            next_leafs.append(leaf['left_node'])
            next_leafs.append(leaf['right_node'])

            print("\n")

    leafs = next_leafs
    all_nodes.extend(leafs)
    if len(leafs) == 0:
        break

Redsbefall · Accepted Answer · 2019-07-03 04:42:41Z

Here is my updated version, It looks simpler and better by creating a class, Node.

import random


def formula(region_1, av1, region_2, av2):
    return sum([(i[1]-av1)**2 for i in region_1]) \
           + sum([(i[1]-av2)**2 for i in region_2])

def average(data):
    return sum([d[2] for d in data])/len(data)

Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
z_data = [abs(random.gauss(4, 0.5)) for i in range(Np)]


class Node:
    def __init__(self, x_data, y_data, z_data):
        self.x_data = x_data
        self.y_data = y_data
        self.z_data = z_data
        self.points = [(i, j, k) for i, j, k in zip(x_data, y_data, z_data)]
        self.avg = average(self.points)

    def split(self):
        #Finding the best split:
        candidates = []
        for x in self.x_data:
            split_point = x
            region_1 = [i for i in self.points if i[0] <= split_point]
            region_2 = [i for i in self.points if i not in region_1]
            if (region_1 != []) and (region_2 != []):
                leaf_1 = Node([i[0] for i in region_1], \
                         [i[1] for i in region_1], \
                         [i[2] for i in region_1])
                leaf_2 = Node([i[0] for i in region_2], \
                         [i[1] for i in region_2], \
                         [i[2] for i in region_2])
                f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
                candidates.append( (leaf_1, leaf_2, ('x', split_point), f) )
        for y in self.y_data:
            split_point = y
            region_1 = [i for i in self.points if i[1] <= split_point]
            region_2 = [i for i in self.points if i not in region_1]
            if (region_1 != []) and (region_2 != []):
                leaf_1 = Node([i[0] for i in region_1], \
                         [i[1] for i in region_1], \
                         [i[2] for i in region_1])
                leaf_2 = Node([i[0] for i in region_2], \
                         [i[1] for i in region_2], \
                         [i[2] for i in region_2])
                f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
                candidates.append( (leaf_1, leaf_2, ('y', split_point), f) )

        sorted_f = sorted(candidates, key = lambda x: x[3])
        best_split = sorted_f[0]

        #The result:
        self.split_point = best_split[2]
        self.left_node = best_split[0]
        self.right_node = best_split[1]


#Source node and 1st split
source = Node(x_data, y_data, z_data)
source.split()

#Generate Binary Tree
result_nodes = [source.left_node, source.right_node]
all_nodes = [source.left_node, source.right_node]

min_nodes = 1000
min_points = 5

while len(all_nodes) <= min_nodes:
    next_nodes = []
    for node in result_nodes:
        if (len(node.points) > min_points):
            node.split()
            next_nodes.append(node.left_node)
            next_nodes.append(node.right_node)
    result_nodes = next_nodes
    all_nodes.extend(result_nodes)
    if len(result_nodes) == 0:
        break

Stack Exchange Network

Manual Regression Tree using Python

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Manual Regression Tree using Python

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions