I wrote a code to create a regression tree for a synthetic train data of size Np. The idea is, first I have the source node (which consists of all set of points) represented as a dictionary {'points':..., 'avg':..., 'left_node':..., 'right_node', 'split_point': }. The left and right nodes are the leafs after the splitting process of the whole data (source). split_point is for information about the best split. Then I loop to get deeper tree with maximum number of nodes specified before, also I set that a node must have more than 5 points in order it can be split.
This way, If I want to predict a point (x',y'), I can just start from source node source and check which region the point lies (left_node or right_node), ..and then continuing down the tree. Because all left_nodes and right_nodes values have the same structure as source....
Also, the form function is used to find the best split, the best split is the one with the smallest form(reg_1, avg1, reg_2, avg2). This is a greedy algorithm to find the best split.
I would like to know better ways to perform it..without external modules. But this is intended to be taught to high school students.
Full code:
import random
import matplotlib.pyplot as plt
def form(region_1, av1, region_2, av2):
return sum([(i[1]-av1)**2 for i in region_1]) \
+ sum([(i[1]-av2)**2 for i in region_2])
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
value = [abs(random.gauss(4, 0.5)) for i in range(Np)]
data = [((i,j), k) for i,j,k in zip(x_data, y_data, value)]
fig, ax = plt.subplots()
ax.plot(x_data, y_data, 'o')
fig.show()
###### Splitting from the source node (all data)
source = {'points': data, 'avg': sum([i[1] for i in data])/Np, \
'split_point': None, 'left_node': None, 'right_node': None}
forms = []
for x in x_data:
var = x
region_1 = [j for j in data if j[0][0] <= var]
region_2 = [j for j in data if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('x', var), f) )
for y in y_data:
var = y
region_1 = [j for j in data if j[0][1] <= var]
region_2 = [j for j in data if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('y', var), f) )
sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
source['split_point'] = best_split[2]
source['left_node'] = best_split[0]
source['right_node'] = best_split[1]
##### Splitting from the 2 leafs and so on..
leafs = [source['left_node'], source['right_node']]
all_nodes = [leafs[0], leafs[1]]
max_nodes = 1000
while len(all_nodes) <= max_nodes:
next_leafs = []
for leaf in leafs:
if (len(leaf['points']) > 5):
xx = [i[0][0] for i in leaf['points']]
yy = [i[0][1] for i in leaf['points']]
rr = [i[1] for i in leaf['points']]
vv = [((i,j), k) for i,j,k in zip(xx, yy, rr)]
forms = []
for x in xx:
var = x
region_1 = [j for j in vv if j[0][0] <= var]
region_2 = [j for j in vv if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('x', var), f) )
for y in yy:
var = y
region_1 = [j for j in vv if j[0][1] <= var]
region_2 = [j for j in vv if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('y', var), f) )
sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
leaf['split_point'] = best_split[2]
leaf['left_node'] = best_split[0]
leaf['right_node'] = best_split[1]
print(leaf['split_point'])
next_leafs.append(leaf['left_node'])
next_leafs.append(leaf['right_node'])
print("\n")
leafs = next_leafs
all_nodes.extend(leafs)
if len(leafs) == 0:
break