Skip to content

Commit 0110bd3

Browse files
Initial commit
0 parents  commit 0110bd3

File tree

4 files changed

+509
-0
lines changed

4 files changed

+509
-0
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
Text diff storage for PostgreSQL
2+
--------------------------------
3+
4+
Wiki engines usually store the full text of every revision of every page in the
5+
database. This is convenient, but a massive waste of space. The goal of this
6+
project is to store text diffs in a PostgreSQL database, and use triggers
7+
and views to create a virtual table that contains the full text of every
8+
revision of every page.
9+
10+
Yes, it's slower. But sometimes, you need or want to optimize for storage
11+
space instead of time.

apply.sql

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
-- update_page: store a new revision of a page.
--
-- Diffs the page's current content (its page_latest row) against new_content
-- line by line, archives the outgoing revision in page_diff with its hunks in
-- page_diff_hunk, then overwrites the page_latest row with the new content.
--
-- Parameters:
--   page_id      id of the page_latest row to update
--   new_content  full text of the new revision
--   editor_id    editor recorded on the new revision
--   new_comment  edit comment recorded on the new revision (default '')
--   context_len  maximum number of unchanged lines kept before and after each
--                run of changes (default 3); negative values are treated as 0
--
-- Returns the new revision number.
-- Raises an exception when no page_latest row has the given id.
-- Declared STRICT: a NULL argument makes the call return NULL without running.
--
-- Hunk format (page_diff_hunk.content): lines joined with newlines, each
-- prefixed with ' ' (unchanged context), '-' (only in the old content) or
-- '+' (only in the new content).
--   start          1-based position in the OLD content of the hunk's first line
--   lines_context  number of ' ' lines in content
--   lines_deleted  number of '-' lines in content
--   lines_added    number of '+' lines in content
CREATE OR REPLACE FUNCTION update_page(page_id int, new_content text,
    editor_id int, new_comment text DEFAULT '', context_len int DEFAULT 3) RETURNS int AS $$
DECLARE
    latest page_latest;
    new_revision int;
    max_context int := greatest(context_len, 0);
    hunk text[];
    context text[]; -- pending unchanged lines, not yet part of any hunk
    in_hunk boolean := FALSE;
    hunk_start int := 1;
    hunk_lines_added int := 0;
    hunk_lines_deleted int := 0;
    hunk_lines_context int := 0;
    ary1 text[];    -- lines of the old content
    ary2 text[];    -- lines of the new content
    LCS text[];     -- longest common subsequence of ary1 and ary2
    line1 text;
    line2 text;
    lineLCS text;
    ptr1 int := 1;
    ptr2 int := 1;
    ptrLCS int := 1;
BEGIN
    SELECT * INTO latest FROM page_latest WHERE id = page_id;
    IF NOT FOUND THEN
        RAISE EXCEPTION 'Page % not found', page_id;
    END IF;
    new_revision := latest.revision + 1;
    RAISE NOTICE 'new revision: %', new_revision;

    -- archive the metadata of the revision that is about to be replaced
    INSERT INTO page_diff (page_id, revision, editor, comment)
    VALUES (page_id, latest.revision, latest.editor, latest.comment);

    -- make hunks
    ary1 := string_to_array(latest.content, E'\n');
    ary2 := string_to_array(new_content, E'\n');
    RAISE NOTICE 'About to determine longest common substring';
    LCS := lcs(ary1, ary2);
    RAISE NOTICE 'Longest common substring determined';
    -- Subscripting past the end of an array (or into a NULL array) yields
    -- NULL, which is what marks "exhausted" for each of the three cursors.
    line1 := ary1[ptr1];
    line2 := ary2[ptr2];
    lineLCS := LCS[ptrLCS];

    LOOP
        IF line1 IS NULL AND line2 IS NULL AND lineLCS IS NULL THEN
            -- all three sequences are exhausted: we're done
            IF in_hunk THEN
                IF array_length(context, 1) IS NOT NULL THEN
                    -- unchanged lines seen since the last change become
                    -- trailing context of the final hunk
                    hunk := array_cat(hunk, context);
                    hunk_lines_context := hunk_lines_context + array_length(context, 1);
                END IF;
                -- write out the last hunk
                INSERT INTO page_diff_hunk (page_id, revision, start,
                    content, lines_added, lines_deleted, lines_context)
                VALUES
                    (page_id, latest.revision, hunk_start, array_to_string(hunk, E'\n'),
                     hunk_lines_added, hunk_lines_deleted, hunk_lines_context);
            END IF;
            -- update the page_latest object
            UPDATE page_latest SET content = new_content, revision = new_revision,
                num_lines = array_length(ary2, 1), comment = new_comment,
                editor = editor_id, edited_on = now()
            WHERE id = page_id;
            RETURN new_revision;
        END IF;

        -- handle a line common to both versions
        IF line1 = lineLCS AND line2 = lineLCS THEN
            RAISE NOTICE 'equal lines: %', lineLCS;
            IF NOT in_hunk THEN
                -- sliding window holding the last max_context unchanged lines;
                -- they become leading context if a change follows
                context := array_append(context, ' ' || lineLCS);
                IF array_length(context, 1) > max_context THEN
                    context := context[2:max_context + 1];
                END IF;
            ELSE
                -- buffer as trailing context of the open hunk
                IF max_context > 0 THEN
                    context := array_append(context, ' ' || lineLCS);
                END IF;
                -- are we done with this hunk?
                IF coalesce(array_length(context, 1), 0) >= max_context THEN
                    -- the trailing context belongs to the hunk being closed
                    hunk := array_cat(hunk, context);
                    hunk_lines_context := hunk_lines_context
                        + coalesce(array_length(context, 1), 0);
                    -- write out the hunk
                    INSERT INTO page_diff_hunk (page_id, revision, start,
                        content, lines_added, lines_deleted, lines_context)
                    VALUES
                        (page_id, latest.revision, hunk_start, array_to_string(hunk, E'\n'),
                         hunk_lines_added, hunk_lines_deleted, hunk_lines_context);
                    -- and reset
                    hunk := array[]::text[];
                    context := array[]::text[];
                    in_hunk := FALSE;
                    hunk_lines_added := 0;
                    hunk_lines_deleted := 0;
                    hunk_lines_context := 0;
                END IF;
            END IF;
            ptr1 := ptr1 + 1;
            ptr2 := ptr2 + 1;
            ptrLCS := ptrLCS + 1;
            line1 := ary1[ptr1];
            line2 := ary2[ptr2];
            lineLCS := LCS[ptrLCS];
            CONTINUE; -- skip the rest of the loop body and go on
        END IF;

        -- from here on the current position is a change
        IF NOT in_hunk THEN
            -- start a new hunk, seeded with the buffered leading context
            hunk := context;
            in_hunk := TRUE;
            hunk_lines_context := coalesce(array_length(context, 1), 0);
            -- the leading context lines sit immediately before ptr1
            hunk_start := ptr1 - hunk_lines_context;
        ELSIF array_length(context, 1) IS NOT NULL THEN
            -- unchanged lines between two nearby changes stay inside the hunk
            hunk := array_cat(hunk, context);
            hunk_lines_context := hunk_lines_context + array_length(context, 1);
        END IF;
        context := array[]::text[];

        -- done with context; handle deletion and addition
        IF line1 IS NOT NULL AND (line1 != lineLCS OR lineLCS IS NULL) THEN
            -- must have been deleted
            hunk := array_append(hunk, '-' || line1);
            ptr1 := ptr1 + 1;
            line1 := ary1[ptr1];
            hunk_lines_deleted := hunk_lines_deleted + 1;
            CONTINUE;
        END IF;
        IF line2 IS NOT NULL AND (line2 != lineLCS OR lineLCS IS NULL) THEN
            -- must have been added
            hunk := array_append(hunk, '+' || line2);
            ptr2 := ptr2 + 1;
            line2 := ary2[ptr2];
            hunk_lines_added := hunk_lines_added + 1;
            CONTINUE;
        END IF;

        -- No cursor advanced. This can only happen when LCS is not a common
        -- subsequence of both inputs; fail loudly instead of looping forever.
        RAISE EXCEPTION 'update_page: diff walk stalled at old line %, new line %',
            ptr1, ptr2;
    END LOOP;
END;
$$ LANGUAGE plpgsql
VOLATILE STRICT;
146+

diffing.sql

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
-- author: David Baumgold <david@davidbaumgold.com>
2+
3+
-- longest common subsequence
4+
-- lcs: longest common subsequence of two text arrays.
--
-- Both arrays are expected to be one-dimensional with lower bound 1 (as
-- produced by string_to_array).
--
-- Returns the common elements in order, or NULL (never an empty array) when
-- either input is empty or the inputs have no element in common.
-- NULL elements never compare equal, so they are never part of the result.
-- Declared STRICT: a NULL argument makes the call return NULL.
--
-- Implementation: standard dynamic-programming table of prefix LCS lengths,
-- O(len1 * len2) time and memory, followed by a backtrack from the end.
-- When dropping the last element of either input gives the same length, the
-- last element of the second array is the one dropped.
CREATE OR REPLACE FUNCTION lcs(text[], text[]) RETURNS text[] AS $$
DECLARE
    len1 int;
    len2 int;
    width int;      -- row stride of the flattened table (len2 + 1)
    lengths int[];  -- lengths[r * width + c + 1] = LCS length of $1[1:r] and $2[1:c]
    best int;
    r int;
    c int;
    result text[];  -- stays NULL when nothing is common
BEGIN
    len1 := array_length($1, 1);
    len2 := array_length($2, 1);
    IF len1 IS NULL OR len2 IS NULL THEN
        RETURN NULL;
    END IF;

    width := len2 + 1;
    -- row 0 and column 0 (empty prefixes) stay at 0
    lengths := array_fill(0, ARRAY[(len1 + 1) * width]);

    FOR i IN 1..len1 LOOP
        FOR j IN 1..len2 LOOP
            IF $1[i] = $2[j] THEN
                -- extend the LCS of both shorter prefixes: cell (i-1, j-1)
                best := lengths[(i - 1) * width + j] + 1;
            ELSE
                -- best of cell (i-1, j) and cell (i, j-1)
                best := greatest(lengths[(i - 1) * width + j + 1],
                                 lengths[i * width + j]);
            END IF;
            lengths[i * width + j + 1] := best;
        END LOOP;
    END LOOP;

    -- walk back from the full inputs, collecting matches last-to-first
    r := len1;
    c := len2;
    WHILE r > 0 AND c > 0 LOOP
        IF $1[r] = $2[c] THEN
            result := array_prepend($1[r], result);
            r := r - 1;
            c := c - 1;
        ELSIF lengths[(r - 1) * width + c + 1] > lengths[r * width + c] THEN
            r := r - 1;
        ELSE
            c := c - 1;
        END IF;
    END LOOP;

    RETURN result;
END;
$$ LANGUAGE plpgsql
IMMUTABLE STRICT;
40+
41+
-- generate diffs between text
42+
-- create_diff: line-by-line diff of two texts.
--
-- $1 is the old text, $2 the new text. Both are split on newlines and walked
-- in step with their longest common subsequence (see lcs()). The result is
-- the output lines joined with newlines, each prefixed with:
--   ' '  line present in both texts
--   '-'  line only in the old text
--   '+'  line only in the new text
-- Returns NULL when no line is produced (both texts empty).
CREATE OR REPLACE FUNCTION create_diff(text, text) RETURNS text AS $$
DECLARE
    old_lines text[] := string_to_array($1, E'\n');
    new_lines text[] := string_to_array($2, E'\n');
    common_lines text[];
    old_pos int := 1;
    new_pos int := 1;
    common_pos int := 1;
    old_line text;
    new_line text;
    common_line text;
    diff_lines text[];
BEGIN
    common_lines := lcs(old_lines, new_lines);

    LOOP
        -- reading past the end of an array (or from a NULL array) gives NULL
        old_line := old_lines[old_pos];
        new_line := new_lines[new_pos];
        common_line := common_lines[common_pos];

        EXIT WHEN old_line IS NULL AND new_line IS NULL AND common_line IS NULL;

        IF old_line = common_line AND new_line = common_line THEN
            -- unchanged line: advance all three cursors
            diff_lines := array_append(diff_lines, ' ' || common_line);
            old_pos := old_pos + 1;
            new_pos := new_pos + 1;
            common_pos := common_pos + 1;
        ELSIF old_line IS NOT NULL AND old_line IS DISTINCT FROM common_line THEN
            -- must have been deleted
            diff_lines := array_append(diff_lines, '-' || old_line);
            old_pos := old_pos + 1;
        ELSIF new_line IS NOT NULL AND new_line IS DISTINCT FROM common_line THEN
            -- must have been added
            diff_lines := array_append(diff_lines, '+' || new_line);
            new_pos := new_pos + 1;
        END IF;
    END LOOP;

    RETURN array_to_string(diff_lines, E'\n');
END;
$$ LANGUAGE plpgsql
IMMUTABLE STRICT;
93+
94+
95+
96+
-- only for reference (unused)
97+
-- scratch table used by the reference functions below
CREATE TABLE thingy (
    id   int,
    text text
);

-- two seed rows
INSERT INTO thingy (id, text)
VALUES
    (1, 'abc'),
    (2, 'def');
102+
-- get_things: returns two hard-coded thingy rows, (3, 'foo') then (4, 'bar').
-- It does not read the thingy table; only the table's row type is used.
CREATE OR REPLACE FUNCTION get_things() RETURNS SETOF thingy AS $$
DECLARE
    item thingy;
BEGIN
    item := (3, 'foo');
    RETURN NEXT item;

    item := (4, 'bar');
    RETURN NEXT item;
END;
$$ LANGUAGE plpgsql;
113+
114+
-- process_things: collects the id of every row in the given thingy array,
-- in array order. Returns NULL when the array has no elements.
CREATE OR REPLACE FUNCTION process_things(things thingy[]) RETURNS int[] AS $$
DECLARE
    current_thing thingy;
    ids int[];
BEGIN
    FOREACH current_thing IN ARRAY things LOOP
        ids := array_append(ids, current_thing.id);
    END LOOP;

    RETURN ids;
END;
$$ LANGUAGE plpgsql;
125+
-- demo: feed the set returned by get_things() to process_things() as an array
SELECT process_things(
    ARRAY(SELECT get_things())
); -- works
126+
127+

0 commit comments

Comments
 (0)