Recursion with PostgreSQL Part 2 - Performances
This is a follow-up article on my previous blog post A Different Type of SQL Recursion with PostgreSQL.
Previously, I explored two different approaches to recursion with PostgreSQL.
Now, it's time to do some testing.
Testing
Instead of a view on system tables that enables us to traverse table relationship tree, I've decided to create a simple test table:
create table big_tree_test (
id text not null,
parent_id text not null
);
create table big_tree_test (
id text not null,
parent_id text not null
);
And then use a script to insert as many random values as I like:
do
language plpgsql
$$
declare
_max_rows int = 1000;
_max_group int = 25;
begin
drop table if exists big_tree_test;
create table big_tree_test (
id text not null,
parent_id text not null
);
create or replace function pg_temp.random_int(_min int, _max int) returns int language sql as
'select floor(random()* (_max - _min + 1) + _min)::int;';
insert into big_tree_test
select
'node_' || t1.id as id,
'node_' || t2.r as parent_id
from (
select id, pg_temp.random_int(1, _max_group) r
from generate_series(1, _max_rows) id
) t1
cross join lateral (
select distinct pg_temp.random_int(1, _max_rows) r
from generate_series(1, t1.r)
where r <> t1.id
) t2;
end;
$$;
do
language plpgsql
$$
declare
_max_rows int = 1000;
_max_group int = 25;
begin
drop table if exists big_tree_test;
create table big_tree_test (
id text not null,
parent_id text not null
);
create or replace function pg_temp.random_int(_min int, _max int) returns int language sql as
'select floor(random()* (_max - _min + 1) + _min)::int;';
insert into big_tree_test
select
'node_' || t1.id as id,
'node_' || t2.r as parent_id
from (
select id, pg_temp.random_int(1, _max_group) r
from generate_series(1, _max_rows) id
) t1
cross join lateral (
select distinct pg_temp.random_int(1, _max_rows) r
from generate_series(1, t1.r)
where r <> t1.id
) t2;
end;
$$;
This script will insert text nodes in groups. There are two parameters:
_max_rows
- how many id rows will be generated_max_groups
- any id can repeat to point to different parents how many times? Minimal 1, maximum this number.
For _max_rows = 10
and _max_groups = 3
, this script will generate the following test set:
id | parent_id |
---|---|
node_1 | node_3 |
node_1 | node_8 |
node_4 | node_8 |
node_5 | node_7 |
node_6 | node_4 |
node_6 | node_6 |
node_6 | node_9 |
node_7 | node_10 |
node_8 | node_10 |
node_8 | node_5 |
node_8 | node_9 |
node_9 | node_9 |
node_10 | node_1 |
So, based on a manual calculation, this is the result we need to get for processing a node_1
:
id | parent_id | level |
---|---|---|
node_1 | node_3 | 1 |
node_1 | node_8 | 1 |
node_8 | node_10 | 2 |
node_8 | node_5 | 2 |
node_8 | node_9 | 2 |
node_10 | node_1 | 3 |
node_5 | node_7 | 3 |
node_9 | node_9 | 3 |
node_7 | node_10 | 4 |
Finally, we can create new functions to work with table big_tree_test
:
- The version that uses recursive CTE:
create or replace function select_nodes_recursive_cte(
_id text
)
returns table (
id text,
parent_id text,
level int
)
language sql
as
$$
with recursive _recursive_cte as (
select
id,
parent_id
from
big_tree_test
where
id = _id
union
select
rec.parent_id as id,
t.parent_id
from
_recursive_cte rec
inner join big_tree_test t on rec.parent_id = t.id
), _cte as (
select
id,
parent_id,
row_number() over(),
case
when lag(id) over() = id
then 0
else 1
end as island_flips
from
_recursive_cte
)
select
id,
parent_id,
sum(island_flips) over (order by row_number) as level
from
_cte
order by
row_number
$$;
create or replace function select_nodes_recursive_cte(
_id text
)
returns table (
id text,
parent_id text,
level int
)
language sql
as
$$
with recursive _recursive_cte as (
select
id,
parent_id
from
big_tree_test
where
id = _id
union
select
rec.parent_id as id,
t.parent_id
from
_recursive_cte rec
inner join big_tree_test t on rec.parent_id = t.id
), _cte as (
select
id,
parent_id,
row_number() over(),
case
when lag(id) over() = id
then 0
else 1
end as island_flips
from
_recursive_cte
)
select
id,
parent_id,
sum(island_flips) over (order by row_number) as level
from
_cte
order by
row_number
$$;
- The version that uses recursive function call:
create or replace function select_nodes_recursive_function(
_id text,
_level int = 1
)
returns table (
id text,
parent_id text,
level int
)
language plpgsql
as
$$
declare
_row record;
begin
create temp table if not exists _result (
id text,
parent_id text,
level int
) on commit drop;
if exists(
select 1 from _result t where t.id = _id
) then
return;
end if;
for _row in (
select
t.id,
t.parent_id
from
big_tree_test t
where
t.id = _id
) loop
insert into _result
select
_row.id,
_row.parent_id,
_level;
perform select_nodes_recursive_function(
_row.parent_id,
_level + 1
);
end loop;
return query
select
t.id,
t.parent_id,
t.level
from
_result t;
end;
$$;
create or replace function select_nodes_recursive_function(
_id text,
_level int = 1
)
returns table (
id text,
parent_id text,
level int
)
language plpgsql
as
$$
declare
_row record;
begin
create temp table if not exists _result (
id text,
parent_id text,
level int
) on commit drop;
if exists(
select 1 from _result t where t.id = _id
) then
return;
end if;
for _row in (
select
t.id,
t.parent_id
from
big_tree_test t
where
t.id = _id
) loop
insert into _result
select
_row.id,
_row.parent_id,
_level;
perform select_nodes_recursive_function(
_row.parent_id,
_level + 1
);
end loop;
return query
select
t.id,
t.parent_id,
t.level
from
_result t;
end;
$$;
So, we're all set. Let's go.
Problem
Right from the start, while doing smoke tests, I noticed something very wrong. The function select_nodes_recursive_cte
doesn't return the expected results at all.
For a small test set above, this is what it returns:
id | parent_id | level |
---|---|---|
node_1 | node_3 | 1 |
node_1 | node_8 | 1 |
node_8 | node_10 | 2 |
node_8 | node_5 | 2 |
node_8 | node_9 | 2 |
node_5 | node_7 | 3 |
node_10 | node_1 | 4 |
node_9 | node_9 | 5 |
node_7 | node_10 | 6 |
Nodes node_5
, node_10
, and node_9
are all level 3, not 3, 4 and 6, and node_7
is level 4, not 6.
After some investigation, my strategy of "islands and gaps" (see previous article) is completely wrong!
Implementation was fine; the fact is that every change in id
row doesn't mean automatically that we have the new recursion level. At all.
And after some more investigation, it appears that the only way to implement the recursion level calculation is to add level counter in CTE union queries.
However, there is a big problem with this approach:
There is no way to know which node was already processed in recursive CTE to be able to skip it entirely, and:
- If we have a circular reference, one node points to another, and that node points to the first node - we will have an infinite loop. It's not even a stack overflow, just an infinite loop.
- The only way to exit this infinite loop is to set a hard limit on recursion depth.
So, my final version looks like this now:
create or replace function select_nodes_recursive_cte(
_id text,
_max_recursion int = 100
)
returns table (
id text,
parent_id text,
level int
)
language sql
as
$$
with recursive _recursive_cte as (
select
id,
parent_id,
1 as level
from
big_tree_test
where
id = _id
union
select
rec.parent_id as id,
t.parent_id,
rec.level + 1 as level
from
_recursive_cte rec
inner join big_tree_test t on rec.parent_id = t.id
where
level < _max_recursion
)
select r.id, r.parent_id, r.level
from _recursive_cte r
$$;
create or replace function select_nodes_recursive_cte(
_id text,
_max_recursion int = 100
)
returns table (
id text,
parent_id text,
level int
)
language sql
as
$$
with recursive _recursive_cte as (
select
id,
parent_id,
1 as level
from
big_tree_test
where
id = _id
union
select
rec.parent_id as id,
t.parent_id,
rec.level + 1 as level
from
_recursive_cte rec
inner join big_tree_test t on rec.parent_id = t.id
where
level < _max_recursion
)
select r.id, r.parent_id, r.level
from _recursive_cte r
$$;
But this is a very bad solution, and here is why:
We don't know which recursion depth is appropriate. And increasing the recursion threshold will only slow this function dramatically. We will still not know if we fetched all levels or if the recursive CTE ran in circles when it encountered the circular reference.
Let's see some tests - a test table with 1290 records and 250 nodes:
- Recursion depth 100 - 00.330 seconds.
- Recursion depth 1000 - 03.531 seconds.
- Recursion depth 10000 - 36.228 seconds.
So that's no good. If anyone has a better implementation with recursion level (depth) implemented, I'd like to see it.
The Solution
Let's try our procedural approach to this problem to see how it performs. Same test table: - 1290 records and 250 nodes - 00.783 seconds.
It's not bad at all, but let's try a bit bigger dataset:
- 3987 records with 500 nodes - 04.291 seconds
- 13232 records with 999 nodes -
ERROR: stack depth limit exceeded
So that's no good.
Let's see if we can optimize this function a bit.
Optimization 1
The first thing I noticed was that the message log was full of NOTICE: relation "_result" already exists, skipping˙
messages.
I'm not even sure if this is even an optimization, but I first would like to get rid of these messages.
So, instead of this:
create temp table if not exists _result (
id text,
parent_id text,
level int
) on commit drop;
create temp table if not exists _result (
id text,
parent_id text,
level int
) on commit drop;
We will have this:
if not exists(
select 1 from information_schema.tables t
where t.table_name = '_result' and table_type = 'LOCAL TEMPORARY'
) then
create temp table _result (
id text,
parent_id text,
level int
) on commit drop;
end if;
if not exists(
select 1 from information_schema.tables t
where t.table_name = '_result' and table_type = 'LOCAL TEMPORARY'
) then
create temp table _result (
id text,
parent_id text,
level int
) on commit drop;
end if;
The second thing I would try is to get rid of that loop. Looping is an anti-pattern with SQL, anyhow.
Instead, this is what I came up with:
- insert into the temp table and for
id
parameter and aggregateparent_id
values in an array variable. - call recursion for each distinct value from that array variable.
This is how that looks like:
with _insert_cte as (
insert into _result
select
t.id,
t.parent_id,
_level
from
big_tree_test t
where
t.id = _id
returning
_result.parent_id
)
select array_agg(distinct p.parent_id)
into _parents
from _insert_cte p;
perform select_nodes_recursive_function(p, _level + 1)
from unnest(_parents) p;
with _insert_cte as (
insert into _result
select
t.id,
t.parent_id,
_level
from
big_tree_test t
where
t.id = _id
returning
_result.parent_id
)
select array_agg(distinct p.parent_id)
into _parents
from _insert_cte p;
perform select_nodes_recursive_function(p, _level + 1)
from unnest(_parents) p;
Now, let's do performance things again:
- 13232 records with 999 nodes - 07.338 seconds.
Not bad, but let's see if we can do even better.
Optimization 2
The idea is to reduce the number of recursion calls as much as possible. Currently, we have this:
perform select_nodes_recursive_function(p, _level + 1)
from unnest(_parents) p;
perform select_nodes_recursive_function(p, _level + 1)
from unnest(_parents) p;
This executes the select_nodes_recursive_function
function one for each element in _parents
array variable. Instead, we could change the input parameter type so that it receives an entire array instead of a single value.
Great idea, let's do it!
Oh, wait, but to have that work as intended, we need to slightly modify the condition we use to exit the recursion in two steps:
- Remove for the array
id
parameter all IDs that are already processed (we don't want them). - Check is array empty, and exit if it is.
_id = array(select unnest(_id) except select distinct t.id from _result t);
if _id = '{}' then
return;
end if;
_id = array(select unnest(_id) except select distinct t.id from _result t);
if _id = '{}' then
return;
end if;
Also, we need to use any
operator when filtering our table for the insert:
insert into _result
select
t.id,
t.parent_id,
_level
from
big_tree_test t
where
t.id = any(_id)
returning
_result.parent_id
insert into _result
select
t.id,
t.parent_id,
_level
from
big_tree_test t
where
t.id = any(_id)
returning
_result.parent_id
This seems to produce satisfying results...
Now, let's do performance things again:
- 13232 records with 5000 nodes - 00.090 seconds.
Wait what?
Let's increase the test data set some more:
- 78146 records with 999 nodes - 00.450 seconds.
- 252296 records with 10000 nodes - 01.656 seconds.
- 757824 records with 19999 nodes - 05.372 seconds.
So it's almost 757K tree records with 20K unique nodes in 5 seconds, and I haven't even used any indexes yet. Not bad, not bad at all...
Finally, let's see how that magic function looks like
create or replace function select_nodes_recursive_function(
_id text[],
_level int = 1
)
returns table (
id text,
parent_id text,
level int
)
language plpgsql
as
$$
declare
_row record;
_parents text[];
begin
if not exists(
select 1 from information_schema.tables t
where t.table_name = '_result' and table_type = 'LOCAL TEMPORARY'
) then
create temp table _result (
id text,
parent_id text,
level int
) on commit drop;
end if;
_id = array(select unnest(_id) except select distinct t.id from _result t);
if _id = '{}' then
return;
end if;
with _insert_cte as (
insert into _result
select
t.id,
t.parent_id,
_level
from
big_tree_test t
where
t.id = any(_id)
returning
_result.parent_id
)
select array_agg(distinct p.parent_id)
into _parents
from _insert_cte p;
perform select_nodes_recursive_function(_parents, _level + 1);
return query
select
t.id,
t.parent_id,
t.level
from
_result t;
end;
$$;
create or replace function select_nodes_recursive_function(
_id text[],
_level int = 1
)
returns table (
id text,
parent_id text,
level int
)
language plpgsql
as
$$
declare
_row record;
_parents text[];
begin
if not exists(
select 1 from information_schema.tables t
where t.table_name = '_result' and table_type = 'LOCAL TEMPORARY'
) then
create temp table _result (
id text,
parent_id text,
level int
) on commit drop;
end if;
_id = array(select unnest(_id) except select distinct t.id from _result t);
if _id = '{}' then
return;
end if;
with _insert_cte as (
insert into _result
select
t.id,
t.parent_id,
_level
from
big_tree_test t
where
t.id = any(_id)
returning
_result.parent_id
)
select array_agg(distinct p.parent_id)
into _parents
from _insert_cte p;
perform select_nodes_recursive_function(_parents, _level + 1);
return query
select
t.id,
t.parent_id,
t.level
from
_result t;
end;
$$;
To invoke this function, pass the array parameter:
select *
from select_nodes_recursive_function(array['node_1'])
select *
from select_nodes_recursive_function(array['node_1'])
And that's it: fast tree parsing recursion for PostgreSQL.
Conclusion
This started as an exploration into the possibilities of parsing tree structures with PostgreSQL and has now grown into a series of articles.
My conclusion is that SQL is not that suitable for recursion. As we can see in this article, it has some severe limitations, mainly regarding recursion depth issues.
As far as I know, SQL was never even intended to do that in the first place.
On the other hand, the more procedural approach, optimized properly, can yield some spectacular results.
However, this approach may not be for everybody.
vb-software linkedin
You will receive notifications about new posts on your LinkedIn feed.