Generate nested array with OPENJSON very slow

Question

I've just started using OPENJSON with SQL Server 2016 SP1.

I have this sentence:

select c.Serial as Parent,
    (Select co.Serial, agc.Position
      from AggregationChildren agc, Aggregation ag, Code co
      where agc.AggregationId = a.AggregationId 
      and co.CodeId = agc.AggregationChildrenId for json path) as children
    from Aggregation a, Code c
    where c.CodeId = a.AggregationId for json path

To generate this JSON:

{"Parent":"4244287599479491","Children":
[{"Serial":"8915753733724633","Position":"1"},
{"Serial":"1247782815710855","Position":"2"},
...]}

But it is very very slow.

My problem is with the Children array because I don't know how to get it.

Is there a way to do it faster?

These are the tables:

CREATE TABLE [dbo].[Code] (
    [CodeId]            INT            IDENTITY (1, 1) NOT NULL,
    [Serial]            NVARCHAR (20)  NOT NULL,
    [ ... ],
    CONSTRAINT [PK_CODE] PRIMARY KEY CLUSTERED ([CodeId] ASC),
    [ ... ]
)

CREATE TABLE [dbo].[Aggregation] (
    [AggregationId] INT           NOT NULL,
    [ ... ], 
    CONSTRAINT [PK_AGGREGATIONS] PRIMARY KEY CLUSTERED ([AggregationId] ASC),
    CONSTRAINT [FK_Aggregation_Code]
           FOREIGN KEY ([AggregationId])
            REFERENCES [dbo].[Code] ([CodeId])
)

CREATE TABLE [dbo].[AggregationChildren] (
    [AggregationChildrenId] INT NOT NULL,
    [AggregationId]         INT NOT NULL,
    [Position]              INT NOT NULL,
    CONSTRAINT [PK_AGGREGATION_CHILDS] PRIMARY KEY CLUSTERED ([AggregationChildrenId] ASC),
    CONSTRAINT [FK_AggregationChildren_Code]
           FOREIGN KEY ([AggregationChildrenId])
            REFERENCES [dbo].[Code] ([CodeId]),
    CONSTRAINT [FK_AggregationChildren_Aggregation]
           FOREIGN KEY ([AggregationId])
            REFERENCES [dbo].[Aggregation] ([AggregationId]) ON DELETE CASCADE
)

The Serial column is an nvarchar(20) since the values may be any combination of alphanumerics, even though my example only shows numerics.

Hannah Vernon · Accepted Answer · 2018-03-08 21:35:26Z

I had a hard time parsing your query, however I believe this returns the same result, and is much faster:

SELECT Parent = c.Serial
    , Children = (
        SELECT c.Serial 
            , cac.Position
        FROM dbo.Code cc
            INNER JOIN dbo.AggregationChildren cac ON cac.AggregationChildrenId = cc.CodeId
        WHERE cac.AggregationId = a.AggregationId
        FOR JSON PATH 
    )
FROM dbo.Code c
    INNER JOIN dbo.Aggregation a ON c.CodeId = a.AggregationId
FOR JSON PATH;

The plan for the above query looks like:

The plan for your query looks like:

If we add the following index, we can make the first variant even faster:

CREATE NONCLUSTERED INDEX IX_AggregationChildren_IX0
ON dbo.AggregationChildren (AggregationId)
INCLUDE (AggregationChildrenId,Position);

Clearly, you'd need to evaluate this against your workload, however.

I've created a minimally viable complete example setup to use for testing:

USE tempdb;

IF OBJECT_ID(N'dbo.AggregationChildren', N'U') IS NOT NULL 
DROP TABLE dbo.AggregationChildren;
IF OBJECT_ID(N'dbo.Aggregation', N'U') IS NOT NULL 
DROP TABLE dbo.Aggregation;
IF OBJECT_ID(N'dbo.Code', N'U') IS NOT NULL 
DROP TABLE dbo.Code;
GO

CREATE TABLE dbo.Code (
    CodeId int NOT NULL
        CONSTRAINT PK_CODE 
        PRIMARY KEY 
        CLUSTERED
    , Serial nvarchar(20) NOT NULL
);


CREATE TABLE dbo.Aggregation (
    AggregationId int NOT NULL
        CONSTRAINT PK_AGGREGATIONS 
        PRIMARY KEY 
        CLUSTERED
        CONSTRAINT FK_Aggregation_Code
        FOREIGN KEY (AggregationId)
        REFERENCES dbo.Code (CodeId)
)

CREATE TABLE dbo.AggregationChildren (
    AggregationChildrenId int NOT NULL
        CONSTRAINT PK_AGGREGATION_CHILDS 
        PRIMARY KEY 
        CLUSTERED
        CONSTRAINT FK_AggregationChildren_Code
        FOREIGN KEY (AggregationChildrenId)
        REFERENCES dbo.Code (CodeId)
    , AggregationId int NOT NULL
        CONSTRAINT FK_AggregationChildren_Aggregation
        FOREIGN KEY (AggregationId)
        REFERENCES dbo.Aggregation (AggregationId) 
        ON DELETE CASCADE
    , Position int NOT NULL
)

I re-formatted the constraint clauses to more kind to my brain; essentially, the above code is the same as the DDL in your question.

This populates the three tables with enough data to make meaningful comparisons:

;WITH src AS 
(
    SELECT n.Val
    FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) n(Val)
)
INSERT INTO dbo.Code (CodeId, Serial)
SELECT s1.Val 
        + (s2.Val * 10)
        + (s3.Val * 100)
        + (s4.Val * 1000)
        + (s5.Val * 10000)
    , CONVERT(bigint, CRYPT_GEN_RANDOM(8))
FROM src s1
    CROSS JOIN src s2
    CROSS JOIN src s3
    CROSS JOIN src s4
    CROSS JOIN src s5


;WITH src AS 
(
    SELECT n.Val
    FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) n(Val)
)
INSERT INTO dbo.Aggregation (AggregationId)
SELECT s1.Val 
    + (s2.Val * 10)
    + (s3.Val * 100)
FROM src s1
    CROSS JOIN src s2
    CROSS JOIN src s3;



;WITH src AS 
(
    SELECT n.Val
    FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) n(Val)
)
INSERT INTO dbo.AggregationChildren (AggregationChildrenId, AggregationId, Position)
SELECT s1.Val 
        + (s2.Val * 10)
        + (s3.Val * 100)
        + (s4.Val * 1000)
        + (s5.Val * 10000)
    , s1.Val 
        + (s2.Val * 10)
        + (s3.Val * 100)
    , s1.Val 
FROM src s1
    CROSS JOIN src s2
    CROSS JOIN src s3
    CROSS JOIN src s4
    CROSS JOIN src s5;

These are the row counts for each table:

╔════════╦═════════════╦═════════════════════╗
║  Code  ║ Aggregation ║ AggregationChildren ║
╠════════╬═════════════╬═════════════════════╣
║ 100000 ║        1000 ║              100000 ║
╚════════╩═════════════╩═════════════════════╝

My version of the query:

SELECT Parent = c.Serial
    , Children = (
        SELECT c.Serial 
            , cac.Position
        FROM dbo.Code cc
            INNER JOIN dbo.AggregationChildren cac ON cac.AggregationChildrenId = cc.CodeId
        WHERE cac.AggregationId = a.AggregationId
        FOR JSON PATH 
    )
FROM dbo.Code c
    INNER JOIN dbo.Aggregation a ON c.CodeId = a.AggregationId
FOR JSON PATH;

To compare the output of both queries, I've created two user-defined-functions, as in:

CREATE FUNCTION dbo.fn_json_test_1()
RETURNS nvarchar(max)
AS
BEGIN
    RETURN (
        SELECT Parent = c.Serial
            , Children = (
                SELECT c.Serial 
                    , cac.Position
                FROM dbo.Code cc
                    INNER JOIN dbo.AggregationChildren cac ON cac.AggregationChildrenId = cc.CodeId
                WHERE cac.AggregationId = a.AggregationId
                FOR JSON PATH 
            )
        FROM dbo.Code c
            INNER JOIN dbo.Aggregation a ON c.CodeId = a.AggregationId
        FOR JSON PATH
    );
END;
GO


GO
CREATE FUNCTION dbo.fn_json_test_2()
RETURNS nvarchar(max)
AS
BEGIN
    RETURN (
        SELECT c.Serial as Parent,
            (Select co.Serial, agc.Position
              from AggregationChildren agc, Aggregation ag, Code co
              where agc.AggregationId = a.AggregationId 
              and co.CodeId = agc.AggregationChildrenId for json path) as children
        from Aggregation a, Code c
        where c.CodeId = a.AggregationId for json path
    );
END;
GO

Now, I can compare the output for both queries via:

DECLARE @res1 nvarchar(max) = dbo.fn_json_test_1();
DECLARE @res2 nvarchar(max) = dbo.fn_json_test_2();

SELECT CASE WHEN @res1 <> @res2 THEN 'mismatch' ELSE 'match' END;

The result is:

The results do NOT match. The output of my query contains less child nodes than your query. I'm going back to the drawing board, and will simplify the test bed to see where the disparity lies.

The simplified testbed consists of 10 rows in the Code table, 2 rows in the Aggregation (Parent) table, and 8 rows in the AggregationChildren (child) table:

;WITH src AS 
(
    SELECT n.Val
    FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) n(Val)
)
INSERT INTO dbo.Code (CodeId, Serial)
SELECT s1.Val 
    , CONVERT(bigint, CRYPT_GEN_RANDOM(8))
FROM src s1


;WITH src AS 
(
    SELECT n.Val
    FROM (VALUES (0), (1)) n(Val)
)
INSERT INTO dbo.Aggregation (AggregationId)
SELECT s1.Val 
FROM src s1;



;WITH src AS 
(
    SELECT n.Val
    FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7)) n(Val)
)
INSERT INTO dbo.AggregationChildren (AggregationChildrenId, AggregationId, Position)
SELECT s1.Val + 2
    , s1.Val % 2
    , s1.Val 
FROM src s1;

Row counts:

SELECT Code = (SELECT COUNT(1) FROM dbo.Code)
    , Aggregation = (SELECT COUNT(1) FROM dbo.Aggregation)
    , AggregationChildren = (SELECT COUNT(1) FROM dbo.AggregationChildren)

╔══════╦═════════════╦═════════════════════╗
║ Code ║ Aggregation ║ AggregationChildren ║
╠══════╬═════════════╬═════════════════════╣
║   10 ║           2 ║                   8 ║
╚══════╩═════════════╩═════════════════════╝

The predicted pattern should be two parent json arrays, each with 4 child arrays.

My Results:

[
  {
    "Parent": "-5601362097731340301",
    "Children": [
      {
        "Serial": "-5601362097731340301",
        "Position": 0
      },
      {
        "Serial": "-5601362097731340301",
        "Position": 2
      },
      {
        "Serial": "-5601362097731340301",
        "Position": 4
      },
      {
        "Serial": "-5601362097731340301",
        "Position": 6
      }
    ]
  },
  {
    "Parent": "-8896860091721838065",
    "Children": [
      {
        "Serial": "-8896860091721838065",
        "Position": 1
      },
      {
        "Serial": "-8896860091721838065",
        "Position": 3
      },
      {
        "Serial": "-8896860091721838065",
        "Position": 5
      },
      {
        "Serial": "-8896860091721838065",
        "Position": 7
      }
    ]
  }

Your query:

[
  {
    "Parent": "-5601362097731340301",
    "Children": [
      {
        "Serial": "5802227619253639548",
        "Position": 0
      },
      {
        "Serial": "5802227619253639548",
        "Position": 0
      },
      {
        "Serial": "4504664379821512162",
        "Position": 2
      },
      {
        "Serial": "4504664379821512162",
        "Position": 2
      },
      {
        "Serial": "6561435639659176802",
        "Position": 4
      },
      {
        "Serial": "6561435639659176802",
        "Position": 4
      },
      {
        "Serial": "-7417083263182709739",
        "Position": 6
      },
      {
        "Serial": "-7417083263182709739",
        "Position": 6
      }
    ]
  },
  {
    "Parent": "-8896860091721838065",
    "Children": [
      {
        "Serial": "-7646118996434234523",
        "Position": 1
      },
      {
        "Serial": "-7646118996434234523",
        "Position": 1
      },
      {
        "Serial": "-6372739442099935942",
        "Position": 3
      },
      {
        "Serial": "-6372739442099935942",
        "Position": 3
      },
      {
        "Serial": "-882384147532911428",
        "Position": 5
      },
      {
        "Serial": "-882384147532911428",
        "Position": 5
      },
      {
        "Serial": "4293317573306886053",
        "Position": 7
      },
      {
        "Serial": "4293317573306886053",
        "Position": 7
      }
    ]
  }
]

Your query has too many children; my query returns the predicted number of children, and it returns the correct Position values, however is returning the incorrect Serial values.

The "bug" in my query appears in the inner query. The incorrect query is:

SELECT c.Serial 
    , cac.Position
FROM dbo.Code cc
    INNER JOIN dbo.AggregationChildren cac ON cac.AggregationChildrenId = cc.CodeId
WHERE cac.AggregationId = a.AggregationId
ORDER BY c.Serial
FOR JSON PATH

The correct version is:

SELECT cc.Serial --changed "c." to "cc."
    , cac.Position
FROM dbo.Code cc
    INNER JOIN dbo.AggregationChildren cac ON cac.AggregationChildrenId = cc.CodeId
WHERE cac.AggregationId = a.AggregationId
ORDER BY cc.CodeId --not a big deal, but different order for children in output
FOR JSON PATH

The corrected query now looks like:

SELECT  Parent = c.Serial
    , Children = (
        SELECT cc.Serial 
            , cac.Position
        FROM dbo.Code cc
            INNER JOIN dbo.AggregationChildren cac ON cac.AggregationChildrenId = cc.CodeId
        WHERE cac.AggregationId = a.AggregationId
        ORDER BY cc.CodeId
        FOR JSON PATH 
    )
FROM dbo.Code c
    INNER JOIN dbo.Aggregation a ON c.CodeId = a.AggregationId
ORDER BY c.Serial
FOR JSON PATH;

And returns the following result:

[
  {
    "Parent": "-195930341251513493",
    "Children": [
      {
        "Serial": "-6126601633786720400",
        "Position": 1
      },
      {
        "Serial": "5216562173012877678",
        "Position": 3
      },
      {
        "Serial": "-1992909345438478098",
        "Position": 5
      },
      {
        "Serial": "8329388691987940194",
        "Position": 7
      }
    ]
  },
  {
    "Parent": "8774608126018975726",
    "Children": [
      {
        "Serial": "-3380643917643646211",
        "Position": 0
      },
      {
        "Serial": "-2042609074595538493",
        "Position": 2
      },
      {
        "Serial": "7345460002653774160",
        "Position": 4
      },
      {
        "Serial": "-2126530822210070443",
        "Position": 6
      }
    ]
  }
]

Stack Exchange Network

Generate nested array with OPENJSON very slow

1 Answer 1

Your Answer

Hot Network Questions

Generate nested array with OPENJSON very slow

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Related

Hot Network Questions