Actual rows read in a table scan is multiplied by the number of threads used for the scan
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
5
down vote
favorite
I'm running into quite a strange problem. I'm running the same script to generate data and do some matching later on, on an older 2008R2 instance. The last query (an UPDATE
) does a single table scan and returns all of the 250.000 rows, whereas on a newer 2017 instance, the table is scanned in parallel and each of the 4 threads reads 250.000 rows, and returns 1 million "actual rows read".
I changed the compatibility mode to 2008 in my 2017 instance and the actuals stayed the same, at 1.000.000.
Is there any valid reason for why this would happen or does this seem like it should be a Connect item?
Plans contain the same operators, but one of them does the scan in parallel and instead of splitting the 250.000 rows per each of the 4 threads (and only read 62.500 rows on each thread) all the threads read 250k each *4 = 1.000.000
Both execution plans can be found at pastetheplan:
- 2017 version here
- 2008R2 version here
Also, the full script I was running can be found down below:
create table #targets (id int identity(1,1), start_point int, end_point int, refference_type_id int, bla1 int, bla2 int, bla3 int, bla4 int, bla5 int, bla6 int, bla7 int, bla8 int, bla9 bit, assignedTouch varchar(10));
;with cte as (
select
1 sp
, abs(checksum(newid())) % 11 + 2 ep
, 1 rn
, abs(checksum(newid())) % 3 b
, abs(checksum(newid())) % 3 c
, abs(checksum(newid())) % 3 d
, abs(checksum(newid())) % 3 e
, abs(checksum(newid())) % 3 f
, abs(checksum(newid())) % 3 g
, abs(checksum(newid())) % 3 g2
, abs(checksum(newid())) % 3 h
, abs(checksum(newid())) % 3 x3
union all
select
sp
, abs(checksum(newid())) % 11 + 2 ep
, rn + 1
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
from cte
where rn < 250000)
insert into #targets
select *
, 'Unassigned' [Default State]
from cte
option (maxrecursion 0)
select top 250 *
, char(abs(checksum(newid())) % 85 + 65) [class]
into #matching
from #targets
where end_point in ( 11, 14, 22, 33 )
order by abs(checksum(newid())) % 13
update #matching
set bla8 = case
when bla1 - bla2 > 0
then NULL
else bla8
end
, bla7 = case
when bla2 - bla3 > 0
then NULL
else bla7
end
, bla6 = case
when bla4 - bla5 > 0
then NULL
else bla6
end
create nonclustered index nc_assignedTouch on #targets (assignedTouch);
update t
set assignedTouch = m.class
from #targets t
inner join #matching m
on t.start_point = isnull(m.start_point, t.start_point)
and t.bla1 = isnull(m.bla1, t.bla1)
and t.bla2 = isnull(m.bla2, t.bla2)
and t.bla3 = isnull(m.bla3, t.bla3)
and t.bla4 = isnull(m.bla4, t.bla4)
and t.bla5 = isnull(m.bla5, t.bla5)
and t.bla6 = isnull(m.bla6, t.bla6)
and t.bla7 = isnull(m.bla7, t.bla7)
and t.bla8 = isnull(m.bla8, t.bla8)
and t.bla9 = isnull(m.bla9, t.bla9)
where t.assignedTouch = 'Unassigned';
sp_configure info:
sql-server sql-server-2008-r2 execution-plan sql-server-2017
add a comment |Â
up vote
5
down vote
favorite
I'm running into quite a strange problem. I'm running the same script to generate data and do some matching later on, on an older 2008R2 instance. The last query (an UPDATE
) does a single table scan and returns all of the 250.000 rows, whereas on a newer 2017 instance, the table is scanned in parallel and each of the 4 threads reads 250.000 rows, and returns 1 million "actual rows read".
I changed the compatibility mode to 2008 in my 2017 instance and the actuals stayed the same, at 1.000.000.
Is there any valid reason for why this would happen or does this seem like it should be a Connect item?
Plans contain the same operators, but one of them does the scan in parallel and instead of splitting the 250.000 rows per each of the 4 threads (and only read 62.500 rows on each thread) all the threads read 250k each *4 = 1.000.000
Both execution plans can be found at pastetheplan:
- 2017 version here
- 2008R2 version here
Also, the full script I was running can be found down below:
create table #targets (id int identity(1,1), start_point int, end_point int, refference_type_id int, bla1 int, bla2 int, bla3 int, bla4 int, bla5 int, bla6 int, bla7 int, bla8 int, bla9 bit, assignedTouch varchar(10));
;with cte as (
select
1 sp
, abs(checksum(newid())) % 11 + 2 ep
, 1 rn
, abs(checksum(newid())) % 3 b
, abs(checksum(newid())) % 3 c
, abs(checksum(newid())) % 3 d
, abs(checksum(newid())) % 3 e
, abs(checksum(newid())) % 3 f
, abs(checksum(newid())) % 3 g
, abs(checksum(newid())) % 3 g2
, abs(checksum(newid())) % 3 h
, abs(checksum(newid())) % 3 x3
union all
select
sp
, abs(checksum(newid())) % 11 + 2 ep
, rn + 1
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
from cte
where rn < 250000)
insert into #targets
select *
, 'Unassigned' [Default State]
from cte
option (maxrecursion 0)
select top 250 *
, char(abs(checksum(newid())) % 85 + 65) [class]
into #matching
from #targets
where end_point in ( 11, 14, 22, 33 )
order by abs(checksum(newid())) % 13
update #matching
set bla8 = case
when bla1 - bla2 > 0
then NULL
else bla8
end
, bla7 = case
when bla2 - bla3 > 0
then NULL
else bla7
end
, bla6 = case
when bla4 - bla5 > 0
then NULL
else bla6
end
create nonclustered index nc_assignedTouch on #targets (assignedTouch);
update t
set assignedTouch = m.class
from #targets t
inner join #matching m
on t.start_point = isnull(m.start_point, t.start_point)
and t.bla1 = isnull(m.bla1, t.bla1)
and t.bla2 = isnull(m.bla2, t.bla2)
and t.bla3 = isnull(m.bla3, t.bla3)
and t.bla4 = isnull(m.bla4, t.bla4)
and t.bla5 = isnull(m.bla5, t.bla5)
and t.bla6 = isnull(m.bla6, t.bla6)
and t.bla7 = isnull(m.bla7, t.bla7)
and t.bla8 = isnull(m.bla8, t.bla8)
and t.bla9 = isnull(m.bla9, t.bla9)
where t.assignedTouch = 'Unassigned';
sp_configure info:
sql-server sql-server-2008-r2 execution-plan sql-server-2017
1
Server-wide maxdop 1 is never a good thing. Check cost threshold for parallelism settings too, and clustered and/or unique indexes couldn't hurt.
â Aaron Bertrandâ¦
Aug 31 at 15:02
add a comment |Â
up vote
5
down vote
favorite
up vote
5
down vote
favorite
I'm running into quite a strange problem. I'm running the same script to generate data and do some matching later on, on an older 2008R2 instance. The last query (an UPDATE
) does a single table scan and returns all of the 250.000 rows, whereas on a newer 2017 instance, the table is scanned in parallel and each of the 4 threads reads 250.000 rows, and returns 1 million "actual rows read".
I changed the compatibility mode to 2008 in my 2017 instance and the actuals stayed the same, at 1.000.000.
Is there any valid reason for why this would happen or does this seem like it should be a Connect item?
Plans contain the same operators, but one of them does the scan in parallel and instead of splitting the 250.000 rows per each of the 4 threads (and only read 62.500 rows on each thread) all the threads read 250k each *4 = 1.000.000
Both execution plans can be found at pastetheplan:
- 2017 version here
- 2008R2 version here
Also, the full script I was running can be found down below:
create table #targets (id int identity(1,1), start_point int, end_point int, refference_type_id int, bla1 int, bla2 int, bla3 int, bla4 int, bla5 int, bla6 int, bla7 int, bla8 int, bla9 bit, assignedTouch varchar(10));
;with cte as (
select
1 sp
, abs(checksum(newid())) % 11 + 2 ep
, 1 rn
, abs(checksum(newid())) % 3 b
, abs(checksum(newid())) % 3 c
, abs(checksum(newid())) % 3 d
, abs(checksum(newid())) % 3 e
, abs(checksum(newid())) % 3 f
, abs(checksum(newid())) % 3 g
, abs(checksum(newid())) % 3 g2
, abs(checksum(newid())) % 3 h
, abs(checksum(newid())) % 3 x3
union all
select
sp
, abs(checksum(newid())) % 11 + 2 ep
, rn + 1
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
from cte
where rn < 250000)
insert into #targets
select *
, 'Unassigned' [Default State]
from cte
option (maxrecursion 0)
select top 250 *
, char(abs(checksum(newid())) % 85 + 65) [class]
into #matching
from #targets
where end_point in ( 11, 14, 22, 33 )
order by abs(checksum(newid())) % 13
update #matching
set bla8 = case
when bla1 - bla2 > 0
then NULL
else bla8
end
, bla7 = case
when bla2 - bla3 > 0
then NULL
else bla7
end
, bla6 = case
when bla4 - bla5 > 0
then NULL
else bla6
end
create nonclustered index nc_assignedTouch on #targets (assignedTouch);
update t
set assignedTouch = m.class
from #targets t
inner join #matching m
on t.start_point = isnull(m.start_point, t.start_point)
and t.bla1 = isnull(m.bla1, t.bla1)
and t.bla2 = isnull(m.bla2, t.bla2)
and t.bla3 = isnull(m.bla3, t.bla3)
and t.bla4 = isnull(m.bla4, t.bla4)
and t.bla5 = isnull(m.bla5, t.bla5)
and t.bla6 = isnull(m.bla6, t.bla6)
and t.bla7 = isnull(m.bla7, t.bla7)
and t.bla8 = isnull(m.bla8, t.bla8)
and t.bla9 = isnull(m.bla9, t.bla9)
where t.assignedTouch = 'Unassigned';
sp_configure info:
sql-server sql-server-2008-r2 execution-plan sql-server-2017
I'm running into quite a strange problem. I'm running the same script to generate data and do some matching later on, on an older 2008R2 instance. The last query (an UPDATE
) does a single table scan and returns all of the 250.000 rows, whereas on a newer 2017 instance, the table is scanned in parallel and each of the 4 threads reads 250.000 rows, and returns 1 million "actual rows read".
I changed the compatibility mode to 2008 in my 2017 instance and the actuals stayed the same, at 1.000.000.
Is there any valid reason for why this would happen or does this seem like it should be a Connect item?
Plans contain the same operators, but one of them does the scan in parallel and instead of splitting the 250.000 rows per each of the 4 threads (and only read 62.500 rows on each thread) all the threads read 250k each *4 = 1.000.000
Both execution plans can be found at pastetheplan:
- 2017 version here
- 2008R2 version here
Also, the full script I was running can be found down below:
create table #targets (id int identity(1,1), start_point int, end_point int, refference_type_id int, bla1 int, bla2 int, bla3 int, bla4 int, bla5 int, bla6 int, bla7 int, bla8 int, bla9 bit, assignedTouch varchar(10));
;with cte as (
select
1 sp
, abs(checksum(newid())) % 11 + 2 ep
, 1 rn
, abs(checksum(newid())) % 3 b
, abs(checksum(newid())) % 3 c
, abs(checksum(newid())) % 3 d
, abs(checksum(newid())) % 3 e
, abs(checksum(newid())) % 3 f
, abs(checksum(newid())) % 3 g
, abs(checksum(newid())) % 3 g2
, abs(checksum(newid())) % 3 h
, abs(checksum(newid())) % 3 x3
union all
select
sp
, abs(checksum(newid())) % 11 + 2 ep
, rn + 1
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
, abs(checksum(newid())) % 8
from cte
where rn < 250000)
insert into #targets
select *
, 'Unassigned' [Default State]
from cte
option (maxrecursion 0)
select top 250 *
, char(abs(checksum(newid())) % 85 + 65) [class]
into #matching
from #targets
where end_point in ( 11, 14, 22, 33 )
order by abs(checksum(newid())) % 13
update #matching
set bla8 = case
when bla1 - bla2 > 0
then NULL
else bla8
end
, bla7 = case
when bla2 - bla3 > 0
then NULL
else bla7
end
, bla6 = case
when bla4 - bla5 > 0
then NULL
else bla6
end
create nonclustered index nc_assignedTouch on #targets (assignedTouch);
update t
set assignedTouch = m.class
from #targets t
inner join #matching m
on t.start_point = isnull(m.start_point, t.start_point)
and t.bla1 = isnull(m.bla1, t.bla1)
and t.bla2 = isnull(m.bla2, t.bla2)
and t.bla3 = isnull(m.bla3, t.bla3)
and t.bla4 = isnull(m.bla4, t.bla4)
and t.bla5 = isnull(m.bla5, t.bla5)
and t.bla6 = isnull(m.bla6, t.bla6)
and t.bla7 = isnull(m.bla7, t.bla7)
and t.bla8 = isnull(m.bla8, t.bla8)
and t.bla9 = isnull(m.bla9, t.bla9)
where t.assignedTouch = 'Unassigned';
sp_configure info:
sql-server sql-server-2008-r2 execution-plan sql-server-2017
sql-server sql-server-2008-r2 execution-plan sql-server-2017
edited Aug 31 at 16:12
Paul Whiteâ¦
46.7k14252401
46.7k14252401
asked Aug 31 at 14:22
Radu Gheorghiu
489617
489617
1
Server-wide maxdop 1 is never a good thing. Check cost threshold for parallelism settings too, and clustered and/or unique indexes couldn't hurt.
â Aaron Bertrandâ¦
Aug 31 at 15:02
add a comment |Â
1
Server-wide maxdop 1 is never a good thing. Check cost threshold for parallelism settings too, and clustered and/or unique indexes couldn't hurt.
â Aaron Bertrandâ¦
Aug 31 at 15:02
1
1
Server-wide maxdop 1 is never a good thing. Check cost threshold for parallelism settings too, and clustered and/or unique indexes couldn't hurt.
â Aaron Bertrandâ¦
Aug 31 at 15:02
Server-wide maxdop 1 is never a good thing. Check cost threshold for parallelism settings too, and clustered and/or unique indexes couldn't hurt.
â Aaron Bertrandâ¦
Aug 31 at 15:02
add a comment |Â
1 Answer
1
active
oldest
votes
up vote
11
down vote
accepted
Parallel heap scan
You might be expecting distribution among parallel threads as in the following toy example:
SELECT TOP (5 * 1000 * 1000)
n = ROW_NUMBER() OVER (ORDER BY @@SPID)
INTO #n
FROM sys.columns AS C
CROSS JOIN sys.columns AS C2
CROSS JOIN sys.columns AS C3;
SELECT COUNT_BIG(*)
FROM #n AS N
GROUP BY N.n % 10
OPTION (USE HINT ('ENABLE_PARALLEL_PLAN_PREFERENCE'));
In that plan, the heap table is indeed parallel scanned, with all threads coordinating to share the work of reading the whole table:
or in SSMS view:
Your case
But this is not the arrangement in your uploaded plan:
The heap scan is on the inner side of a nested loops join*, so each thread runs a serial copy of the inner side, meaning the there are DOP (degree of parallelism) independent copies of the Table Spool and Table Scan.
At DOP four, this means there are four spools and four scans, as evidenced by the Number of Executions = 4
on the table scan. Indeed, the heap table is fully scanned four times (once per thread), giving 250,000 * 4 = 1,000,000 rows. The lazy spool caches the result of the scan per thread.
So the difference is that your parallel scan is four serial scans in parallel, rather than four threads cooperating to parallel scan the heap once (as in the toy example above).
It can be challenging to conceptualize the difference, but it is crucial. Once you see the branch between the two exchanges as DOP separate serial plans, it becomes easier to decode.
Of course the plan is wildly inefficient, and the spool adds little value. Notice the join predicate is stuck at the nested loops join, rather than being pushed down the inner side (making the join an apply). This is due to the complex join conditions involving ISNULL
.
You might get a slightly better-performing plan by making your nc_assignedTouch
index clustered rather than nonclustered, but the bulk of the work would still be happening at the join, and the improvement would almost certainly be minimal. A query rewrite is probably necessary here. Ask a follow-up question if you want assistance expressing the query in a more execution-friendly way.
For more background on the parallelism aspects, see my article Understanding and Using Parallelism in SQL Server.
Footnotes
* There is one general exception to this, where one can see a true co-operative parallel scan (and exchanges) on the inner side of a nested loops join: The outer input must be guaranteed to produce at most one row, and the loop join cannot have any correlated parameters (outer references). Under these conditions, the execution engine will allow parallelism on the inner side because it will always produce correct results.
You may also notice that operators below an Eager Spool on the inner side of a parallel nested loops join also execute only once. There are still DOP copies of these operators, but the runtime behaviour is that only one thread builds a shared indexed structure, which is then used by all instances of the Eager Spool. I do apologise that all this is so complicated.
1
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
add a comment |Â
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
11
down vote
accepted
Parallel heap scan
You might be expecting distribution among parallel threads as in the following toy example:
SELECT TOP (5 * 1000 * 1000)
n = ROW_NUMBER() OVER (ORDER BY @@SPID)
INTO #n
FROM sys.columns AS C
CROSS JOIN sys.columns AS C2
CROSS JOIN sys.columns AS C3;
SELECT COUNT_BIG(*)
FROM #n AS N
GROUP BY N.n % 10
OPTION (USE HINT ('ENABLE_PARALLEL_PLAN_PREFERENCE'));
In that plan, the heap table is indeed parallel scanned, with all threads coordinating to share the work of reading the whole table:
or in SSMS view:
Your case
But this is not the arrangement in your uploaded plan:
The heap scan is on the inner side of a nested loops join*, so each thread runs a serial copy of the inner side, meaning the there are DOP (degree of parallelism) independent copies of the Table Spool and Table Scan.
At DOP four, this means there are four spools and four scans, as evidenced by the Number of Executions = 4
on the table scan. Indeed, the heap table is fully scanned four times (once per thread), giving 250,000 * 4 = 1,000,000 rows. The lazy spool caches the result of the scan per thread.
So the difference is that your parallel scan is four serial scans in parallel, rather than four threads cooperating to parallel scan the heap once (as in the toy example above).
It can be challenging to conceptualize the difference, but it is crucial. Once you see the branch between the two exchanges as DOP separate serial plans, it becomes easier to decode.
Of course the plan is wildly inefficient, and the spool adds little value. Notice the join predicate is stuck at the nested loops join, rather than being pushed down the inner side (making the join an apply). This is due to the complex join conditions involving ISNULL
.
You might get a slightly better-performing plan by making your nc_assignedTouch
index clustered rather than nonclustered, but the bulk of the work would still be happening at the join, and the improvement would almost certainly be minimal. A query rewrite is probably necessary here. Ask a follow-up question if you want assistance expressing the query in a more execution-friendly way.
For more background on the parallelism aspects, see my article Understanding and Using Parallelism in SQL Server.
Footnotes
* There is one general exception to this, where one can see a true co-operative parallel scan (and exchanges) on the inner side of a nested loops join: The outer input must be guaranteed to produce at most one row, and the loop join cannot have any correlated parameters (outer references). Under these conditions, the execution engine will allow parallelism on the inner side because it will always produce correct results.
You may also notice that operators below an Eager Spool on the inner side of a parallel nested loops join also execute only once. There are still DOP copies of these operators, but the runtime behaviour is that only one thread builds a shared indexed structure, which is then used by all instances of the Eager Spool. I do apologise that all this is so complicated.
1
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
add a comment |Â
up vote
11
down vote
accepted
Parallel heap scan
You might be expecting distribution among parallel threads as in the following toy example:
SELECT TOP (5 * 1000 * 1000)
n = ROW_NUMBER() OVER (ORDER BY @@SPID)
INTO #n
FROM sys.columns AS C
CROSS JOIN sys.columns AS C2
CROSS JOIN sys.columns AS C3;
SELECT COUNT_BIG(*)
FROM #n AS N
GROUP BY N.n % 10
OPTION (USE HINT ('ENABLE_PARALLEL_PLAN_PREFERENCE'));
In that plan, the heap table is indeed parallel scanned, with all threads coordinating to share the work of reading the whole table:
or in SSMS view:
Your case
But this is not the arrangement in your uploaded plan:
The heap scan is on the inner side of a nested loops join*, so each thread runs a serial copy of the inner side, meaning the there are DOP (degree of parallelism) independent copies of the Table Spool and Table Scan.
At DOP four, this means there are four spools and four scans, as evidenced by the Number of Executions = 4
on the table scan. Indeed, the heap table is fully scanned four times (once per thread), giving 250,000 * 4 = 1,000,000 rows. The lazy spool caches the result of the scan per thread.
So the difference is that your parallel scan is four serial scans in parallel, rather than four threads cooperating to parallel scan the heap once (as in the toy example above).
It can be challenging to conceptualize the difference, but it is crucial. Once you see the branch between the two exchanges as DOP separate serial plans, it becomes easier to decode.
Of course the plan is wildly inefficient, and the spool adds little value. Notice the join predicate is stuck at the nested loops join, rather than being pushed down the inner side (making the join an apply). This is due to the complex join conditions involving ISNULL
.
You might get a slightly better-performing plan by making your nc_assignedTouch
index clustered rather than nonclustered, but the bulk of the work would still be happening at the join, and the improvement would almost certainly be minimal. A query rewrite is probably necessary here. Ask a follow-up question if you want assistance expressing the query in a more execution-friendly way.
For more background on the parallelism aspects, see my article Understanding and Using Parallelism in SQL Server.
Footnotes
* There is one general exception to this, where one can see a true co-operative parallel scan (and exchanges) on the inner side of a nested loops join: The outer input must be guaranteed to produce at most one row, and the loop join cannot have any correlated parameters (outer references). Under these conditions, the execution engine will allow parallelism on the inner side because it will always produce correct results.
You may also notice that operators below an Eager Spool on the inner side of a parallel nested loops join also execute only once. There are still DOP copies of these operators, but the runtime behaviour is that only one thread builds a shared indexed structure, which is then used by all instances of the Eager Spool. I do apologise that all this is so complicated.
1
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
add a comment |Â
up vote
11
down vote
accepted
up vote
11
down vote
accepted
Parallel heap scan
You might be expecting distribution among parallel threads as in the following toy example:
SELECT TOP (5 * 1000 * 1000)
n = ROW_NUMBER() OVER (ORDER BY @@SPID)
INTO #n
FROM sys.columns AS C
CROSS JOIN sys.columns AS C2
CROSS JOIN sys.columns AS C3;
SELECT COUNT_BIG(*)
FROM #n AS N
GROUP BY N.n % 10
OPTION (USE HINT ('ENABLE_PARALLEL_PLAN_PREFERENCE'));
In that plan, the heap table is indeed parallel scanned, with all threads coordinating to share the work of reading the whole table:
or in SSMS view:
Your case
But this is not the arrangement in your uploaded plan:
The heap scan is on the inner side of a nested loops join*, so each thread runs a serial copy of the inner side, meaning the there are DOP (degree of parallelism) independent copies of the Table Spool and Table Scan.
At DOP four, this means there are four spools and four scans, as evidenced by the Number of Executions = 4
on the table scan. Indeed, the heap table is fully scanned four times (once per thread), giving 250,000 * 4 = 1,000,000 rows. The lazy spool caches the result of the scan per thread.
So the difference is that your parallel scan is four serial scans in parallel, rather than four threads cooperating to parallel scan the heap once (as in the toy example above).
It can be challenging to conceptualize the difference, but it is crucial. Once you see the branch between the two exchanges as DOP separate serial plans, it becomes easier to decode.
Of course the plan is wildly inefficient, and the spool adds little value. Notice the join predicate is stuck at the nested loops join, rather than being pushed down the inner side (making the join an apply). This is due to the complex join conditions involving ISNULL
.
You might get a slightly better-performing plan by making your nc_assignedTouch
index clustered rather than nonclustered, but the bulk of the work would still be happening at the join, and the improvement would almost certainly be minimal. A query rewrite is probably necessary here. Ask a follow-up question if you want assistance expressing the query in a more execution-friendly way.
For more background on the parallelism aspects, see my article Understanding and Using Parallelism in SQL Server.
Footnotes
* There is one general exception to this, where one can see a true co-operative parallel scan (and exchanges) on the inner side of a nested loops join: The outer input must be guaranteed to produce at most one row, and the loop join cannot have any correlated parameters (outer references). Under these conditions, the execution engine will allow parallelism on the inner side because it will always produce correct results.
You may also notice that operators below an Eager Spool on the inner side of a parallel nested loops join also execute only once. There are still DOP copies of these operators, but the runtime behaviour is that only one thread builds a shared indexed structure, which is then used by all instances of the Eager Spool. I do apologise that all this is so complicated.
Parallel heap scan
You might be expecting distribution among parallel threads as in the following toy example:
SELECT TOP (5 * 1000 * 1000)
n = ROW_NUMBER() OVER (ORDER BY @@SPID)
INTO #n
FROM sys.columns AS C
CROSS JOIN sys.columns AS C2
CROSS JOIN sys.columns AS C3;
SELECT COUNT_BIG(*)
FROM #n AS N
GROUP BY N.n % 10
OPTION (USE HINT ('ENABLE_PARALLEL_PLAN_PREFERENCE'));
In that plan, the heap table is indeed parallel scanned, with all threads coordinating to share the work of reading the whole table:
or in SSMS view:
Your case
But this is not the arrangement in your uploaded plan:
The heap scan is on the inner side of a nested loops join*, so each thread runs a serial copy of the inner side, meaning the there are DOP (degree of parallelism) independent copies of the Table Spool and Table Scan.
At DOP four, this means there are four spools and four scans, as evidenced by the Number of Executions = 4
on the table scan. Indeed, the heap table is fully scanned four times (once per thread), giving 250,000 * 4 = 1,000,000 rows. The lazy spool caches the result of the scan per thread.
So the difference is that your parallel scan is four serial scans in parallel, rather than four threads cooperating to parallel scan the heap once (as in the toy example above).
It can be challenging to conceptualize the difference, but it is crucial. Once you see the branch between the two exchanges as DOP separate serial plans, it becomes easier to decode.
Of course the plan is wildly inefficient, and the spool adds little value. Notice the join predicate is stuck at the nested loops join, rather than being pushed down the inner side (making the join an apply). This is due to the complex join conditions involving ISNULL
.
You might get a slightly better-performing plan by making your nc_assignedTouch
index clustered rather than nonclustered, but the bulk of the work would still be happening at the join, and the improvement would almost certainly be minimal. A query rewrite is probably necessary here. Ask a follow-up question if you want assistance expressing the query in a more execution-friendly way.
For more background on the parallelism aspects, see my article Understanding and Using Parallelism in SQL Server.
Footnotes
* There is one general exception to this, where one can see a true co-operative parallel scan (and exchanges) on the inner side of a nested loops join: The outer input must be guaranteed to produce at most one row, and the loop join cannot have any correlated parameters (outer references). Under these conditions, the execution engine will allow parallelism on the inner side because it will always produce correct results.
You may also notice that operators below an Eager Spool on the inner side of a parallel nested loops join also execute only once. There are still DOP copies of these operators, but the runtime behaviour is that only one thread builds a shared indexed structure, which is then used by all instances of the Eager Spool. I do apologise that all this is so complicated.
edited Sep 3 at 6:47
answered Aug 31 at 15:26
Paul Whiteâ¦
46.7k14252401
46.7k14252401
1
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
add a comment |Â
1
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
1
1
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
Thank you very much Paul! I have read the entire answer very carefully and it all makes very much sense now. I do agree that a query rewrite is necessary, but at this point I'm just testing out different ways of rewriting in a more set-based fashion a "matching" process we have in one big stored procedure. And please, do not apologise, you answer and explanations are 100% on point once you can wrap your head around the concepts (their difficulty or complexity is not your fault and it falls on the part of the reader to become more knowledgeable in order to understand them).Thank you once again!
â Radu Gheorghiu
Aug 31 at 16:53
add a comment |Â
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdba.stackexchange.com%2fquestions%2f216422%2factual-rows-read-in-a-table-scan-is-multiplied-by-the-number-of-threads-used-for%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
1
Server-wide maxdop 1 is never a good thing. Check cost threshold for parallelism settings too, and clustered and/or unique indexes couldn't hurt.
â Aaron Bertrandâ¦
Aug 31 at 15:02