Thursday, October 19, 2017

RANKing functions, Part 2: RANK vs DENSE_RANK vs ROW_NUMBER

-- Database by Doug
-- Douglas Kline
-- 10/19/2017
-- RANKing functions, Part 2: RANK vs DENSE_RANK vs ROW_NUMBER

-- you might want to check out the first video
-- on how ranking functions work, and the OVER clause

-- Here's some data I've made up for this example
-- It has an ID (primary key), first names, gender, and test scores

SELECT   [ID],
         [Name],
         [Gender],
         [Score]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- so let's say that we've created a contest

-- places in the contest (top place, 2nd place, etc.) 
-- will be determined by the test score

-- in other words, we're not so concerned with the raw score
-- but rather, we're interested in the *relative* score
-- and the order in which people appear, based on their score

-- we can use the ROW_NUMBER() function to give a 
-- 'ranking' to each record, based on Score

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC) AS [row]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC



-- notice that there are no ties - no repeated row_numbers
-- but how did it decide that Jen is 2nd, and Jean is 3rd?
-- why not the other way around?

-- the answer is: we didn't specify, so it made a decision
-- that decision could change next time it is run
-- or with different data

-- Now, it's not really fair that Jen placed higher with the same score as Jean
-- but if we'd at least like a *consistent* ranking
-- we could add a "tie breaker" to the ORDER BY

-- the "tie breaker" is based on the primary key, ID

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- of course, this is arbitrary, but at least it is always consistent 

-- but let's be fair, Jen and Jean should "tie" for 2nd place, right?

-- we'll do that with RANK()

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC)         AS [rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- note that RANK() explicitly deals with the ties
-- ranks are repeated

-- and since ties are explicitly dealt with, we don't want to
-- create our own tie-breaker as we did with ROW_NUMBER:

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC, ID ASC) AS [rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- when we add ID to the RANK() ORDER BY, there are no ties
-- so ROW_NUMBER() and RANK() produce the same results


-- back to the example
-- notice that RANK() not only duplicates values, 
-- but also skips values after repeats
-- so, for example, no one is in 3rd place

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC)         AS [rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- if we'd like to NOT skip values, we can use DENSE_RANK()

-- we can see repeated dense ranks,
-- but no skipped values

-- like rank, dense rank handles ties explicitly, 
-- so we don't add in ID 

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC)         AS [rank],
         DENSE_RANK() OVER (ORDER BY Score DESC)         AS [d_rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- finally, to show a more full example, 
-- let's rank people within their gender
-- (see my other video for more details on this syntax)

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC)                AS [row],
         RANK()       OVER (ORDER BY Score DESC)                        AS [rank],
         DENSE_RANK() OVER (ORDER BY Score DESC)                        AS [d_rank],
         ROW_NUMBER() OVER (PARTITION BY [Gender] ORDER BY Score DESC)  AS [d_rank_by_gender]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- note that the row_number is determined by comparison within gender


-- Database by Doug
-- Douglas Kline
-- 10/19/2017
-- RANKing functions, Part 2: RANK vs DENSE_RANK vs ROW_NUMBER

No comments:

Post a Comment

Followers