Thursday, October 19, 2017

RANKing functions, Part 2: RANK vs DENSE_RANK vs ROW_NUMBER

-- Database by Doug
-- Douglas Kline
-- 10/19/2017
-- RANKing functions, Part 2: RANK vs DENSE_RANK vs ROW_NUMBER

-- you might want to check out the first video
-- on how ranking functions work, and the OVER clause

-- Here's some data I've made up for this example
-- It has an ID (primary key), first names, gender, and test scores

SELECT   [ID],
         [Name],
         [Gender],
         [Score]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- so let's say that we've created a contest

-- places in the contest (top place, 2nd place, etc.) 
-- will be determined by the test score

-- in other words, we're not so concerned with the raw score
-- but rather, we're interested in the *relative* score
-- and the order in which people appear, based on their score

-- we can use the ROW_NUMBER() function to give a 
-- 'ranking' to each record, based on Score

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC) AS [row]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC



-- notice that there are no ties - no repeated row_numbers
-- but how did it decide that Jen is 2nd, and Jean is 3rd?
-- why not the other way around?

-- the answer is: we didn't specify, so it made a decision
-- that decision could change next time it is run
-- or with different data

-- Now, it's not really fair that Jen placed higher with the same score as Jean
-- but if we'd at least like a *consistent* ranking
-- we could add a "tie breaker" to the ORDER BY

-- the "tie breaker" is based on the primary key, ID

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- of course, this is arbitrary, but at least it is always consistent 

-- but let's be fair, Jen and Jean should "tie" for 2nd place, right?

-- we'll do that with RANK()

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC)         AS [rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- note that RANK() explicitly deals with the ties
-- ranks are repeated

-- and since ties are explicitly dealt with, we don't want to
-- create our own tie-breaker as we did with ROW_NUMBER:

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC, ID ASC) AS [rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- when we add ID to the RANK() ORDER BY, there are no ties
-- so ROW_NUMBER() and RANK() produce the same results


-- back to the example
-- notice that RANK() not only duplicates values, 
-- but also skips values after repeats
-- so, for example, no one is in 3rd place

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC)         AS [rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- if we'd like to NOT skip values, we can use DENSE_RANK()

-- we can see repeated dense ranks,
-- but no skipped values

-- like rank, dense rank handles ties explicitly, 
-- so we don't add in ID 

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC) AS [row],
         RANK()       OVER (ORDER BY Score DESC)         AS [rank],
         DENSE_RANK() OVER (ORDER BY Score DESC)         AS [d_rank]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- finally, to show a more full example, 
-- let's rank people within their gender
-- (see my other video for more details on this syntax)

SELECT   [ID],
         [Name],
         [Gender],
         [Score],
         ROW_NUMBER() OVER (ORDER BY Score DESC, ID ASC)                AS [row],
         RANK()       OVER (ORDER BY Score DESC)                        AS [rank],
         DENSE_RANK() OVER (ORDER BY Score DESC)                        AS [d_rank],
         ROW_NUMBER() OVER (PARTITION BY [Gender] ORDER BY Score DESC)  AS [d_rank_by_gender]
FROM     (VALUES  (1, 'Jill', 'F', 100), (2, 'Jose', 'M', 95),  (3, 'Joe',  'M', 95), (4, 'Jack', 'M', 95),  
                  (5, 'Jess', 'F', 88),  (6, 'Jen',  'F', 98),  (7, 'Jim',  'M', 85), (8, 'Jeff', 'M', 75),  
                  (9, 'Josh', 'M', 74),  (10,'Jake', 'M', 83),  (11,'Jan',  'F', 78), (12,'Jean', 'F', 98),
                  (13,'Joan', 'F', 84),  (14,'Judy', 'F', 89),  (15,'Joy',  'F', 87), (16,'Bob',  'M', 67) ) 
         AS Exam1([ID], [Name], [Gender], [Score])
ORDER BY [Score] DESC

-- note that the row_number is determined by comparison within gender


-- Database by Doug
-- Douglas Kline
-- 10/19/2017
-- RANKing functions, Part 2: RANK vs DENSE_RANK vs ROW_NUMBER

Thursday, October 12, 2017

SQL Ranking Functions, Part 1: the OVER clause

-- Database by Doug
-- Douglas Kline
-- 10/10/2017
-- RANKing functions, Part 1: OVER

USE Northwind

-- here's a simple SELECT statement from the Products table

SELECT   ProductName,
         UnitPrice
FROM     Products
ORDER BY UnitPrice DESC

-- this shows that the highest priced product is Cote de Blaye, productID 38

-- but sometimes the *relative* price is more important than the actual price
-- in other words, we want to know how products *rank*, based on price

-- So, here's how you would do that in SQL Server:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY UnitPrice DESC

-- notice that the highest priced product is ranked 1
-- the second highest priced product has a rank of 2

-- let's look at the syntax

-- RANK() is a function that takes no parameters
-- but it requires an OVER clause

-- so this doesn't work, giving an error
SELECT   ProductName,
         UnitPrice,
         RANK()  -- no over
FROM     Products
ORDER BY UnitPrice DESC

-- the OVER clause tells the RANK how to do its ranking

-- in our example, the phrase (ORDER BY unitprice DESC) means
-- "When calculating RANK, rank based on price, descending."

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY UnitPrice DESC

-- we could also choose to rank the products by productname:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY ProductName ASC) AS [name rank]
FROM     Products
ORDER BY UnitPrice DESC

-- note that I didn't change the record ordering
-- so its hard to tell if the rank is correct
-- let's fix that:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY ProductName ASC) AS [name rank]
FROM     Products
ORDER BY ProductName ASC

-- so now I can visually verify the ranking

-- but, this makes demonstrates something important:

-- the ORDER BY in an OVER clause only affects the ranking function
-- it is independent of the ORDER BY for the SELECT statement

-- in other words, you can rank in one way, and show records another

-- for example, you might want to show products alphabetically,
-- but also show their rank in terms of price:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY ProductName ASC


-- now consider this

SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY unitprice DESC

-- notice that Ipoh Coffee is ranked 9th overall
-- but is also the 2nd highest priced product in CategoryID 1

-- so, I could do this:
SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
WHERE    CategoryID = 1
UNION
SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
WHERE    CategoryID = 2
ORDER BY unitprice DESC
-- etc.

-- but that's a pain
-- and is really slow at scale

-- let's use the OVER clause to tell RANK to *rank by category*

SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank overall],
         RANK() OVER (PARTITION BY CategoryID ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
ORDER BY unitprice DESC

-- here it is formatted to show the difference:
SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (                        ORDER BY unitprice DESC) AS [price rank overall],
         RANK() OVER (PARTITION BY CategoryID ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
ORDER BY unitprice DESC

-- here, we've calculated rank in two different ways
-- the PARTITION BY says: rank against other products with the same CategoryID

-- now let's do it again, but rank products
-- against other products from the same supplier

-- and to prove that the rankings are true,
-- we can order the records in different ways:

SELECT   ProductName,
         CategoryID,
         SupplierID,
         UnitPrice,
         RANK() OVER (                        ORDER BY unitprice DESC) AS [price rank overall],
         RANK() OVER (PARTITION BY CategoryID ORDER BY unitprice DESC) AS [price rank in category],
         RANK() OVER (PARTITION BY SupplierID ORDER BY unitprice DESC) AS [price rank by supplier]
FROM     Products
--ORDER BY unitprice DESC
--ORDER BY CategoryID, unitprice DESC
ORDER BY SupplierID, unitprice DESC

-- one final example
-- let's rank by dollar value in inventory, within category

SELECT   ProductName,
         CategoryID,
         unitprice * UnitsInStock AS [dollar value in inventory],
         RANK() OVER (PARTITION BY CategoryID
                      ORDER BY  unitprice * UnitsInStock DESC,
                                ProductID ASC) AS [value rank by category] 
FROM     Products
ORDER BY CategoryID,
         [dollar value in inventory] DESC

-- notice that the ORDER BY in the OVER can be an expression

-- in the future videos, I'll cover ROW_NUMBER(), DENSE_RANK(), and NTILE(n)
-- they are all ranking functions
-- and they all use the OVER clause in exactly the same way
-- but they calculate a "rank" in various ways

-- Database by Doug
-- Douglas Kline
-- 10/10/2017
-- RANKing functions, Part 1: OVER

Quick! What's the difference between RANK, DENSE_RANK, and ROW_NUMBER?

-- Database by Doug
-- Douglas Kline
-- 10/12/2017
-- Quick! What's the difference between RANK, DENSE_RANK, and ROW_NUMBER?

-- in short, they are only different when there are ties...

-- here's a table that will help show the difference
-- between the ranking functions

-- note the [Score] column, 
-- it will be the basis of the ranking

SELECT   [Name],
         [Score]
FROM     (VALUES  ('A', 100),
                  ('B', 101), 
                  ('C', 101), 
                  ('D', 101), 
                  ('E', 102), 
                  ('F', 103) ) AS Temp([Name], [Score])
ORDER BY [Score] ASC

-- first, ROW_NUMBER

SELECT   [Name],
         [Score],
         ROW_NUMBER() OVER (ORDER BY [Score]) AS [row_number]
FROM     (VALUES  ('A', 100),
                  ('B', 101), 
                  ('C', 101), 
                  ('D', 101), 
                  ('E', 102), 
                  ('F', 103) ) AS Temp([Name], [Score])
ORDER BY [Score] ASC

-- row number gives every item a unique number
-- it does NOT recognize ties
-- no numbers are repeated, no numbers are skipped

-- next, RANK()

SELECT   [Name],
         [Score],
         ROW_NUMBER() OVER (ORDER BY [Score]) AS [row_number],
         RANK()       OVER (ORDER BY [Score]) AS [rank]
FROM     (VALUES  ('A', 100),
                  ('B', 101), 
                  ('C', 101), 
                  ('D', 101), 
                  ('E', 102), 
                  ('F', 103) ) AS Temp([Name], [Score])
ORDER BY [Score] ASC

-- RANK() recognizes "ties" by repeating the same rank value
-- but then skipping to the next row_number
-- note the skip from 2,2,2 to rank of 5

-- now look at dense rank

SELECT   [Name],
         [Score],
         ROW_NUMBER() OVER (ORDER BY [Score]) AS [row_number],
         RANK()       OVER (ORDER BY [Score]) AS [rank],
         DENSE_RANK() OVER (ORDER BY [Score]) AS [dense_rank]
FROM     (VALUES  ('A', 100),
                  ('B', 101), 
                  ('C', 101), 
                  ('D', 101), 
                  ('E', 102), 
                  ('F', 103) ) AS Temp([Name], [Score])
ORDER BY [Score] ASC

-- dense_rank handles ties
-- but does not "skip" ranks
-- note the 2, 2, 2 then 3

-- row_number, rank, and dense_rank 
-- are the SAME when there are no ties

-- to prove it, see this, where the Scores are all unique

SELECT   [Name],
         [Score],
         ROW_NUMBER() OVER (ORDER BY [Score]) AS [row_number],
         RANK()       OVER (ORDER BY [Score]) AS [rank],
         DENSE_RANK() OVER (ORDER BY [Score]) AS [dense_rank]
FROM     (VALUES  ('A', 100),
                  ('B', 101.0), -- added a decimal point 
                  ('C', 101.1), -- to break the ties
                  ('D', 101.2), 
                  ('E', 102), 
                  ('F', 103) ) AS Temp([Name], [Score])
ORDER BY [Score] ASC

-- Database by Doug
-- Douglas Kline
-- 10/12/2017
-- Quick! What's the difference between RANK, DENSE_RANK, and ROW_NUMBER?


Wednesday, March 15, 2017

Monday, March 13, 2017

Ordering mixed Alpha and Digit Characters

Suppose you have a VARCHAR column that has a mix of alpha and digit characters. In other words, the values represent different types across rows.

Like this:

SELECT   mixedColumn
FROM     (VALUES ('fred'), 
             (CONVERT (VARCHAR, 1)), 
             (CONVERT (VARCHAR, 3)), 
             ('george'), 
             (CONVERT (VARCHAR, 7)), 
             ('ginger') 
         ) AS myTable(mixedColumn)

Which produces:













So how does this get sorted, by default?

SELECT   mixedAlphaColumn
FROM     (VALUES ('fred'), 
             (CONVERT (VARCHAR, 1)), 
             (CONVERT (VARCHAR, 3)), 
             ('george'), 
             (CONVERT (VARCHAR, 7)), 
             ('ginger') 
         ) AS myTable(mixedAlphaColumn)
ORDER BY mixedAlphaColumn

Which produces:













This gets ordered based on its ASCII code value. Since all ASCII digit characters come before alpha characters, digits will come before alpha when using ORDER BY.  You can see this with this code:

SELECT   mixedColumn,
         ASCII(LEFT(mixedColumn, 1)) AS [ASCII Value]
FROM     (VALUES ('fred'), 
             (CONVERT (VARCHAR, 1)), 
             (CONVERT (VARCHAR, 3)), 
             ('george'), 
             (CONVERT (VARCHAR, 7)), 
             ('ginger') 
         ) AS myTable(mixedColumn)
ORDER BY mixedColumn

Which produces:












But what if we want to have the digit characters come after the alpha characters?

We could do this:

SELECT   mixedColumn,
         ASCII(LEFT(mixedColumn, 1)) AS [ASCII Value]
FROM     (VALUES ('fred'), 
             (CONVERT (VARCHAR, 1)), 
             (CONVERT (VARCHAR, 3)), 
             ('george'), 
             (CONVERT (VARCHAR, 7)), 
             ('ginger') 
         ) AS myTable(mixedColumn)
ORDER BY mixedColumn DESC

Which produces this:













But what if we want the digits ordered ascending, and the alpha ordered ascending, but the alpha before the digits?

Here's the trick - the ISNUMERIC function.

SELECT   mixedColumn,
         ISNUMERIC(mixedColumn) AS [isNumeric]
FROM     (VALUES ('fred'), 
             (CONVERT (VARCHAR, 1)), 
             (CONVERT (VARCHAR, 3)), 
             ('george'), 
             (CONVERT (VARCHAR, 7)), 
             ('ginger') 
         ) AS myTable(mixedColumn)
ORDER BY ISNUMERIC(mixedColumn),
         mixedColumn

Which produces:













And of course, we don't need to display the isNumeric column:

SELECT   mixedColumn
FROM     (VALUES ('fred'), 
             (CONVERT (VARCHAR, 1)), 
             (CONVERT (VARCHAR, 3)), 
             ('george'), 
             (CONVERT (VARCHAR, 7)), 
             ('ginger') 
         ) AS myTable(mixedColumn)
ORDER BY ISNUMERIC(mixedColumn),
         mixedColumn

Which produces:













The ISNUMERIC function became available in SQL 2008. However, be careful, since certain characters that you might expect to be alpha are actually interpreted as digit, e.g. periods, commas, plus sign, etc.


Followers