Thursday, October 12, 2017

SQL Ranking Functions, Part 1: the OVER clause

-- Database by Doug
-- Douglas Kline
-- 10/10/2017
-- RANKing functions, Part 1: OVER

USE Northwind

-- here's a simple SELECT statement from the Products table

SELECT   ProductName,
         UnitPrice
FROM     Products
ORDER BY UnitPrice DESC

-- this shows that the highest priced product is Cote de Blaye, productID 38

-- but sometimes the *relative* price is more important than the actual price
-- in other words, we want to know how products *rank*, based on price

-- So, here's how you would do that in SQL Server:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY UnitPrice DESC

-- notice that the highest priced product is ranked 1
-- the second highest priced product has a rank of 2

-- let's look at the syntax

-- RANK() is a function that takes no parameters
-- but it requires an OVER clause

-- so this doesn't work, giving an error
SELECT   ProductName,
         UnitPrice,
         RANK()  -- no over
FROM     Products
ORDER BY UnitPrice DESC

-- the OVER clause tells the RANK how to do its ranking

-- in our example, the phrase (ORDER BY unitprice DESC) means
-- "When calculating RANK, rank based on price, descending."

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY UnitPrice DESC

-- we could also choose to rank the products by productname:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY ProductName ASC) AS [name rank]
FROM     Products
ORDER BY UnitPrice DESC

-- note that I didn't change the record ordering
-- so its hard to tell if the rank is correct
-- let's fix that:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY ProductName ASC) AS [name rank]
FROM     Products
ORDER BY ProductName ASC

-- so now I can visually verify the ranking

-- but, this makes demonstrates something important:

-- the ORDER BY in an OVER clause only affects the ranking function
-- it is independent of the ORDER BY for the SELECT statement

-- in other words, you can rank in one way, and show records another

-- for example, you might want to show products alphabetically,
-- but also show their rank in terms of price:

SELECT   ProductName,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY ProductName ASC


-- now consider this

SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank]
FROM     Products
ORDER BY unitprice DESC

-- notice that Ipoh Coffee is ranked 9th overall
-- but is also the 2nd highest priced product in CategoryID 1

-- so, I could do this:
SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
WHERE    CategoryID = 1
UNION
SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
WHERE    CategoryID = 2
ORDER BY unitprice DESC
-- etc.

-- but that's a pain
-- and is really slow at scale

-- let's use the OVER clause to tell RANK to *rank by category*

SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (ORDER BY unitprice DESC) AS [price rank overall],
         RANK() OVER (PARTITION BY CategoryID ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
ORDER BY unitprice DESC

-- here it is formatted to show the difference:
SELECT   ProductName,
         CategoryID,
         UnitPrice,
         RANK() OVER (                        ORDER BY unitprice DESC) AS [price rank overall],
         RANK() OVER (PARTITION BY CategoryID ORDER BY unitprice DESC) AS [price rank in category]
FROM     Products
ORDER BY unitprice DESC

-- here, we've calculated rank in two different ways
-- the PARTITION BY says: rank against other products with the same CategoryID

-- now let's do it again, but rank products
-- against other products from the same supplier

-- and to prove that the rankings are true,
-- we can order the records in different ways:

SELECT   ProductName,
         CategoryID,
         SupplierID,
         UnitPrice,
         RANK() OVER (                        ORDER BY unitprice DESC) AS [price rank overall],
         RANK() OVER (PARTITION BY CategoryID ORDER BY unitprice DESC) AS [price rank in category],
         RANK() OVER (PARTITION BY SupplierID ORDER BY unitprice DESC) AS [price rank by supplier]
FROM     Products
--ORDER BY unitprice DESC
--ORDER BY CategoryID, unitprice DESC
ORDER BY SupplierID, unitprice DESC

-- one final example
-- let's rank by dollar value in inventory, within category

SELECT   ProductName,
         CategoryID,
         unitprice * UnitsInStock AS [dollar value in inventory],
         RANK() OVER (PARTITION BY CategoryID
                      ORDER BY  unitprice * UnitsInStock DESC,
                                ProductID ASC) AS [value rank by category] 
FROM     Products
ORDER BY CategoryID,
         [dollar value in inventory] DESC

-- notice that the ORDER BY in the OVER can be an expression

-- in the future videos, I'll cover ROW_NUMBER(), DENSE_RANK(), and NTILE(n)
-- they are all ranking functions
-- and they all use the OVER clause in exactly the same way
-- but they calculate a "rank" in various ways

-- Database by Doug
-- Douglas Kline
-- 10/10/2017
-- RANKing functions, Part 1: OVER

1 comment:

Followers