-- Database by Doug
-- Douglas Kline
-- 10/9/2018
-- DISTINCT - removing duplicates, but in a "dumb" way
-- consider this
SELECT ProductID,
ProductName,
SupplierID
FROM Products
-- this is a list of all products
-- note the repeats in the SupplierID
-- suppose I want top know the list of suppliers
-- in the Products table
SELECT SupplierID
FROM Products
-- again, notice the repeats
-- to remove the repeats, I can do this:
SELECT DISTINCT SupplierID
FROM Products
-- you might be saying, why not just do this:
SELECT SupplierID
FROM Suppliers
-- my response is: that's a different list
-- it's the list of all suppliers in the supplier table
-- What I'm looking for is the list of all suppliers
-- in the products table
-- in other words, all suppliers which we actually *use*
-- Let's look at another example, with a table I've created
SELECT firstName
FROM Person
ORDER BY firstName
-- this is a list of all people's first names
-- as we scroll down, we'll start to see repeats
-- in other words, multiple people have the same first name
-- now let's say we want a list of all first distinct firstnames
SELECT DISTINCT firstName
FROM Person
ORDER BY firstName
-- so now there isn't a record returned for every Person record
-- there's a record returned for every unique firstname
-- also note that NULL is considered to be a unique firstname
-- you might say, why not use Group By to do this?
-- like this:
SELECT firstName
FROM Person
GROUP BY firstName
ORDER BY firstName
-- logically, it returns the exact same records, and always will
-- however, GROUP BY does a lot more work
-- it actually sets up groups of records in preparation to
-- calculate aggregates like SUM, COUNT, AVG, etc.
-- DISTINCT is much faster
-- if it sees a value it has seen before, it just throws it out
-- in other words, it doesn't group the records
-- it just makes a list of unique values
-- so, don't use GROUP BY when what you really need is DISTINCT
-- alright, what if you want to count stuff?
SELECT COUNT(ID) AS [# of people],
COUNT(DISTINCT ID) AS [# of distinct primary key values],
COUNT(firstName) AS [# of people with non-NULL firstnames],
COUNT(DISTINCT firstName) AS [# of distinct firstNames]
FROM Person
-- note that the first two values are always the same,
-- since primary key values are distinct aka unique
-- and also notice that there are 599 distinct first names, but recall
SELECT DISTINCT firstName
FROM Person
ORDER BY firstName
-- so why is the COUNT(DISTINCT firstname) = 599
-- but DISTINCT firstname gives 600 records?
-- remember that COUNT counts non-NULL values
-- finally, DISTINCT is sort of "dumb", in that it doesn't
-- know anything about primary keys
-- or anything about the underlying table(s)
-- it only considers values from the fields you provide
-- consider this:
SELECT DISTINCT firstname
FROM Person
-- it doesn't give distinct Person records, just distinct firstnames
-- now this:
SELECT DISTINCT lastname
FROM Person
-- and the distinct applies to the *combination*
-- of all the fields in the SELECT clause
-- in this example,
-- all distinct *combinations* of gender and firstname are shown
SELECT DISTINCT gender,
firstname
FROM Person
ORDER BY gender,
firstname
-- in summary,
-- distinct removes duplicates
-- it removes duplicates based on all fields in the SELECT list
-- when used with COUNT, it will not count duplicate values
-- thanks for watching!
-- Database by Doug
-- Douglas Kline
-- 10/9/2018
-- DISTINCT - removing duplicates, but in a "dumb" way
Wednesday, October 10, 2018
Using DISTINCT in SQL
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment