Monday, November 12, 2018

Understanding Relational Division in SQL



Here is the script to create the JobSkills database, with data, used in this example.

-- Database by Doug
-- Douglas Kline
-- 11/10/2018
-- Understanding Relational Division - a (relatively) simple example

USE JobSkills

-- show the diagram of the data model
-- explain

-- here are our applicants (note - 4 of them)

SELECT   *
FROM     Applicant

-- here are our skills (note - 4 of them)

SELECT   *
FROM     Skill

-- now what if every applicant had every skill
-- that would mean that there would be 16 (4 x 4)
-- records in the ApplicantSkill table
-- it would look something like this:

SELECT         *
FROM           Applicant
   CROSS JOIN  Skill
ORDER BY       Applicant.applicantID,
               Skill.skillID

-- if you haven't seen this before,
-- this is a cross join 
-- a cross join of two tables creates all possible combinations
-- of the records in the two tables
-- with no condition - no requirement on lining up FKs and PKs

-- this is known as the *cartesian product* of all records in both tables
-- and you might recall that *product* is another way of saying 
-- 'the result of multiplication'

-- in other words, CROSS JOIN is kind of *relational multiplication*
-- sort of (Applicant records) x (Skill records)

-- think of this in math terms a * b = c
-- where a is Applicants
-- b is Skills
-- and c is the result

-- now if you remember your algebra,
-- if a * b = c
-- that means that c / b = a

-- in our context 4 records times 4 records equals 16
-- so 16 divided by 4 should equal 4

-- let's try to do that!
-- I'm going to use a common table expression (CTE),
-- so you can tell there's nothing up my sleeve!
-- note that this is exactly the CROSS JOIN from above

; WITH [c] (applicantID, lastName, skillID, skillName)
AS
(
   SELECT         *
   FROM           Applicant
      CROSS JOIN  Skill
)
SELECT   applicantID,
         lastName
FROM     [c]
GROUP BY applicantID,
         lastName  
HAVING   COUNT(skillID) = (SELECT COUNT(*) FROM Skill)

-- this could also be rephrased as a subquery like this:

 SELECT   applicantID,
         lastName
FROM     (
            SELECT         *
            FROM           Applicant
               CROSS JOIN  Skill
          ) [c]
GROUP BY applicantID,
         lastName  
HAVING   COUNT(skillID) = (SELECT COUNT(*) FROM Skill)

-- the logic is that I'm showing all applicants
-- that have the same number of skills
-- as are in the skill table

-- so if CROSS JOIN is the "relational multiplication" operator
-- what is the "relational division" operator?

-- unfortunately, there is no relational division operator
-- that's why we are using HAVING and COUNT to get the logical equivalent

-- summary so far
-- 3 applicants CROSS JOIN 4 skills = 12 results, e.g., everyone has every skill
-- then 12 results divided by 4 skills = 3 applicants, the original applicants

-- but this is maybe an unrealistically simple example
-- not all applicants will have all skills

-- what we'll have in reality is a job that requires a certain set of skills
-- then we'll have applicants that each have a certain set of skills
-- and what we want is a set of applicants that whose skills *match* the requirements

-- here is the data model

-- let's start simply, with a single job that requires two skills
-- and a set of applicants that each have a set of skills

-- I've cooked some data for that...
-- examining jobskill...

SELECT   Job.jobID,
         Job.jobName,
         Skill.SkillID,
         Skill.SkillName
FROM     Job
   JOIN  JobSkill ON Job.jobID = JobSkill.jobID
   JOIN  Skill    ON JobSkill.skillID = Skill.skillID
WHERE    Job.JobID = 10
ORDER BY Job.jobID, 
         Skill.SkillID

-- this says that the software developer job 
-- requires the sql and javascript skills

-- now let's cook up some applicants with some skills

SELECT   Applicant.applicantID,
         Applicant.lastName,
         Skill.skillID,
         Skill.skillName
FROM     Applicant
   JOIN  ApplicantSkill ON Applicant.applicantID = ApplicantSkill.applicantID
   JOIN  Skill          ON ApplicantSkill.skillID = Skill.skillid
ORDER BY Applicant.applicantID,
         Skill.skillID

-- so it looks like viable candidates are Jones and Brown
-- Jones has exactly the right skills and no more
-- Brown has the right skills, plus one

-- so to find a candidate that satisfies our job...
-- a * b = c
-- candidateSkills = jobskills
-- candidates * skills = jobskills
-- candidates = jobskills / skills

-- this analogy is not perfect, in that * is not really a cross join
-- and candidateskills and jobskills are actual records, 
-- for which we are trying to find alignment via joins
--
-- we'll solve this with the COUNT method we used before

SELECT   Applicant.applicantID,
         Applicant.lastName,
         COUNT(jobSkill.SkillID) AS [MatchingSkillCount]
FROM     ApplicantSkill
   JOIN  JobSkill ON ApplicantSkill.skillID = JobSkill.SkillID
   JOIN  Applicant ON Applicant.ApplicantID = ApplicantSkill.ApplicantID
WHERE    JobSkill.JobID = 10
GROUP BY Applicant.applicantID,
         Applicant.lastName
HAVING   COUNT(jobSkill.SkillID) = (SELECT COUNT(*) FROM JobSkill WHERE JobSkill.jobID = 10)
ORDER BY Applicant.applicantID

-- note that the outer query limits the applicant skill count to only
-- skills that match exactly the skills for this job
-- and the subquery does the same

-- let's look at this in a more general sense:

SELECT   Job.jobID,
         Job.jobName,
         Skill.SkillID,
         Skill.SkillName
FROM     Job
   JOIN  JobSkill ON Job.jobID = JobSkill.jobID
   JOIN  Skill    ON JobSkill.skillID = Skill.skillID
ORDER BY Job.jobID, 
         Skill.SkillID

-- two jobs, each of which require two skills
-- disjoint skill set

-- same set of applicants
SELECT   Applicant.applicantID,
         Applicant.lastName,
         Skill.skillID,
         Skill.skillName
FROM     Applicant
   JOIN  ApplicantSkill ON Applicant.applicantID = ApplicantSkill.applicantID
   JOIN  Skill          ON ApplicantSkill.skillID = Skill.skillid
ORDER BY Applicant.applicantID,
         Skill.skillID

-- and the number of matching applicant skills for each job

SELECT   Applicant.applicantID,
         Applicant.lastName,
         Job.JobID,
         Job.jobName,
         COUNT(ApplicantSkill.SkillID) AS [MatchingJobSkillCount]
FROM     ApplicantSkill
   JOIN  JobSkill ON ApplicantSkill.skillID = JobSkill.SkillID -- this is *key*
   JOIN  Job      ON Job.jobID = JobSkill.JobID
   JOIN  Applicant ON Applicant.ApplicantID = ApplicantSkill.ApplicantID
GROUP BY Applicant.applicantID,
         Applicant.lastName,
         Job.JobID,
         Job.jobName
ORDER BY Applicant.applicantID,
         Job.JobID

-- now we'd like to limit this to the applicants with jobskill count
-- equal to the jobskill count of each job
-- the trick here is that the the HAVING subquery must be correlated

SELECT   Applicant.applicantID,
         Applicant.lastName,
         Job.JobID,
         Job.jobName,
         COUNT(ApplicantSkill.SkillID) AS [MatchingJobSkillCount]
FROM     ApplicantSkill
   JOIN  JobSkill ON ApplicantSkill.skillID = JobSkill.SkillID -- this is *key*
   JOIN  Job      ON Job.jobID = JobSkill.JobID
   JOIN  Applicant ON Applicant.ApplicantID = ApplicantSkill.ApplicantID
GROUP BY Applicant.applicantID,
         Applicant.lastName,
         Job.JobID,
         Job.jobName
HAVING   COUNT(ApplicantSkill.SkillID) = (SELECT COUNT(js.skillID)
                                          FROM   JobSkill js
                                          WHERE  js.jobID = job.jobID)
ORDER BY Applicant.applicantID,
         Job.JobID

-- perhaps this is clearer as a CTE:
; WITH [JobWithSkillCount] (jobID, skillCount)
AS
(
   SELECT   jobID,
            COUNT(skillID)
   FROM     JobSkill
   GROUP BY jobID 
)
SELECT   Applicant.applicantID,
         Applicant.lastName,
         Job.JobID,
         Job.jobName,
         COUNT(ApplicantSkill.SkillID) AS [MatchingJobSkillCount]
FROM     ApplicantSkill
   JOIN  JobSkill ON ApplicantSkill.skillID = JobSkill.SkillID -- this is *key*
   JOIN  Job      ON Job.jobID = JobSkill.JobID
   JOIN  Applicant ON Applicant.ApplicantID = ApplicantSkill.ApplicantID
   JOIN  [JobWithSkillCount] ON [JobWithSkillCount].jobID = job.jobID
GROUP BY Applicant.applicantID,
         Applicant.lastName,
         Job.JobID,
         Job.jobName,
         JobWithSkillCount.skillCount
HAVING   COUNT(ApplicantSkill.SkillID) = JobWithSkillCount.skillCount
ORDER BY Applicant.applicantID,
         Job.JobID

-- hopefully, you find this a helpful example for understanding relational division

-- A couple last comments

-- I've presented this from the employer point of view:
   -- applicants who match job requirements
-- but it is exactly equivalent form an applicant point of view:
-- jobs that match skillsets

-- I'm not showing the "remainder" of the division
-- for example, Brown actually has *extra* skills!
-- this example shows applicants with the exact matching skillsets
-- and some may have extra skills

-- this is not necessarily the *most efficient* way to
-- do relational division
-- my goal was to create an example for understanding relational division

-- an performant solution would of course depend on many things:
-- the data volume, its statistical distribution, available indexes,
-- the schema, etc.

-- I hope that you found this helpful!

-- Database by Doug
-- Douglas Kline
-- 11/10/2018
-- Understanding Relational Division - a (relatively) simple example



JobSkills database schema and data for Understanding Relational Division

The script below creates a database called JobSkills with the following schema:


The script also creates records in each table that are consistent with the following demonstration:
Understanding Relational Division in SQL


USE [JobSkills]
GO
/****** Object:  Table [dbo].[Applicant]    Script Date: 11/12/2018 9:10:48 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[Applicant](
	[applicantID] [int] NOT NULL,
	[lastName] [varchar](50) NOT NULL,
 CONSTRAINT [PK_Applicant] PRIMARY KEY CLUSTERED 
(
	[applicantID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
/****** Object:  Table [dbo].[ApplicantSkill]    Script Date: 11/12/2018 9:10:49 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[ApplicantSkill](
	[employeeSkillID] [int] NOT NULL,
	[applicantID] [int] NOT NULL,
	[skillID] [int] NOT NULL,
 CONSTRAINT [PK_ApplicantSkill] PRIMARY KEY CLUSTERED 
(
	[employeeSkillID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
/****** Object:  Table [dbo].[Job]    Script Date: 11/12/2018 9:10:49 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[Job](
	[jobID] [int] NOT NULL,
	[jobName] [varchar](50) NOT NULL,
 CONSTRAINT [PK_Job] PRIMARY KEY CLUSTERED 
(
	[jobID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
/****** Object:  Table [dbo].[JobSkill]    Script Date: 11/12/2018 9:10:49 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[JobSkill](
	[jobSkillID] [int] NOT NULL,
	[jobID] [int] NOT NULL,
	[skillID] [int] NOT NULL,
 CONSTRAINT [PK_JobSkill] PRIMARY KEY CLUSTERED 
(
	[jobSkillID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
/****** Object:  Table [dbo].[Skill]    Script Date: 11/12/2018 9:10:49 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[Skill](
	[skillID] [int] NOT NULL,
	[skillName] [varchar](50) NOT NULL,
 CONSTRAINT [PK_Skill] PRIMARY KEY CLUSTERED 
(
	[skillID] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
) ON [PRIMARY]
GO
INSERT [dbo].[Applicant] ([applicantID], [lastName]) VALUES (100, N'Kline')
GO
INSERT [dbo].[Applicant] ([applicantID], [lastName]) VALUES (101, N'Smith')
GO
INSERT [dbo].[Applicant] ([applicantID], [lastName]) VALUES (102, N'Jones')
GO
INSERT [dbo].[Applicant] ([applicantID], [lastName]) VALUES (103, N'Brown')
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1000, 100, 1)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1001, 100, 2)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1002, 101, 3)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1003, 102, 1)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1004, 102, 4)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1005, 103, 1)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1006, 103, 4)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1007, 103, 3)
GO
INSERT [dbo].[ApplicantSkill] ([employeeSkillID], [applicantID], [skillID]) VALUES (1008, 103, 2)
GO
INSERT [dbo].[Job] ([jobID], [jobName]) VALUES (10, N'Software Developer')
GO
INSERT [dbo].[Job] ([jobID], [jobName]) VALUES (11, N'Business Analyst')
GO
INSERT [dbo].[Job] ([jobID], [jobName]) VALUES (12, N'Data Analyst')
GO
INSERT [dbo].[JobSkill] ([jobSkillID], [jobID], [skillID]) VALUES (100, 10, 1)
GO
INSERT [dbo].[JobSkill] ([jobSkillID], [jobID], [skillID]) VALUES (101, 10, 4)
GO
INSERT [dbo].[JobSkill] ([jobSkillID], [jobID], [skillID]) VALUES (102, 11, 2)
GO
INSERT [dbo].[JobSkill] ([jobSkillID], [jobID], [skillID]) VALUES (103, 11, 3)
GO
INSERT [dbo].[Skill] ([skillID], [skillName]) VALUES (1, N'SQL')
GO
INSERT [dbo].[Skill] ([skillID], [skillName]) VALUES (2, N'Public Speaking')
GO
INSERT [dbo].[Skill] ([skillID], [skillName]) VALUES (3, N'Project Management')
GO
INSERT [dbo].[Skill] ([skillID], [skillName]) VALUES (4, N'Javascript')
GO
ALTER TABLE [dbo].[ApplicantSkill]  WITH CHECK ADD  CONSTRAINT [FK_ApplicantSkill_Applicant] FOREIGN KEY([applicantID])
REFERENCES [dbo].[Applicant] ([applicantID])
GO
ALTER TABLE [dbo].[ApplicantSkill] CHECK CONSTRAINT [FK_ApplicantSkill_Applicant]
GO
ALTER TABLE [dbo].[ApplicantSkill]  WITH CHECK ADD  CONSTRAINT [FK_ApplicantSkill_Skill] FOREIGN KEY([skillID])
REFERENCES [dbo].[Skill] ([skillID])
GO
ALTER TABLE [dbo].[ApplicantSkill] CHECK CONSTRAINT [FK_ApplicantSkill_Skill]
GO
ALTER TABLE [dbo].[JobSkill]  WITH CHECK ADD  CONSTRAINT [FK_JobSkill_Job] FOREIGN KEY([jobID])
REFERENCES [dbo].[Job] ([jobID])
GO
ALTER TABLE [dbo].[JobSkill] CHECK CONSTRAINT [FK_JobSkill_Job]
GO
ALTER TABLE [dbo].[JobSkill]  WITH CHECK ADD  CONSTRAINT [FK_JobSkill_Skill] FOREIGN KEY([skillID])
REFERENCES [dbo].[Skill] ([skillID])
GO
ALTER TABLE [dbo].[JobSkill] CHECK CONSTRAINT [FK_JobSkill_Skill]
GO

Followers