View on GitHub

Emile's Notes

Data Science/Programming notes

Intermediate SQL

CASE statements

Simple CASE statements

 SELECT
    id,
    home_goal,
    away_goal,
    CASE WHEN home_goal > away_goal THEN 'Home Team Win'
         WHEN home_goal < away_goal THEN 'Away Team Win'
         ELSE 'Tie' END AS outcome
FROM match
WHERE season = '2013/2014';

More complex logic

SELECT date, hometeam_id, awayteam_id,
   CASE WHEN hometeam_id = 8455 AND home_goal > away_goal
         THEN 'Chelsea home win!'
      WHEN awayteam_id = 8455 AND home_goal < away_goal
         THEN ' Chelsea away win!'
      ELSE 'Loss or tie :)' END AS outcome
FROM match
WHERE hometeam_id = 8455 OR awayteam_id = 8455;

Filter out NULLs

If we do not specify an ELSE clause, then any rows where the conditions set are not met will yield NULL. We can filter these by using a WHERE clause to select only rows where the result is NOT NULL.

SELECT date, season,
   CASE WHEN hometeam_id = 8455 AND home_goal > away_goal
         THEN 'Chelsea home win!'
      WHEN awayteam_id = 8455 AND home_goal < away_goal
         THEN ' Chelsea away win!'
   END AS outcome
FROM match
WHERE CASE WHEN hometeam_id = 8455 AND home_goal > away_goal
         THEN 'Chelsea home win!'
      WHEN awayteam_id = 8455 AND home_goal < away_goal
         THEN ' Chelsea away win!'
      END IS NOT NULL;

CASE WHEN with aggregate functions

CASE WHEN with COUNT:

SELECT
   season,
   COUNT(CASE WHEN hometeam_id = 8650
            AND home_goal > away_goal
            THEN id END) AS home_wins,
   COUNT(CASE WHEN awayteam_id = 8650
            AND away_goal > home_goal
            THEN id END) AS away_wins
FROM match
GROUP BY season;

CASE WHEN with SUM:

SELECT
   season,
   SUM(CASE WHEN hometeam_id = 8650
         THEN home_goal END) AS home_goals,
   SUM(CASE WHEN awayteam_id = 8650
         THEN away_goal END) AS away_goals
FROM match
GROUP BY season;

CASE WHEN with AVERAGE:

SELECT
   season,
   ROUND(AVG(CASE WHEN hometeam_id = 8650
         THEN home_goal END),2) AS home_goals,
   ROUND(AVG(CASE WHEN awayteam_id = 8650
         THEN away_goal END),2) AS away_goals
FROM match
GROUP BY season;

Percentages with CASE and AVG:

SELECT
   season,
   AVG(CASE WHEN hometeam_id = 8455 AND home_goal > away_goal THEN 1
            WHEN hometeam_id = 8455 AND home_goal < away_goal THEN 0
      END) AS pct_homewins,
   AVG(CASE WHEN awayteam_id = 8455 AND away_goal > home_goal THEN 1
            WHEN awayteam_id = 8455 AND away_goal < home_goal THEN 0
      END) AS pct_awaywins
FROM match
GROUP BY season;

Subqueries

What is a subquery?

SELECT column
FROM (SELECT column
      FROM table) AS subquery;

Simple subqueries

SELECT home_goal
FROM match
WHERE home_goal > (
   SELECT AVG(home_goal)
   FROM match);
SELECT
   team_long_name,
   team_short_name AS abr
FROM team
WHERE
   team_api_id IN
   (SELECT hometeam_id
   FROM match
   WHERE country_id = 15722)

Subqueries in FROM

SELECT team, home_avg
FROM (SELECT
         t.team_long_name AS team,
         AVG(m.home_goal) AS home_avg
      FROM match AS m
      LEFT JOIN team AS t
      ON m.hometeam_id = t.team_api_id
      WHERE season = '2011/2012'
      GROUP BY team) AS subquery
ORDER BY home_avg DESC,
LIMIT 3;

Subqueries in SELECT

SELECT
   season,
   COUNT(id) AS matches,
   (SELECT COUNT(id) FROM match) as total_matches
FROM match
GROUP BY season;
SELECT
   date
   (home_goal + away_goal) AS goals,
   (home_goal + away_goal) -
      (SELECT AVG(home_goal + away_goal)
      FROM match
      WHERE season = '2011/2012') AS diff
FROM match
WHERE season = '2011/2012';

Multiple subqueries

SELECT
   country_id,
   ROUND(AVG(matches.home_goal + matches.away_goal),2) AS avg_goals,
   (SELECT ROUND(AVG(home_goal + away_goal),2)
   FROM match WHERE season = '2013/2014') AS overall_avg
FROM (SELECT
         id,
         home_goal,
         away_goal,
         season
      FROM match
      WHERE home_goal > 5) AS matches
WHERE matches.season = '2013/2014'
   AND (AVG(matches.home_goal + matches.away_goal) >
       (SELECT AVG(home_goal + away_goal)
       FROM match WHERE season = '2013/2014')
GROUP BY country_id;

Correlated Queries, Nested Queries and CTEs

Correlated Subqueries

-- Which match stages tend to have higher than average number of goals scored

-- Simple Query
SELECT
   s.stage,
   ROUND(s.avg_goal,2) AS avg_goal,
   (SELECT AVG(home_goal + away_goal)
   FROM match
   WHERE season = '2012/2013') AS overall_avg
FROM (SELECT
         stage,
         AVG(home_goal + away_goal) AS avg_goals
      FROM match
      WHERE season = '2012/2013'
      GROUP BY stage) AS s
WHERE s.avg_goals > (SELECT AVG(home_goal + away_goal)
                     FROM match
                     WHERE season = '2012/2013');

-- Correlated Query
SELECT
   s.stage,
   ROUND(s.avg_goals,2) AS avg_goal,
   (SELECT AVG(home_goal + away+goal)
   FROM match
   WHERE season = '2012/2013') AS overall_avg
FROM
   (SELECT
      stage,
      AVG(home_goal + away_goal) AS  avg_goals
   FROM match
   WHERE season = '2012/2013'
   GROUP BY stage) AS s
WHERE s.avg_goals > (SELECT AVG(home_goal + away_goal)
                     FROM match AS m
                     WHERE s.stage > m.stage);

Differences:

Simple Subquery Correlated
Can be run independently from the main query Dependent on the main query to execute
Evaluated once in the whole query Evaluated in loops - significantly slows down runtime
-- What is the average number of goals scored in each country

-- Simple Subquery
SELECT
   c.name AS country,
   AVG(m.home_goal + m.away_goal)
      AS avg_goals
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
GROUP BY country;


-- Correlated Subquery
SELECT
   c.name AS country,
   (SELECT
      AVG(home_goal + away_goal)
    FROM match AS m
    WHERE m.country_id = c.id)
      AS avg_goals
FROM country AS c
GROUP BY country;

Nested Subqueries


-- How much did each countries average differ from the overall average?

--Simple Subquery
SELECT
   c.name AS country,
   AVG(m.home_goal + m.away_goal) AS avg_goals,
   AVG(m.home_goal + m.away_goal) -
      (SELECT AVG(home_goal + away_goal)
       FROM match) AS avg_diff
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
GROUP BY country;
-- How does each month's total goals differ from the average monthly total of goals scored?

--Nested Subquery
SELECT
   EXTRACT(MONTH FROM date) AS month,
   SUM(m.home_goal + m.away_goal) AS total_goals,
   SUM(m.home_goals + m.away_goals) -
      (SELECT AVG(goals)
       FROM (SELECT
               EXTRACT(MONTH FROM date) AS month,
               SUM(home_goal + away_goal) AS goals
             FROM match
             GROUP BY month)) AS avg_diff
   FROM match AS m
   GROUP BY month;
-- What is each country's average goals scored in the 2011/2012 season?

-- Correlated Nested Subquery
SELECT
   c.name AS country,
   (SELECT AVG(home_goal + away_goal)
    FROM match AS m
    WHERE m.country_id = c.id
          AND id IN (
               SELECT id
               FROM match
               WHERE season = '2011/2012')) AS avg_goals
FROM country AS c
GROUP BY country;

Common Table Expressions

When adding subqueries, complexity increases quickly. CTE’s can help us with this.

WITH cte AS (
   SELECT col1, col2
   FROM table)
SELECT
   AVG(col1) AS avg_col
FROM cte;
-- Subquery in FROM
SELECT
   c.name AS country,
   COUNT(s.id) AS matches
FROM country AS c
INNER JOIN (
   SELECT country_id, id
   FROM match
   WHERE (home_goal + away_goal) >= 10) AS s
ON c.id = s.country_id
GROUP BY country;

-- With CTE

WITH s AS (
   SELECT country_id, id
   FROM match
   WHERE (home_goal + away_goal) >= 10
)

SELECT c.name AS country,
   c.name AS country,
   COUNT(s.id) AS matches
FROM country AS c
INNER JOIN s
ON c.id = s.country_id
GROUP BY country;

Multiple CTEs:

WITH s1 AS (
   SELECT country_id, id
   FROM match
   WHERE (home_goal + away_goal) >= 10),
s2 AS (
   SELECT country_id, id
   FROM match
   WHERE (home_goal + away_goal) <=1
)
SELECT
   c.name AS country,
   COUNT(s1.id) AS high_scores,
   COUNT(s2.id) AS low_scores,
FROM country AS c
INNER JOIN s1
ON c.id = s1.country_id
INNER JOIN s2
ON c.id = s2.country_id
GROUP BY country;

Why use CTEs?

Window Functions

Introduction to Window Functions

-- How many goals were scored in each match in 2011/2012, and how did that compare to the average?

-- Using a subquery
SELECT
   date,
   (home_goal + away_goal) AS goals,
   (SELECT AVG(home_goal + away_goal)
    FROM match
    WHERE season = '2011/2012') AS overall_avg
FROM match
WHERE season = '2011/2012';

-- Using a window function
SELECT
   date,
   (home_goal + away_goal) AS goals,
   AVG(home_goal + away_goal) OVER() as overall_avg
FROM match
WHERE season = '2011/2012';
--What is the rank of matches based on the number of goals scored?

SELECT
   date,
   (home_goal + away_goal) AS goals,
   RANK() OVER(ORDER BY home_goal + away_goal DESC) AS goals_rank
FROM match
WHERE season = '2011/2012';

Key differences:

Window Partitions

-- How many goals were scored in each match, and how did that compare to the season's average?

SELECT
   date,
   (home_goal + away_goal) AS goals,
   AVG(home_goal + away_goal)
      OVER(PARTITION BY season) AS season_avg
FROM match;

Partition by Multiple Columns:

SELECT
   c.name,
   m.season,
   (home_goal + away_goal) AS goals,
   AVG(home_goal + away_goal)
      OVER(PARTITION BY m.season, c.name) AS season_ctry_avg
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id

Partition considerations:

Sliding Windows

ROWS BETWEEN <start> AND <finish>

-- Can specify
PRECEDING
FOLLOWING
UNBOUNDED PRECEDING
UNBOUNDED FOLLOWING
CURRENT ROW
SELECT
   date,
   home_goal,
   away_goal,
   SUM(home_goal)
      OVER(ORDER BY date ROWS BETWEEN
           UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM match
WHERE hometeam_id = 8456 AND season = '2011/2012';

```sql SELECT date, home_goal away_goal, SUM(home_goal) OVER(ORDER BY date ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS last2 FROM match WHERE hometeam_id = 8456 AND season = ‘2011/2012’;