Scalable Solution to get latest row for each ID in BigQuery

Short and scalable version:

select array_agg(t order by collection_time desc limit 1)[offset(0)].*
from mytable t
group by t.id;

If you don't care about writing a piece of code for every column:

SELECT ID, 
  ARRAY_AGG(col1 ORDER BY collection_time DESC)[OFFSET(0)] AS col1,
  ARRAY_AGG(col2 ORDER BY collection_time DESC)[OFFSET(0)] AS col2
FROM myTable
GROUP BY ID

SELECT
  agg.table.*
FROM (
  SELECT
    id,
    ARRAY_AGG(STRUCT(table)
    ORDER BY
      collection_time DESC)[SAFE_OFFSET(0)] agg
  FROM
    `dataset.table` table
  GROUP BY
    id)

This will do the job for you and is scalable considering the fact that the schema keeps changing, you won't have to change this

Quick and dirty option - combine your both queries into one - first get all records with latest collection_time (using your second query) and then dedup them using your first query:

SELECT * FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY tab1.ID) AS rn 
  FROM (
    SELECT tab1.* 
    FROM mytable AS tab1
    INNER JOIN (
      SELECT ID, MAX(collection_time) AS second_time 
      FROM mytable GROUP BY ID
    ) AS tab2
    ON tab1.ID=tab2.ID AND tab1.collection_time=tab2.second_time
  )
)
WHERE rn = 1

And with Standard SQL (proposed by S.Mohsen sh)

WITH myTable AS (
  SELECT 1 AS ID, 1 AS collection_time
),
tab1 AS (
  SELECT ID,
  MAX(collection_time) AS second_time 
  FROM myTable GROUP BY ID
),
tab2 AS (
  SELECT * FROM myTable
),
joint AS (
  SELECT tab2.* 
  FROM tab2 INNER JOIN tab1
  ON tab2.ID=tab1.ID AND tab2.collection_time=tab1.second_time 
)
SELECT * EXCEPT(rn) 
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY ID) AS rn 
  FROM joint
)
WHERE rn=1

Scalable Solution to get latest row for each ID in BigQuery

Tags:

Sql

Google Bigquery

Related

Recent Posts