BigQuery ML train/test split code example

Example: Google BigQuery how to create machine learning model using SQL

-- standardSQL to build the model
CREATE OR REPLACE MODEL `bqml_lab.sample_model`
OPTIONS(model_type='logistic_reg') AS
SELECT
  IF(totals.transactions IS NULL, 0, 1) AS label,
  IFNULL(device.operatingSystem, "") AS os,
  device.isMobile AS is_mobile,
  IFNULL(geoNetwork.country, "") AS country,
  IFNULL(totals.pageviews, 0) AS pageviews
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  _TABLE_SUFFIX BETWEEN '20160801' AND '20170631'
LIMIT 100000;

SELECT
  *
FROM
  ml.EVALUATE(MODEL `bqml_lab.sample_model`, (
SELECT
  IF(totals.transactions IS NULL, 0, 1) AS label,
  IFNULL(device.operatingSystem, "") AS os,
  device.isMobile AS is_mobile,
  IFNULL(geoNetwork.country, "") AS country,
  IFNULL(totals.pageviews, 0) AS pageviews
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'));

-- We used logistic regression for this model, hence, this query returns
-- values such as precision, recall, accuracy, f1_score, log_loss, roc_auc.
-- If it was linear regression, we'd have seen values such as mean_absolute_error, 
-- mean_squared_error, mean_squared_log_error, median_absolute_error, 
-- r2_score, explained_variance.
  
-- output in Json/Grid 
[
  {
    "precision": "0.43601895734597157",
    "recall": "0.0856610800744879",
    "accuracy": "0.9851952452667814",
    "f1_score": "0.1431906614785992",
    "log_loss": "0.04735969966406397",
    "roc_auc": "0.982000999000999"
  }
]

Tags:

Sql Example