How to extract all the keys in a JSON object with BigQuery
This is what I came up with (Specifically for StandardSQL).. Not sure if accumulating in a list is the best method... Also.. I simplified for my case where I'm just concerned with keys.
CREATE TEMPORARY FUNCTION Foo(infoo STRING)
RETURNS Array<String>
LANGUAGE js AS """
blah = [];
function processKey(node, parent) {
if (parent !== '') {parent += '.'};
Object.keys(node).forEach(function(key) {
value = node[key].toString();
if (value !== '[object Object]') {
blah.push(parent+key)
} else {
processKey(node[key], parent + key);
};
});
};
try {
x = JSON.parse(infoo);
processKey(x,'');
return blah;
} catch (e) { return null }
"""
OPTIONS ();
WITH x as(
select Foo(jsonfield) as bbb from clickstream.clikcs
)
select distinct arr_item from (SELECT arr_item FROM x, UNNEST(bbb) as arr_item)
Here's something that uses Standard SQL:
CREATE TEMP FUNCTION jsonObjectKeys(input STRING)
RETURNS Array<String>
LANGUAGE js AS """
return Object.keys(JSON.parse(input));
""";
WITH keys AS (
SELECT
jsonObjectKeys(myColumn) AS keys
FROM
myProject.myTable
WHERE myColumn IS NOT NULL
)
SELECT
DISTINCT k
FROM keys
CROSS JOIN UNNEST(keys.keys) AS k
ORDER BY k
How to extract all of a JSON object keys using a JavaScript UDF in BigQuery:
SELECT type, key
FROM (
SELECT * FROM
js(
(SELECT json, type FROM [fh-bigquery:openlibrary.ol_dump_20151231]
),
// Input columns.
json, type,
// Output schema.
"[{name: 'key', type:'string'},
{name: 'type', type:'string'}]",
// The function.
"function(r, emit) {
x=JSON.parse(r.json)
Object.keys(x).forEach(function(entry) {
emit({key:entry, type:r.type,});
});
}"
)
)
LIMIT 100
Grouped and counted:
Once you've found all the keys you can use, then you can use JSON_EXTRACT_SCALAR on a normal SQL query:
Now that you know the keys, you can extract all information known for a type:
SELECT JSON_EXTRACT_SCALAR(json, '$.key') key,
JSON_EXTRACT_SCALAR(json, '$.type.key') type,
JSON_EXTRACT(json, '$.revision') revision,
JSON_EXTRACT_SCALAR(json, '$.last_modified.value') last_modified,
JSON_EXTRACT_SCALAR(json, '$.title') title,
JSON_EXTRACT_SCALAR(json, '$.publish_date') publish_date,
JSON_EXTRACT(json, '$.publishers') publishers,
JSON_EXTRACT(json, '$.latest_revision') latest_revision,
JSON_EXTRACT(json, '$.languages') languages,
JSON_EXTRACT(json, '$.authors') authors,
JSON_EXTRACT(json, '$.works') works,
JSON_EXTRACT(json, '$.number_of_pages') number_of_pages,
JSON_EXTRACT(json, '$.publish_places') publish_places,
JSON_EXTRACT(json, '$.publish_country') publish_country,
JSON_EXTRACT(json, '$.subjects') subjects,
JSON_EXTRACT_SCALAR(json, '$.created.value') created,
JSON_EXTRACT_SCALAR(json, '$.pagination') pagination,
JSON_EXTRACT_SCALAR(json, '$.by_statement') by_statement,
JSON_EXTRACT(json, '$.isbn_10') isbn_10,
JSON_EXTRACT_SCALAR(json, '$.isbn_10[0]') isbn_10_0,
JSON_EXTRACT(json, '$.notes') notes,
JSON_EXTRACT(json, '$.lc_classifications') lc_classifications,
JSON_EXTRACT_SCALAR(json, '$.subtitle') subtitle,
JSON_EXTRACT(json, '$.lccn') lccn,
JSON_EXTRACT(json, '$.identifiers') identifiers,
JSON_EXTRACT(json, '$.contributions') contributions,
JSON_EXTRACT(json, '$.isbn_13') isbn_13,
JSON_EXTRACT_SCALAR(json, '$.isbn_13[0]') isbn_13_0,
JSON_EXTRACT(json, '$.physical_format') physical_format,
JSON_EXTRACT(json, '$.oclc_numbers') oclc_numbers,
JSON_EXTRACT(json, '$.series') series,
JSON_EXTRACT(json, '$.source_records') source_records,
JSON_EXTRACT(json, '$.covers') covers,
JSON_EXTRACT(json, '$.dewey_decimal_class') dewey_decimal_class,
JSON_EXTRACT_SCALAR(json, '$.edition_name') edition_name,
# ...
FROM [fh-bigquery:openlibrary.ol_dump_20151231]
WHERE type='/type/edition'
LIMIT 10
(sample data taken from an Open Library data dump https://openlibrary.org/developers/dumps, based on a reddit conversation)
Below version fixes some "issues" in original answer like:
1. only first level of keys was emitted
2. having to manually comppile and than run final query for extracting info based on discovered keys
SELECT type, key, value, COUNT(1) AS weight
FROM JS(
(SELECT json, type
FROM [fh-bigquery:openlibrary.ol_dump_20151231@0]
WHERE type = '/type/edition'
),
json, type, // Input columns
"[{name: 'type', type:'string'}, // Output schema
{name: 'key', type:'string'},
{name: 'value', type:'string'}]",
"function(r, emit) { // The function
x = JSON.parse(r.json);
processKey(x, '');
function processKey(node, parent) {
if (parent !== '') {parent += '.'};
Object.keys(node).map(function(key) {
value = node[key].toString();
if (value !== '[object Object]') {
emit({type:r.type, key:parent + key, value:value});
} else {
processKey(node[key], parent + key);
};
});
};
}"
)
GROUP EACH BY type, key, value
ORDER BY weight DESC
LIMIT 1000
The result is as below
Row type key value weight
1 /type/edition type.key /type/edition 25140209
2 /type/edition last_modified.type /type/datetime 25140209
3 /type/edition created.type /type/datetime 17092292
4 /type/edition languages.0.key /languages/eng 14514830
5 /type/edition notes.type /type/text 11681480
6 /type/edition revision 2 8714084
7 /type/edition latest_revision 2 8704217
8 /type/edition revision 3 5041680
9 /type/edition latest_revision 3 5040634
10 /type/edition created.value 2008-04-01T03:28:50.625462 3579095
11 /type/edition revision 1 3396868
12 /type/edition physical_format Paperback 3181270
13 /type/edition revision 4 3053266
14 /type/edition latest_revision 4 3053197
15 /type/edition revision 5 2076094
16 /type/edition latest_revision 5 2076072
17 /type/edition publish_country nyu 1727347
18 /type/edition created.value 2008-04-30T09:38:13.731961 1681227
19 /type/edition publish_country enk 1627969
20 /type/edition publish_places London 1613755
21 /type/edition physical_format Hardcover 1495864
22 /type/edition publish_places New York 1467779
23 /type/edition revision 6 1437467
24 /type/edition latest_revision 6 1437463
25 /type/edition publish_country xxk 1407624