In BigQuery, is there a way to replace HTML entity characters with their unicode equivalent?
For example, I have the following rows in a table:
id | text
1 | Hello World 😜
2 | Yes 😜 It works great 😜
And I would like to have:
id | text
1 | Hello World 😜
2 | Yes 😜 It works great 😜
The following general technique works:
😜
is considered a single characterOFFSET
SELECT
id,
ANY_VALUE(text) AS original,
STRING_AGG(
COALESCE(
-- Support hex codepoints
CODE_POINTS_TO_STRING(
[CAST(CONCAT('0x', REGEXP_EXTRACT(char, r'(?:&#x)(\w+)(?:;)')) AS INT64)]
),
-- Support decimal codepoints
CODE_POINTS_TO_STRING(
[CAST(CONCAT('0x', FORMAT('%x', CAST(REGEXP_EXTRACT(char, r'(?:&#)(\d+)(?:;)') AS INT64))) AS INT64)]
),
-- Fall back to the character itself
char
),
'' ORDER BY char_position) AS text
FROM UNNEST([
STRUCT(1 AS id, 'Hello World 😜' AS text),
STRUCT(2 AS id, 'Yes 😜 It works great 😜'),
STRUCT(3 AS id, '—' AS text),
STRUCT(4 AS id, '—' AS text)
])
CROSS JOIN
-- Extract all characters individually except for HTML entity characters
UNNEST(REGEXP_EXTRACT_ALL(text, r'(&#\w+;|.)')) char WITH OFFSET AS char_position
GROUP BY id