Ниже приведен пример для BigQuery Standard SQL
#standardSQL
WITH `project.dataset.spreadsheets` AS (
SELECT 1 AS id, 'Quality Control.xlsx' AS spreadsheet UNION ALL
SELECT 2, 'Product Structure.xlsx' UNION ALL
SELECT 3, 'Invoices Sent April.xslx' UNION ALL
SELECT 4, 'Invoices Sent March.xlsx' UNION ALL
SELECT 5, 'Total Costs April.xlsx' UNION ALL
SELECT 6, 'Total Costs March.xlsx' UNION ALL
SELECT 7, 'Process of Quality Control.xlsx'
)
SELECT
id, s1 spreadsheet, IF(words = 0, NULL, s2) matching_spreadsheet, words
FROM (
SELECT
id, s1,
ARRAY_AGG(STRUCT(s2, words) ORDER BY words DESC LIMIT 1)[OFFSET(0)].*
FROM (
SELECT t1.id, t1.spreadsheet s1, t2.spreadsheet s2,
( SELECT COUNTIF(word != 'xlsx')
FROM UNNEST(REGEXP_EXTRACT_ALL(t1.spreadsheet, r'\w+')) word
JOIN UNNEST(REGEXP_EXTRACT_ALL(t2.spreadsheet, r'\w+')) word
USING(word)) words
FROM `project.dataset.spreadsheets` t1
CROSS JOIN `project.dataset.spreadsheets` t2
WHERE t1.spreadsheet != t2.spreadsheet
)
GROUP BY id, s1
)
-- ORDER BY id
с результатом как
Row id spreadsheet matching_spreadsheet words
1 1 Quality Control.xlsx Process of Quality Control.xlsx 2
2 2 Product Structure.xlsx null 0
3 3 Invoices Sent April.xslx Invoices Sent March.xlsx 2
4 4 Invoices Sent March.xlsx Invoices Sent April.xslx 2
5 5 Total Costs April.xlsx Total Costs March.xlsx 2
6 6 Total Costs March.xlsx Total Costs April.xlsx 2
7 7 Process of Quality Control.xlsx Quality Control.xlsx 2