Ниже я использовал sys.dm_fts_parser, чтобы разбить предложения на слова. Если вы не пользуетесь SQL Server 2008 или считаете, что по какой-то причине это не подходит, существует множество готовых функций разбиения строк на T-SQL, которыми можно воспользоваться вместо неё.
Для требования, чтобы каждый A.id мог быть сопоставлен только с таким B.id, который ранее не использовался (и наоборот), я не смог придумать эффективного решения на основе множеств, поэтому пары ниже подбираются в цикле.
-- Scores every (A sentence, B sentence) pair by the number of words the two
-- sentences share and stores the pairs, ranked best-first, in
-- #IntermediateResults (columns: id, A_Id, B_Id, B_Cnt, Cnt, PctMatchBToA,
-- PctMatchAToB). Pairs with no word in common are not stored.

-- Remove a copy of the work table left behind by an earlier, aborted run in
-- this session; otherwise SELECT ... INTO below would fail.
IF OBJECT_ID('tempdb..#IntermediateResults') IS NOT NULL
    DROP TABLE #IntermediateResults;

WITH A (Id, sentence) AS
(
    -- Sample data: first set of sentences.
    SELECT 1, 'What other text in here' UNION ALL
    SELECT 2, 'What am I doing here' UNION ALL
    SELECT 3, 'I need to find another job' UNION ALL
    SELECT 4, 'Other text in here'
),
B (Id, sentence) AS
(
    -- Sample data: second set of sentences.
    SELECT 5, 'Other text in here' UNION ALL
    SELECT 6, 'I am doing what here' UNION ALL
    SELECT 7, 'Purple unicorns' UNION ALL
    SELECT 8, 'What are you doing in here'
),
A_Split AS
(
    -- One row per term of each A sentence, plus that sentence's total row count.
    -- The sentence is wrapped in double quotes (embedded quotes doubled) so the
    -- parser treats it as one phrase. Positional arguments per the
    -- sys.dm_fts_parser documentation: LCID 1033, stoplist id 0,
    -- accent sensitivity 0.
    SELECT src.Id AS A_Id,
           terms.display_term,
           COUNT(*) OVER (PARTITION BY src.Id) AS A_Cnt
    FROM A AS src
    CROSS APPLY sys.dm_fts_parser('"' + REPLACE(src.sentence, '"', '""') + '"', 1033, 0, 0) AS terms
),
B_Split AS
(
    -- Same split for the B sentences.
    SELECT src.Id AS B_Id,
           terms.display_term,
           COUNT(*) OVER (PARTITION BY src.Id) AS B_Cnt
    FROM B AS src
    CROSS APPLY sys.dm_fts_parser('"' + REPLACE(src.sentence, '"', '""') + '"', 1033, 0, 0) AS terms
),
Joined AS
(
    -- One row per (A, B) pair sharing at least one term.
    -- Cnt is the number of matching term rows; the two ratios express it as a
    -- fraction of each sentence's term count.
    -- NOTE(review): a term repeated inside a sentence joins more than once and
    -- would inflate Cnt - confirm whether input can contain repeated words.
    SELECT a_terms.A_Id,
           b_terms.B_Id,
           b_terms.B_Cnt,
           COUNT(*) AS Cnt,
           CAST(COUNT(*) AS FLOAT) / b_terms.B_Cnt AS PctMatchBToA,
           CAST(COUNT(*) AS FLOAT) / a_terms.A_Cnt AS PctMatchAToB
    FROM A_Split AS a_terms
    INNER JOIN B_Split AS b_terms
        ON a_terms.display_term = b_terms.display_term
    GROUP BY a_terms.A_Id,
             b_terms.B_Id,
             b_terms.B_Cnt,
             a_terms.A_Cnt
)
-- id is the rank of the pair (1 = best match). ROW_NUMBER() is used instead of
-- IDENTITY() with ORDER BY because SELECT ... INTO does not guarantee that
-- identity values are assigned in ORDER BY sequence. A_Id and B_Id are added
-- as tie-breakers so that equal scores are always ranked the same way.
SELECT CAST(ROW_NUMBER() OVER (ORDER BY PctMatchBToA DESC,
                                        PctMatchAToB DESC,
                                        A_Id,
                                        B_Id) AS INT) AS id,
       A_Id,
       B_Id,
       B_Cnt,
       Cnt,
       PctMatchBToA,
       PctMatchAToB
INTO #IntermediateResults
FROM Joined;
-- Greedy one-to-one pairing: repeatedly take the best remaining candidate
-- pair (lowest id in #IntermediateResults), record it, and discard every other
-- candidate that reuses either of its two sentences. Finally return the
-- accepted pairs ordered by A_Id.
DECLARE @A_Id INT,
        @B_Id INT,
        @Cnt  INT;

-- Accepted pairs and the number of matching terms for each.
DECLARE @Results TABLE (
    A_Id INT,
    B_Id INT,
    Cnt  INT);

-- Loop on an explicit emptiness test rather than @@ROWCOUNT, so the loop does
-- not depend on which statement happened to execute last.
WHILE EXISTS (SELECT * FROM #IntermediateResults)
BEGIN
    -- Best remaining candidate.
    SELECT TOP (1)
           @A_Id = A_Id,
           @B_Id = B_Id,
           @Cnt  = Cnt
    FROM #IntermediateResults
    ORDER BY id;

    INSERT INTO @Results (A_Id, B_Id, Cnt)
    VALUES (@A_Id, @B_Id, @Cnt);

    -- Each sentence may be used only once: drop all candidates (including the
    -- one just accepted) that involve either side of the accepted pair.
    DELETE FROM #IntermediateResults
    WHERE A_Id = @A_Id
       OR B_Id = @B_Id;
END;

DROP TABLE #IntermediateResults;

SELECT A_Id,
       B_Id,
       Cnt
FROM @Results
ORDER BY A_Id;
Возвращает
A_Id B_Id Cnt
----------- ----------- -----------
1 8 3
2 6 5
4 5 4