Tôi đã thay đổi mô hình dữ liệu của bạn một chút để thử và làm cho nó rõ ràng hơn một chút về những gì đang xảy ra ..
CREATE TABLE [dbo].[Customer]
(
[CustomerName] VARCHAR(20) NOT NULL,
[CustomerLink] VARBINARY(20) NULL
)
CREATE TABLE [dbo].[CustomerIdentification]
(
[CustomerName] VARCHAR(20) NOT NULL,
[ID] VARCHAR(50) NOT NULL,
[IDType] VARCHAR(16) NOT NULL
)
Và tôi đã thêm một số dữ liệu thử nghiệm khác ..
INSERT [dbo].[Customer]
([CustomerName])
VALUES ('Fred'),
('Bob'),
('Vince'),
('Tom'),
('Alice'),
('Matt'),
('Dan')
INSERT [dbo].[CustomerIdentification]
VALUES
('Fred', 'A', 'Passport'),
('Fred', 'A', 'SIN'),
('Fred', 'A', 'Drivers Licence'),
('Bob', 'A', 'Passport'),
('Bob', 'B', 'Drivers Licence'),
('Bob', 'C', 'Credit Card'),
('Vince', 'A', 'Passport'),
('Vince', 'B', 'SIN'),
('Vince', 'C', 'Credit Card'),
('Tom', 'A', 'Passport'),
('Tom', 'B', 'SIN'),
('Tom', 'B', 'Drivers Licence'),
('Alice', 'B', 'Drivers Licence'),
('Matt', 'X', 'Drivers Licence'),
('Dan', 'X', 'Drivers Licence')
Đây có phải là những gì bạn đang tìm kiếm:
;WITH [cteNonMatchingIDs] AS (
-- Pairs where the IDType is the same, but
-- name and ID don't match
SELECT ci3.[CustomerName] AS [CustomerName1],
ci4.[CustomerName] AS [CustomerName2]
FROM [dbo].[CustomerIdentification] ci3
INNER JOIN [dbo].[CustomerIdentification] ci4
ON ci3.[IDType] = ci4.[IDType]
WHERE ci3.[CustomerName] <> ci4.[CustomerName]
AND ci3.[ID] <> ci4.[ID]
),
[cteMatchedPairs] AS (
-- Pairs where the IDType and ID match, and
-- there aren't any non matching IDs for the
-- CustomerName
SELECT DISTINCT
ci1.[CustomerName] AS [CustomerName1],
ci2.[CustomerName] AS [CustomerName2]
FROM [dbo].[CustomerIdentification] ci1
LEFT JOIN [dbo].[CustomerIdentification] ci2
ON ci1.[CustomerName] <> ci2.[CustomerName]
AND ci1.[IDType] = ci2.[IDType]
WHERE ci1.[ID] = ISNULL(ci2.[ID], ci1.[ID])
AND NOT EXISTS (
SELECT 1
FROM [cteNonMatchingIDs]
WHERE ci1.[CustomerName] = [CustomerName1] -- correlated subquery
AND ci2.[CustomerName] = [CustomerName2]
)
AND ci1.[CustomerName] < ci2.[CustomerName]
),
[cteMatchedList] ([CustomerName], [CustomerNameList]) AS (
-- Turn the matched pairs into list of matching
-- CustomerNames
SELECT [CustomerName1],
[CustomerNameList]
FROM (
SELECT [CustomerName1],
CONVERT(VARCHAR(1000), '$'
+ [CustomerName1] + '$'
+ [CustomerName2]) AS [CustomerNameList]
FROM [cteMatchedPairs]
UNION ALL
SELECT [CustomerName2],
CONVERT(VARCHAR(1000), '$'
+ [CustomerName2]) AS [CustomerNameList]
FROM [cteMatchedPairs]
) [cteMatchedPairs]
UNION ALL
SELECT [cteMatchedList].[CustomerName],
CONVERT(VARCHAR(1000),[CustomerNameList] + '$'
+ [cteMatchedPairs].[CustomerName2])
FROM [cteMatchedList] -- recursive CTE
INNER JOIN [cteMatchedPairs]
ON RIGHT([cteMatchedList].[CustomerNameList],
LEN([cteMatchedPairs].[CustomerName1])
) = [cteMatchedPairs].[CustomerName1]
),
[cteSubstringLists] AS (
SELECT r1.[CustomerName],
r2.[CustomerNameList]
FROM [cteMatchedList] r1
INNER JOIN [cteMatchedList] r2
ON r2.[CustomerNameList] LIKE '%' + r1.[CustomerNameList] + '%'
),
[cteCustomerLink] AS (
SELECT DISTINCT
x1.[CustomerName],
HASHBYTES('SHA1', x2.[CustomerNameList]) AS [CustomerLink]
FROM (
SELECT [CustomerName],
MAX(LEN([CustomerNameList])) AS [MAX LEN CustomerList]
FROM [cteSubstringLists]
GROUP BY [CustomerName]
) x1
INNER JOIN (
SELECT [CustomerName],
LEN([CustomerNameList]) AS [LEN CustomerList],
[CustomerNameList]
FROM [cteSubstringLists]
) x2
ON x1.[MAX LEN CustomerList] = x2.[LEN CustomerList]
AND x1.[CustomerName] = x2.[CustomerName]
)
UPDATE c
SET [CustomerLink] = cl.[CustomerLink]
FROM [dbo].[Customer] c
INNER JOIN [cteCustomerLink] cl
ON cl.[CustomerName] = c.[CustomerName]
SELECT *
FROM [dbo].[Customer]