Решение получилось не самым изящным, но вот подход, который я придумала. Обратите внимание: перед сравнением имена лучше привести к единому регистру (например, преобразовать DAVID в David). Надеюсь, кому-нибудь это пригодится или кто-то предложит решение получше. Спасибо.
-- Groups the rows of HAVE whose names are similar and numbers the groups.
--
-- Output: one row per matched HAVE.ID with
--   ID       - the record's id
--   NAME     - the record's own name as stored in HAVE
--   MTCH_IND - dense rank of the group's leader name; records sharing a
--              value belong to the same group
--
-- Two names count as similar when JARO_WINKLER_SIMILARITY scores them above
-- 75 (the function scores on a 0-100 scale per Oracle's UTL_MATCH docs).
-- Names are compared exactly as stored; no case normalisation happens here.
--
-- Grouping rule: a "leader" is a record that resembles no record with a
-- lower id. Every record joins the lowest-id leader it directly resembles.
-- A record that resembles no leader directly (chain A~B, B~C, A!~C) falls
-- back to the lowest id it does resemble, so it is still reported once.
-- The previous version silently dropped such records, and returned a record
-- twice when it resembled two unrelated leaders.
--
-- NOTE(review): assumes HAVE.ID is unique and non-null - confirm against the
-- table definition. Groups are not a full transitive closure of similarity.
WITH similar_pairs AS (
    -- Every ordered pair of similar records. A record with a non-null name
    -- pairs with itself, so each such id appears at least once.
    SELECT
        h1.id AS id,
        h2.id AS match_id
    FROM have h1
    INNER JOIN have h2
        ON SYS.UTL_MATCH.JARO_WINKLER_SIMILARITY(h1.name, h2.name) > 75
),
lowest_match AS (
    -- Lowest id each record resembles; equals the record's own id exactly
    -- when the record is a leader.
    SELECT
        id,
        MIN(match_id) AS lowest_match_id
    FROM similar_pairs
    GROUP BY id
),
group_assignment AS (
    -- Exactly one group per id: the lowest directly matching leader, or the
    -- lowest direct match when no leader matches.
    SELECT
        p.id,
        COALESCE(
            MIN(CASE WHEN l.lowest_match_id = l.id THEN p.match_id END),
            MIN(p.match_id)
        ) AS group_id
    FROM similar_pairs p
    INNER JOIN lowest_match l
        ON l.id = p.match_id
    GROUP BY p.id
)
SELECT
    a.id,
    member.name,
    -- Groups are numbered in ascending order of the leader's name.
    DENSE_RANK() OVER (ORDER BY leader.name ASC) AS mtch_ind
FROM group_assignment a
INNER JOIN have leader
    ON leader.id = a.group_id
INNER JOIN have member
    ON member.id = a.id
;
Источник: https://www.decisivedata.net/blog/cleaning-messy-data-sql-part-1-fuzzy-matching-names