Интерпретация первичного ключа из табличных данных - PullRequest
2 голосов
/ 11 апреля 2011

У меня есть устаревшая база данных, импортированная в SQL Server 2008 R2, в которой нет ни индексов, ни первичных / внешних ключей для сотен таблиц (в некоторых сотни столбцов). Первичные ключи, которые я определил вручную (некоторые из них составные), обычно расположены в первых по порядку столбцах, но я не хочу тратить недели на их выявление, если этого можно избежать.

Существует ли какой-либо инструмент или сценарий, который можно использовать для анализа количества данных, чтобы предложить или выписать вероятные первичные ключи?

Пока я нашел следующий скрипт, но он выдает некоторые ошибки. Я отлаживаю его, чтобы понять, что именно идет не так, но мне интересно, сталкивался ли кто-нибудь с подобной задачей и удалось ли заставить что-нибудь работать.

Текст скрипта по ссылке ниже

--Natural Key Finder
--blindman, 6/2005
--This script searches up to 63 columns of any table for permutations of fields that represent unique keys within the dataset.
--The search can be limited by the maximum columns per key, and by the maximum minutes for processing.

--Script parameters.
--@TableName is sized to 128 characters (the maximum length of a SQL Server identifier)
--so that long table names are not silently truncated when the value is assigned.
declare @TableName varchar(128) --Name of the table to analyze.
declare @MaxColumns int --Maximum number of columns allowed in a candidate key.
declare @MaxMinutes int --Time limit, in minutes, for the search.

--Get a suitable test table from the current database
/*
set @TableName = 
        (select top 1 sysobjects.name
        from    sysobjects
            inner join sysindexes
                on sysobjects.id = sysindexes.id
                and indid in (0, 1)
            inner join syscolumns on sysobjects.id = syscolumns.id
        where   sysobjects.type = 'U'
        group by sysobjects.name,
            sysindexes.rowcnt
        having count(*) between 5 and 15
        order by rowcnt * count(*) desc)
*/

set @TableName = 'calendarEvents' --Enter the name of the table to analyze.  Do NOT enclose it in brackets: []!
set @MaxColumns = 63 --Set the maximum number of columns per key you want to search for.
set @MaxMinutes = 5 --Set a maximum time limit on the search process.

set nocount on

--Procedure variables
declare @RecordCount bigint --The number of records in the table, and the maximum possible cardinality.
declare @ColumnMask bigint --A bitmask representing the presence or absence of columns in a column set.
declare @ColumnString varchar(8000) --A comma-delimited string including all the column names in a column set.
declare @KeyLength int --The permutation size in columns currently being searched.
declare @StartTime datetime --Used to track execution time of the script.
declare @SQLString varchar(8000) --Dynamically created sql statement. Declared with the casing used everywhere else in the script so it also resolves under a case-sensitive collation.
declare @ProcessGUID char(32) --This random value will be used to name dynamically created database objects.
declare @SampleTableName varchar(50) --This value will hold the name of a dynamically created subset of the table data.
declare @SubMask1 bigint --Bitmask for storing temporary column mask.
declare @SubMask2 bigint --Bitmask for storing temporary column mask.
declare @SubMask3 bigint --Bitmask for storing temporary column mask.
declare @BitMask bigint --Temporary bitmask for stepping through a column mask to find active columns.
declare @Counter int --Counts the number of permutations analyzed.

set @StartTime = getdate()
--newid() returns a uniqueidentifier, which replace() does not accept as an argument,
--so convert it to its 36-character string form before stripping the dashes.
set @ProcessGUID = replace(cast(newid() as char(36)), '-', '') --Generate a random string.
set @SampleTableName = 'tbl' + @ProcessGUID --Generate a random table name.
set @Counter = 0
--Column masks are stored in a bigint, so at most 63 columns can be represented.
if  @MaxColumns > 63 set @MaxColumns = 63

--Read the record count of the target table from the system catalog (sysindexes.rowcnt of
--the heap or clustered index). object_id() resolves the unqualified name the same way the
--dynamic SQL later in the script does, and the type filter restricts the match to user tables,
--so a same-named object elsewhere cannot make the subquery return more than one row.
set @RecordCount = 
    (select sysindexes.rowcnt
    from    sysobjects
        inner join sysindexes
            on sysobjects.id = sysindexes.id
    where   sysobjects.id = object_id(@TableName)
        and sysobjects.type = 'U'
        and sysindexes.indid in (1, 0))

--Without a record count nothing below can work; stop here with a clear message
--before any temporary objects have been created.
if @RecordCount is null
    begin
        raiserror('Table "%s" was not found in the current database.', 16, 1, @TableName)
        return
    end

--Create a table for listing and tracking permutations of columns.
create table #ColumnSets
    (ColumnSetID int identity,
    ColumnMask bigint, --a bitmask value representing the presence or absence of columns in a column set.
    ColumnCount int, --the number of columns in the set.
    Cardinality bigint,  --the number of unique values in the column set.
    IsUnique int, -- null: not tested yet.  0: not a natural key.  1: natural key.
    ColumnString varchar(4000)) --comma-delimited string listing all the columns in the column set.

--Index names only have to be unique within their table, so a fixed name is sufficient here.
create clustered index IX_ColumnSets_ColumnMask on #ColumnSets (ColumnMask)

--Create the parent records in the ColumnSets table: one single-column set per table column.
--Only colid values below 64 fit into the bigint bitmask.
--quotename() brackets the column name and escapes any "]" it contains.
--object_id() resolves the table name the same way the dynamic SQL in this script does, so columns
--of a same-named table in another schema are not mixed in.
--NOTE(review): columns whose prec is null are skipped; presumably these are types that
--cannot take part in a DISTINCT - confirm.
insert into #ColumnSets
    (ColumnMask,
    ColumnCount,
    Cardinality,
    ColumnString)
select  power(cast(2 as bigint), syscolumns.colid - 1), --Converts the colid value to a bitmap value.
    1,
    @RecordCount + 1, --Cardinality is unknown, so estimate 1 higher than the maximum possible cardinality.
    quotename(syscolumns.name)
from    syscolumns
where   syscolumns.id = object_id(@TableName)
    and syscolumns.prec is not null
    and syscolumns.colid < 64

--Informational message
select  'Analyzing table "' + @TableName + '" containing ' + cast(@RecordCount as varchar(20)) + ' records in ' + cast(count(*) as varchar(3)) + ' columns.'
from    #ColumnSets
where   ColumnCount = 1

--Pre-search preparation.
--Large column sets are tested first against a small sample of the table: any large set that is
--already non-unique in the sample lets every one of its subsets be skipped later on.
select
    convert(varchar(4), power(@MaxMinutes, 0.5) -1) + ' minutes will be alloted for pre-searching.'

--The sample holds sqrt(RecordCount) rows, copied into a uniquely named work table.
declare @SampleSize int
select  @SampleSize = power(@RecordCount, (0.5))

select  @SQLString =
        'select top ' + convert(varchar(10), @SampleSize)
        + ' * into ' + @SampleTableName
        + ' from ' + @TableName
exec    (@SQLString)

--Seed the search with one column set that contains every single column.
insert into #ColumnSets
    (ColumnMask, ColumnCount)
select
    sum(UnarySets.ColumnMask),
    count(*)
from    #ColumnSets UnarySets
where   UnarySets.ColumnCount = 1

--Remember how many single-column sets exist at this point.
select  @KeyLength = count(*)
from    #ColumnSets
where   ColumnCount = 1

--Pre-search loop. Runs as long as at least one multi-column set is still untested
--(IsUnique is null). Each pass tests the largest untested sets against the sample table,
--then derives new candidate sets from the results.
while   exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1))
    begin
        --show status
        select  'Largest non-unique permutation found: ' + cast(isnull(max(ColumnCount), 0) as varchar(3)) + ' columns.'
        from    #ColumnSets
        where   IsUnique = 0

        --If the estimated cardinality is less than the record count, we know the column set cannot
        --possibly be unique, so mark it as non-unique.
        update  #ColumnSets
        set IsUnique = 0
        where   Cardinality < @RecordCount
            and IsUnique is null

        --Test untested multi-column sets, always taking the sets with the highest column count first.
        while exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1))
            begin

                --set   @KeyLength = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)
                --Add the number of sets about to be tested to the running permutation counter.
                set @Counter = @Counter + 
                    (Select Count(*)
                    from    #ColumnSets
                    where   IsUnique is null
                        and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1))

                --Walk over every untested set that has the current maximum column count.
                declare CSCursor cursor for
                    Select  ColumnMask,
                        ColumnString
                    from    #ColumnSets
                    where   IsUnique is null
                        and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)

                Open    CSCursor
                Fetch next from CSCursor into @ColumnMask, @ColumnString
                while @@FETCH_STATUS = 0
                    begin
                        --Create the column string for the bitmask by concatenating the names of all
                        --single-column sets whose bit is present in @ColumnMask.
                        --The concatenation starts from the fetched @ColumnString, so it only starts empty when
                        --that value is NULL; multi-column sets are inserted in this loop without a ColumnString.
                        --NOTE(review): this relies on ORDER BY being honoured during variable concatenation,
                        --which SQL Server may not guarantee - confirm.
                        select  @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString
                        from    #ColumnSets
                        where   ColumnCount = 1
                            and ColumnMask & @ColumnMask = ColumnMask
                        order by ColumnString

                        --Count the distinct value combinations in the sample table, store the result together with
                        --the column string, and provisionally flag the set as non-unique (IsUnique = 0).
                        set @SQLString = 'Update #ColumnSets set ColumnString = ''' + @ColumnString + ''', Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @SampleTableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20))
                        exec (@SQLString)
                        fetch next from CSCursor into @ColumnMask, @ColumnString

                        --Break out of this loop once more than power(@MaxMinutes, 0.5) minutes have passed since @StartTime.
                        if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break
                    end
                Close   CSCursor
                Deallocate CSCursor

                --Any multi-column sets with a cardinality equal to the sample size are unique within the sample.
                --NOTE(review): this assumes the sample table really holds @SampleSize rows - confirm.
                Update  #ColumnSets set IsUnique = 1 where Cardinality = @SampleSize and ColumnCount > 1
                if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break

                --Delete subsets of combinations known not to be unique.
                delete  CandidateSets
                from    #ColumnSets CandidateSets
                    inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
                where   SuperSets.IsUnique = 0
                    and CandidateSets.ColumnCount > 1
                    and CandidateSets.ColumnCount < SuperSets.ColumnCount
            end

        --Leave the pre-search once more than power(@MaxMinutes, 0.5) - 1 minutes have passed since @StartTime.
        if dateadd(mi, power(@MaxMinutes, 0.5) -1, @StartTime) < getdate() break

        --Split unique sets of six or more columns into three subsets.  We will shuffle these subsets together
        --to create new permutations for searching.
        declare CSCursor cursor for
            Select  ColumnMask
            from    #ColumnSets
            where   IsUnique = 1
                and ColumnCount >= 6

        Open    CSCursor
        Fetch next from CSCursor into @ColumnMask
        while @@FETCH_STATUS = 0
            begin
                set @BitMask = 1
                set @SubMask1 = 0
                set @SubMask2 = 0
                set @SubMask3 = 0

                --Note that this permutation has already been split (IsUnique = 2), so it is not picked up again.
                update  #ColumnSets set IsUnique = 2 where ColumnMask = @ColumnMask

                --Split the column mask into three distinct masks: @BitMask steps from one set bit of
                --@ColumnMask to the next, and the bits found are handed to @SubMask1, @SubMask2 and @SubMask3 in turn.
                --Each "@BitMask > @ColumnMask/2" check ends the split once the highest set bit has been assigned.
                while   @BitMask < @ColumnMask
                    begin
                    while   @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
                    set @SubMask1 = @SubMask1 | @BitMask
                    if  @BitMask > @ColumnMask/2 break
                    set @BitMask = @BitMask * 2
                    while   @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
                    set @SubMask2 = @SubMask2 | @BitMask
                    if  @BitMask > @ColumnMask/2 break
                    set @BitMask = @BitMask * 2
                    while   @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
                    set @SubMask3 = @SubMask3 | @BitMask
                    if  @BitMask > @ColumnMask/2 break
                    set @BitMask = @BitMask * 2
                    end     

                --Register each of the three sub-masks as a new untested column set. The column count is the number
                --of single-column sets covered by the mask; the cardinality is set to "unknown" (@RecordCount + 1).
                insert into #ColumnSets
                    (ColumnMask,
                    ColumnCount,
                    Cardinality)
                select  @SubMask1,
                    Count(*),
                    @RecordCount + 1
                from    #ColumnSets
                where   ColumnCount = 1
                    and ColumnMask & @SubMask1 = ColumnMask

                insert into #ColumnSets
                    (ColumnMask,
                    ColumnCount,
                    Cardinality)
                select  @SubMask2,
                    Count(*),
                    @RecordCount + 1
                from    #ColumnSets
                where   ColumnCount = 1
                    and ColumnMask & @SubMask2 = ColumnMask

                insert into #ColumnSets
                    (ColumnMask,
                    ColumnCount,
                    Cardinality)
                select  @SubMask3,
                    Count(*),
                    @RecordCount + 1
                from    #ColumnSets
                where   ColumnCount = 1
                    and ColumnMask & @SubMask3 = ColumnMask

                fetch next from CSCursor into @ColumnMask
            end
        Close   CSCursor
        Deallocate CSCursor

        --Create larger combinations of non-unique permutations for testing: pair every non-unique set with
        --every disjoint set that is non-unique or untested, keeping only combinations with more columns
        --than the largest non-unique set found so far.
        insert into #ColumnSets
            (ColumnMask,
            ColumnCount,
            Cardinality)
        select  distinct
            ColumnSetsA.ColumnMask | ColumnSetsB.ColumnMask,
            ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount,
            @RecordCount + 1
        from    #ColumnSets ColumnSetsA,
            #ColumnSets ColumnSetsB
        where   ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount > isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 0)
            and ColumnSetsA.IsUnique = 0
            and isnull(ColumnSetsB.IsUnique, 0) = 0
            and ColumnSetsA.ColumnMask <> ColumnSetsB.ColumnMask
            and ColumnSetsA.ColumnMask & ColumnSetsB.ColumnMask = 0

        --Remove duplicates: for each column mask only the row with the lowest ColumnSetID is kept.
        delete  #ColumnSets
        from    #ColumnSets
            left outer join --DistinctSets
                (select ColumnMask,
                    min(ColumnSetID) ColumnSetID
                from    #ColumnSets
                group by ColumnMask) DistinctSets
                on #ColumnSets.ColumnMask = DistinctSets.ColumnMask
                and #ColumnSets.ColumnSetID = DistinctSets.ColumnSetID
        where   DistinctSets.ColumnSetID is null

        --Delete untested subsets of combinations known not to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
        where   SuperSets.IsUnique = 0
            and CandidateSets.ColumnCount > 1
            and CandidateSets.ColumnCount < SuperSets.ColumnCount
            and CandidateSets.IsUnique is null

        --Delete untested supersets of combinations already known to be unique (IsUnique 1 or 2).
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask
        where   UniqueSets.IsUnique > 0 
            and CandidateSets.ColumnCount > UniqueSets.ColumnCount
            and CandidateSets.IsUnique is null
    end     

--Clean up after the pre-search.

--Remove multi-column sets that are proper subsets of a larger set known not to be unique;
--the larger set already rules them out.
delete  CandidateSets
from    #ColumnSets CandidateSets
    inner join #ColumnSets SuperSets on CandidateSets.ColumnMask & SuperSets.ColumnMask = CandidateSets.ColumnMask
where   SuperSets.IsUnique = 0
    and CandidateSets.ColumnCount > 1
    and CandidateSets.ColumnCount < SuperSets.ColumnCount

--Uniqueness found in the sample proves nothing about the full table, so drop every multi-column set
--that is not known to be non-unique. Only the single columns and the non-unique sets remain.
delete
from    #ColumnSets
where   (IsUnique <> 0 or IsUnique is null)
    and ColumnCount > 1

--The cardinalities stored so far for multi-column sets were measured on the sample only, so they
--are lower bounds for the full table. The main search multiplies cardinalities to get an upper-bound
--estimate for larger sets; feeding it a lower bound could wrongly rule out real keys.
--Reset them to "unknown" (one more than the maximum possible cardinality).
update  #ColumnSets
set Cardinality = @RecordCount + 1
where   ColumnCount > 1

--The sample table is no longer needed.
set @SQLString = 'drop table ' + @SampleTableName
exec    (@SQLString)


--Now start building permutations of columns and checking them for uniqueness.
--Each pass tests all untested column sets against the full table, reports the keys found for the
--current key length, and then generates the candidate sets for the next key length.
set @KeyLength = 1
While @KeyLength <= @MaxColumns
    begin
        --If there are no more candidates left, then stop looking.
        if not exists (select * from #ColumnSets where IsUnique is null) break

        --If the estimated cardinality is less than the record count, we know the column set cannot possibly be unique.
        update  #ColumnSets
        set IsUnique = 0
        where   Cardinality < @RecordCount
            and IsUnique is null

        --show status
        select  ColumnCount,
            sum(case when IsUnique is null then 1 else 0 end) as Unknown,
            sum(case when IsUnique = 0 then 1 else 0 end) as NotUnique,
            sum(case when IsUnique = 1 then 1 else 0 end) as IsUnique
        from    #ColumnSets
        group by ColumnCount
        order by ColumnCount

        --Add the number of sets about to be tested to the running permutation counter.
        set @Counter = @Counter + 
            (Select Count(*)
            from    #ColumnSets
            where   Cardinality >= @RecordCount
                and IsUnique is null)

        --Get the cardinality of all untested column sets
        declare CSCursor cursor for
            Select  ColumnMask,
                ColumnString
            from    #ColumnSets
            where   Cardinality >= @RecordCount
                and IsUnique is null

        Open    CSCursor
        Fetch next from CSCursor into @ColumnMask, @ColumnString
        while @@FETCH_STATUS = 0
            begin
                --Count the distinct value combinations in the full table and provisionally flag the set as non-unique.
                set @SQLString = 'Update #ColumnSets set Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @TableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20))
                exec (@SQLString)
                fetch next from CSCursor into @ColumnMask, @ColumnString
                --Stop testing once the time limit has been exceeded.
                if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break
            end
        Close   CSCursor
        Deallocate CSCursor

        --Delete any ColumnSets with fewer than two distinct values, as they cannot be part of a natural key
        Delete from #ColumnSets where Cardinality < 2

        --Any columnsets with a cardinality equal to the rowcount represent natural keys.
        Update  #ColumnSets set IsUnique = 1 where Cardinality = @RecordCount
        if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break

        --Report the keys found for the current key length; the status text is used as the column heading.
        set @SQLString = 'select ColumnString as ''' + cast(@KeyLength as varchar(3)) + '-column keys found in ' + cast(datediff(second, @StartTime, getdate()) as varchar(10)) + ' seconds.'' from #ColumnSets where IsUnique = 1 and ColumnCount = ' + cast(@KeyLength as varchar(3)) + ' order by ColumnCount, ColumnString'
        exec (@SQLString)

        --Remove superfluous permutations (ColumnSet has same cardinality as one of its subsets)
        delete  ColumnSets
        from    #ColumnSets ColumnSets
            inner join #ColumnSets SubSets
                on ColumnSets.Cardinality = SubSets.Cardinality
        where   ColumnSets.ColumnCount > SubSets.ColumnCount
            and ColumnSets.ColumnCount = @KeyLength
            and ColumnSets.ColumnMask & SubSets.ColumnMask = SubSets.ColumnMask

        --Identify and insert combinations of non-unique permutations that have insufficient potential cardinality to be unique.
        --The product of the component cardinalities is an upper bound for the combined cardinality, so only a product
        --strictly below the record count proves non-uniqueness. A product equal to the record count must stay out of
        --this list: otherwise the row (stored with Cardinality = @RecordCount) would be flagged as a natural key
        --by the "Cardinality = @RecordCount" update on the next pass without ever having been tested.
        insert into #ColumnSets
            (ColumnMask,
            ColumnCount,
            Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns.
            IsUnique,
            ColumnString)
        select  ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask as ColumnMask, --the masks are disjoint, so + behaves like |
            ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount as ColumnCount,
            min(ColumnSetsA.Cardinality * ColumnSetsB.Cardinality) as Cardinality,
            0 as IsUnique,
            min(ColumnSetsA.ColumnString + ', ' + ColumnSetsB.ColumnString) as ColumnString
        from    #ColumnSets ColumnSetsA
            inner join #ColumnSets ColumnSetsB on ColumnSetsB.ColumnCount < ColumnSetsA.ColumnCount
        where   ColumnSetsA.ColumnCount = @KeyLength
            and ColumnSetsA.Cardinality * ColumnSetsB.Cardinality < @RecordCount
            and ColumnSetsA.ColumnMask & ColumnSetsB.ColumnMask = 0
            and ColumnSetsA.IsUnique = 0
            and ColumnSetsB.IsUnique = 0
        group by ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask,
            ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount

        --Insert new column sets: extend every non-unique set of the current length by one non-unique single column.
        set @KeyLength = @KeyLength + 1

        insert into #ColumnSets
            (ColumnMask,
            ColumnCount,
            Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns.
            ColumnString)
        select  ParentSets.ColumnMask | UnarySets.ColumnMask as ColumnMask,
            ParentSets.ColumnCount + UnarySets.ColumnCount as ColumnCount,
            ParentSets.Cardinality * UnarySets.Cardinality as Cardinality,
            ParentSets.ColumnString + ', ' + UnarySets.ColumnString as ColumnString
        from    #ColumnSets ParentSets
            inner join #ColumnSets UnarySets on ParentSets.IsUnique = UnarySets.IsUnique
        where   ParentSets.IsUnique = 0
            and ParentSets.ColumnCount = @KeyLength - 1
            and UnarySets.ColumnCount = 1
            and ParentSets.ColumnMask & UnarySets.ColumnMask = 0

        --Remove duplicates: for each column mask only rows carrying the lowest ColumnString are kept.
        delete  #ColumnSets
        from    #ColumnSets
            left outer join --DistinctSets
                (select ColumnMask,
                    min(ColumnString) ColumnString
                from    #ColumnSets
                group by ColumnMask) DistinctSets
                on #ColumnSets.ColumnMask = DistinctSets.ColumnMask
                and #ColumnSets.ColumnString = DistinctSets.ColumnString
        where   DistinctSets.ColumnString is null

        --Remove supersets of combinations already known to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask
        where   UniqueSets.IsUnique = 1
            and CandidateSets.IsUnique is null

        --Remove subsets of combinations known not to be unique.
        delete  CandidateSets
        from    #ColumnSets CandidateSets
            inner join #ColumnSets SuperSets on CandidateSets.ColumnMask & SuperSets.ColumnMask = CandidateSets.ColumnMask
        where   SuperSets.IsUnique = 0
            and CandidateSets.IsUnique is null
    end

-- Rewrite the column list of every natural key so that its columns always appear
-- in the same alphabetical order.
declare CSCursor cursor for
    select  KeySets.ColumnMask
    from    #ColumnSets KeySets
    where   KeySets.IsUnique = 1

open CSCursor
while 1 = 1
    begin
        fetch next from CSCursor into @ColumnMask
        if @@FETCH_STATUS <> 0 break

        -- Rebuild the list from the single-column sets whose bit is present in the key's mask.
        set @ColumnString = null
        select  @ColumnString = isnull(@ColumnString + ', ', '') + UnarySets.ColumnString
        from    #ColumnSets UnarySets
        where   UnarySets.ColumnCount = 1
            and UnarySets.ColumnMask & @ColumnMask = UnarySets.ColumnMask
        order by UnarySets.ColumnString

        update  #ColumnSets
        set ColumnString = @ColumnString
        where   ColumnMask = @ColumnMask
    end
close CSCursor
deallocate CSCursor


--Final output: a summary line, the list of natural keys, and the number of permutations tested.
select
    'Found ' + convert(varchar(10), count(*))
    + ' natural keys with ' + convert(varchar(10), @MaxColumns)
    + ' or fewer columns in ' + convert(varchar(10), datediff(minute, @StartTime, getdate()))
    + ' minutes.'
from    #ColumnSets
where   IsUnique = 1

--One row per natural key, shortest keys first.
select
    NaturalKeys = KeySets.ColumnString
from    #ColumnSets KeySets
where   KeySets.IsUnique = 1
order by
    KeySets.ColumnCount,
    KeySets.ColumnString

select  cast(@Counter as varchar(20)) + ' permutations tested.'

--Remove the work table.
drop table #ColumnSets

1 Ответ

3 голосов
/ 11 апреля 2011

Если вы используете Microsoft Suite, возможно использовать SSIS. Службы интеграции имеют задачу профилирования данных.

Этот сайт имеет основы о том, как его использовать: http://consultingblogs.emc.com/jamiethomson/archive/2008/03/04/ssis-data-profiling-task-part-8-candidate-key.aspx

Предупреждение: это может занять очень много времени в зависимости от размера таблицы и количества столбцов, которые необходимо включить в ключ-кандидат. Вывод очень полезен для определения силы ключей. В вашем случае нужно искать элементы с показателем 100%.

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...