Проблема с нечетким соответствием - PullRequest
0 голосов
/ 14 мая 2019

Я пытаюсь извлечь набор строк из 2-х фреймов данных на основе условия.После этого я пытаюсь сравнить значения каждого столбца строк, извлеченных на основе ключа (RECORD_ID).Когда я пытаюсь сравнить полные кадры данных, я получаю другой результат по сравнению со сравнением извлеченных строк информационного кадра.Исходя из данных, доступных на фреймах данных, оба результата должны быть одинаковыми.

import pandas as pd
from pathlib import Path
import datetime
import numpy as np
#import sys
from fuzzywuzzy import fuzz
from fuzzywuzzy import process    

use_case_num = input("Enter the use case number being tested : ")
#pol_num = input("Enter the Policy Number that is being tested : ")
use_case_fl = 'N'

#The Use Case matrix is being specified here
path_use_case = Path('use_case_matrix.xlsx')
df_use_case = pd.read_excel(path_use_case).fillna(0)
use_case_index = df_use_case.columns[0]

#Check if use case entered by user is present in the use_case_matrix
for row in df_use_case.index:
    if str(use_case_num) == str(df_use_case.at[row,'CASE_IDENTIFIER']):
        use_case_fl = 'Y'
        break
    else:
        use_case_fl = 'N'

if use_case_fl == 'Y':
    #Add the names of the spreadsheets with data that you would want to compare
    path_ideal = Path('stat_prem_ideal_0502.xlsx')
    path_actual = Path('stat_prem_actual_0502.xlsx')

    df = pd.read_excel(path_actual)

    #The first column should ideally be the key/index for this comparison. If not, change the index to represent the right index
    index_col = df.columns[0]
    print('\nIndex column: {}\n'.format(index_col))

    df_ideal = pd.read_excel(path_ideal, index_col=index_col).fillna(0)
    df_actual = pd.read_excel(path_actual, index_col=index_col).fillna(0)

    for row in df_ideal.index:
        df_ideal_select = df_ideal.loc[df_ideal.CASE_IDENTIFIER == int(use_case_num)]

    for row in df_actual.index:
        df_actual_select = df_actual.loc[df_actual.CASE_IDENTIFIER == int(use_case_num)]


    i=0
    match=0
    check=0
    alert=0
    error=0
    no_of_rows = len(df_actual)
    column_names=df_ideal.columns
    var_names_ideal=column_names+'_IDEAL'
    var_names_actual=column_names+'_ACTUAL'
    tokens=column_names+'_TOKEN'
    matched_rows = []

    #Looping through the rows and columns of both spreadsheets to compare values
    for i in range(0,len(column_names)):
        for row in df_ideal_select.index: <---- when i use df_ideal in this entire loop, I get the expected results
            var_names_ideal.i=df_ideal_select.at[row,column_names[i]]
        for row in df_actual_select.index: <---- when i use df_actual in this entire loop, I get the expected results
            var_names_actual.i=df_actual_select.at[row,column_names[i]]
        for row in df_actual_select.index:
            tokens.i = fuzz.partial_ratio(str(var_names_ideal.i),str(var_names_actual.i))
            matched_rows.append([row,column_names[i],var_names_ideal.i,var_names_actual.i,tokens.i])

            matched_rows.sort(key=lambda x : x[0])
            dfDiff=pd.DataFrame(matched_rows)

            if tokens.i == 100:
                match = match+1
            elif (tokens.i > 70) and (tokens.i < 100):
                check = check+1
            elif (tokens.i > 50) and (tokens.i <= 70):
                alert = alert+1
            elif (tokens.i <= 50):
                error = error+1

    #Writing the output spreadsheet with comparison ratios
    fname = 'test-fuzzy.xlsx'
    writer = pd.ExcelWriter(fname, engine='xlsxwriter')

    dfDiff.to_excel(writer, sheet_name='Fuzz Match Details', index=True)
    df_actual_select.to_excel(writer, sheet_name=path_actual.stem, index=True)
    df_ideal_select.to_excel(writer,sheet_name=path_ideal.stem,index=True)
    workbook  = writer.book

    #Excel edits for detail sheet
    worksheet = writer.sheets['Fuzz Match Details']
    worksheet.set_default_row(15)
    worksheet.set_column('B:B',5)
    worksheet.set_column('C:E',40)
    worksheet.set_column('F:F',10)
    worksheet.write('B1','INDEX')
    worksheet.write('C1','COLUMN NAME')
    worksheet.write('D1','IDEAL VALUES')
    worksheet.write('E1','ACTUAL VALUES')
    worksheet.write('F1','MATCH RATIO')

    match_fmt = workbook.add_format({'bg_color':'green'})
    check_fmt = workbook.add_format({'bg_color':'blue'})
    alert_fmt = workbook.add_format({'bg_color':'#FFA500'})
    error_fmt = workbook.add_format({'bg_color':'red'})

    #Formatting colors for the spreadsheet depicting problematic fields
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': 'between',
                                            'minimum':71,
                                            'maximum':99,
                                            'format': check_fmt})
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': 'between',
                                            'minimum':51,
                                            'maximum':70,
                                            'format': alert_fmt})
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': '<=', 
                                            'value':50,
                                            'format': error_fmt})
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': '==',
                                            'value':100,
                                            'format': match_fmt})


    writer.save()
    print('Number of matched rows: {}'.format(match))
    print('Number of rows to be checked: {}'.format(check))
    print('Number of rows on alert: {}'.format(alert))
    print('Number of erroneous rows: {}'.format(error))
    print('\n Done')
else:
    print ("Use case not defined!")
...