Я пытаюсь извлечь набор строк из 2-х фреймов данных на основе условия.После этого я пытаюсь сравнить значения каждого столбца строк, извлеченных на основе ключа (RECORD_ID).Когда я пытаюсь сравнить полные кадры данных, я получаю другой результат по сравнению со сравнением извлеченных строк информационного кадра.Исходя из данных, доступных на фреймах данных, оба результата должны быть одинаковыми.
import pandas as pd
from pathlib import Path
import datetime
import numpy as np
#import sys
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
use_case_num = input("Enter the use case number being tested : ")
#pol_num = input("Enter the Policy Number that is being tested : ")
use_case_fl = 'N'
#The Use Case matrix is being specified here
path_use_case = Path('use_case_matrix.xlsx')
df_use_case = pd.read_excel(path_use_case).fillna(0)
use_case_index = df_use_case.columns[0]
#Check if use case entered by user is present in the use_case_matrix
for row in df_use_case.index:
if str(use_case_num) == str(df_use_case.at[row,'CASE_IDENTIFIER']):
use_case_fl = 'Y'
break
else:
use_case_fl = 'N'
if use_case_fl == 'Y':
#Add the names of the spreadsheets with data that you would want to compare
path_ideal = Path('stat_prem_ideal_0502.xlsx')
path_actual = Path('stat_prem_actual_0502.xlsx')
df = pd.read_excel(path_actual)
#The first column should ideally be the key/index for this comparison. If not, change the index to represent the right index
index_col = df.columns[0]
print('\nIndex column: {}\n'.format(index_col))
df_ideal = pd.read_excel(path_ideal, index_col=index_col).fillna(0)
df_actual = pd.read_excel(path_actual, index_col=index_col).fillna(0)
for row in df_ideal.index:
df_ideal_select = df_ideal.loc[df_ideal.CASE_IDENTIFIER == int(use_case_num)]
for row in df_actual.index:
df_actual_select = df_actual.loc[df_actual.CASE_IDENTIFIER == int(use_case_num)]
i=0
match=0
check=0
alert=0
error=0
no_of_rows = len(df_actual)
column_names=df_ideal.columns
var_names_ideal=column_names+'_IDEAL'
var_names_actual=column_names+'_ACTUAL'
tokens=column_names+'_TOKEN'
matched_rows = []
#Looping through the rows and columns of both spreadsheets to compare values
for i in range(0,len(column_names)):
for row in df_ideal_select.index: <---- when i use df_ideal in this entire loop, I get the expected results
var_names_ideal.i=df_ideal_select.at[row,column_names[i]]
for row in df_actual_select.index: <---- when i use df_actual in this entire loop, I get the expected results
var_names_actual.i=df_actual_select.at[row,column_names[i]]
for row in df_actual_select.index:
tokens.i = fuzz.partial_ratio(str(var_names_ideal.i),str(var_names_actual.i))
matched_rows.append([row,column_names[i],var_names_ideal.i,var_names_actual.i,tokens.i])
matched_rows.sort(key=lambda x : x[0])
dfDiff=pd.DataFrame(matched_rows)
if tokens.i == 100:
match = match+1
elif (tokens.i > 70) and (tokens.i < 100):
check = check+1
elif (tokens.i > 50) and (tokens.i <= 70):
alert = alert+1
elif (tokens.i <= 50):
error = error+1
#Writing the output spreadsheet with comparison ratios
fname = 'test-fuzzy.xlsx'
writer = pd.ExcelWriter(fname, engine='xlsxwriter')
dfDiff.to_excel(writer, sheet_name='Fuzz Match Details', index=True)
df_actual_select.to_excel(writer, sheet_name=path_actual.stem, index=True)
df_ideal_select.to_excel(writer,sheet_name=path_ideal.stem,index=True)
workbook = writer.book
#Excel edits for detail sheet
worksheet = writer.sheets['Fuzz Match Details']
worksheet.set_default_row(15)
worksheet.set_column('B:B',5)
worksheet.set_column('C:E',40)
worksheet.set_column('F:F',10)
worksheet.write('B1','INDEX')
worksheet.write('C1','COLUMN NAME')
worksheet.write('D1','IDEAL VALUES')
worksheet.write('E1','ACTUAL VALUES')
worksheet.write('F1','MATCH RATIO')
match_fmt = workbook.add_format({'bg_color':'green'})
check_fmt = workbook.add_format({'bg_color':'blue'})
alert_fmt = workbook.add_format({'bg_color':'#FFA500'})
error_fmt = workbook.add_format({'bg_color':'red'})
#Formatting colors for the spreadsheet depicting problematic fields
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': 'between',
'minimum':71,
'maximum':99,
'format': check_fmt})
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': 'between',
'minimum':51,
'maximum':70,
'format': alert_fmt})
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': '<=',
'value':50,
'format': error_fmt})
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': '==',
'value':100,
'format': match_fmt})
writer.save()
print('Number of matched rows: {}'.format(match))
print('Number of rows to be checked: {}'.format(check))
print('Number of rows on alert: {}'.format(alert))
print('Number of erroneous rows: {}'.format(error))
print('\n Done')
else:
print ("Use case not defined!")