I'm not sure how to use Featuretools on a very large dataset. I'm working on the IEEE fraud detection competition on Kaggle: https://www.kaggle.com/c/ieee-fraud-detection.
It's a huge dataset. Here is the code from my Kaggle kernel:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
Train = pd.merge(train_transaction,train_identity,on = 'TransactionID',how = 'left')
Test = pd.merge(test_transaction,test_identity,on = 'TransactionID',how = 'left')
Train.info()
train_identity_columns = list(train_identity.columns)
print(train_identity_columns)
del train_transaction,train_identity,test_transaction,test_identity
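(Side note: I know the memory footprint could be cut right after loading by downcasting the numeric columns; a rough sketch of what I mean is below. reduce_mem is just my own placeholder name, not a library function, and I haven't wired this into the pipeline yet:)
def reduce_mem(df):
    # Downcast every numeric column to the smallest dtype that still holds it
    for col in df.select_dtypes(include = [np.number]).columns:
        if np.issubdtype(df[col].dtype, np.integer):
            df[col] = pd.to_numeric(df[col], downcast = 'integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast = 'float')
    return df
Train = reduce_mem(Train)
Test = reduce_mem(Test)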
# Align column names: the test identity columns use the 'id-01' style while train uses 'id_01'
Test_Col = []
for col in Test.columns:
    Test_Col.append(col.replace('-','_'))
Test.columns = Test_Col
Test['isFraud'] = np.nan
Full_Frame = pd.concat([Train,Test],ignore_index = True)  # DataFrame.append was removed in pandas 2.x
del Train,Test
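(One small thing that reportedly helps after del is forcing a garbage collection, though I am not sure how much it actually frees:)
import gc
gc.collect()  # ask Python to release the frames deleted above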
# Let's first take care of the object columns
for Col in Full_Frame.select_dtypes('object'):
    Full_Frame[Col] = pd.factorize(Full_Frame[Col])[0]
    Full_Frame.loc[Full_Frame[Col] == -1,Col] = np.nan  # factorize encodes missing values as -1
    Full_Frame[Col] = Full_Frame[Col].astype(np.float64)
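(An alternative I considered for the loop above: storing the factorized codes as float64 costs 8 bytes per value, while the pandas 'category' dtype also keeps NaN but stores much smaller integer codes internally. A sketch, assuming the downstream code can handle categoricals:)
for Col in Full_Frame.select_dtypes('object'):
    Full_Frame[Col] = Full_Frame[Col].astype('category')  # keeps NaN, stores compact codes internally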
# Let's see the distribution of the integer columns
Labels = Full_Frame.select_dtypes(np.int64).columns
print(Labels)
Full_Frame.select_dtypes(np.int64).nunique().value_counts().sort_index().plot(kind = 'bar',color = 'red',figsize = (8,8))
plt.xlabel('Unique Values')
plt.ylabel('Number of Columns')
plt.title('Unique Value Distribution')
Full_Frame['isFraud'].value_counts().plot(kind = 'bar',figsize = (5,5),edgecolor = 'black')
# Highly Imbalanced Class
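(While debugging the pipeline, one option I figured is to iterate on a stratified sample of the labelled rows first, so that failures show up quickly and cheaply; a sketch:)
Sample = (Full_Frame[Full_Frame['isFraud'].notna()]
          .groupby('isFraud', group_keys = False)
          .apply(lambda g: g.sample(frac = 0.1, random_state = 42)))  # 10% of each class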
from collections import OrderedDict
def Plot_Distribution(Train,Keyword):
    Card_Columns = Train.columns[Train.columns.str.startswith(Keyword)]
    plt.figure(figsize = (20,16))
    Color = OrderedDict({0 : 'red',1 : 'blue'})
    Labels = OrderedDict({0 : 'Not Fraud', 1 : 'Fraud'})
    for i,Columns in enumerate(Card_Columns):
        # plt.subplot needs integer grid dimensions, so cast the ceil result
        ax = plt.subplot(int(np.ceil(len(Card_Columns)/3)),3,i+1)
        for Key,color in Color.items():
            # the keyword is 'label', not 'Label'; 'bw' was renamed to 'bw_method' in newer seaborn
            sns.kdeplot(Train.loc[Train['isFraud'] == Key,Columns].dropna(),
                        color = color,ax = ax,label = Labels[Key],bw_method = 1.5)
        plt.title(f'{Columns.capitalize()} Distribution')
        plt.xlabel(f'{Columns}')
        plt.ylabel('Density')
# Let's first see the distribution for the card attributes
Plot_Distribution(Full_Frame,'card')
Plot_Distribution(Full_Frame,'addr')
# Let's see the M attributes
Plot_Distribution(Full_Frame,'M')
# Let's see the distribution of the id attributes
Plot_Distribution(Full_Frame,'id_0')
Plot_Distribution(Full_Frame,'id_1')
Plot_Distribution(Full_Frame,'id_2')
Plot_Distribution(Full_Frame,'id_3')
# Distribution of the dist attributes
Plot_Distribution(Full_Frame,'dist')
Plot_Distribution(Full_Frame,'Transaction')
# Let's check the null values
Null_Values = pd.DataFrame(Full_Frame.isnull().sum()).rename(columns = {0:'Total'})
Null_Values['Percentage'] = Null_Values['Total'] / len(Full_Frame)
print(Null_Values.sort_values('Percentage',ascending = False).head(20))
# Drop attributes that are more than 95% null, since they are unlikely to add any signal
Attributes = ['id_24','id_25','id_07','id_08','id_21','id_26','id_27','id_23','id_22']
Full_Frame.drop(Attributes,axis = 1,inplace = True)
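(The list above is just what I eyeballed from the printout; a threshold-based version that catches every such column would be:)
Threshold = 0.95
To_Drop = Null_Values.index[Null_Values['Percentage'] > Threshold].tolist()
Full_Frame.drop(columns = To_Drop, inplace = True, errors = 'ignore')  # skip already-dropped columns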
# Let's do feature engineering using Featuretools
import featuretools as ft
es = ft.EntitySet(id = 'IEEE')
es = es.entity_from_dataframe(entity_id = 'FullFrame',dataframe = Full_Frame,
                              index = 'Transaction_id',make_index = True)
es
# Remove the dropped columns (and the merge key) from the identity column list
for V in Attributes:
    train_identity_columns.remove(V)
train_identity_columns.remove('TransactionID')
es = es.normalize_entity(base_entity_id = 'FullFrame',new_entity_id = 'identity',
                         index = 'TransactionID',make_time_index = 'TransactionDT',
                         additional_variables = train_identity_columns,
                         copy_variables = ['TransactionDT'])
es
es['identity'].df.head(10)
es['FullFrame'].variables
feature_matrix,feature_dfs = ft.dfs(entityset = es,target_entity = 'FullFrame')
After this line, memory goes BOOM!! and everything grinds to a halt. What should I do? Please tell me:
- Best practices for working with large datasets.
- Where should I make changes here, and why?
Thanks in advance.
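P.S. The direction I was planning to try next, from reading the Featuretools docs, is to restrict the primitives, build the feature definitions without computing them, and then calculate the matrix in chunks. An untested sketch (the primitive lists, max_depth and chunk_size values are just guesses):
feature_defs = ft.dfs(entityset = es,target_entity = 'FullFrame',
                      agg_primitives = ['mean','sum','count'],
                      trans_primitives = [],
                      max_depth = 1,
                      features_only = True)  # build definitions only, no computation yet
feature_matrix = ft.calculate_feature_matrix(features = feature_defs,entityset = es,
                                             chunk_size = 10000)  # compute 10k rows at a time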