How do I apply Featuretools to a very large dataset?
0 votes / 26 May 2020

I am not sure how to use Featuretools on a very large dataset. I am working on the Kaggle IEEE fraud detection competition: https://www.kaggle.com/c/ieee-fraud-detection.

It is a huge dataset. Here is the code from my Kaggle kernel:

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
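# A possible memory saver while prototyping (a sketch, not part of the final run):
# pd.read_csv accepts nrows to load only a subset of the file, e.g.
#   pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv', nrows = 100000)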

Train = pd.merge(train_transaction,train_identity,on = 'TransactionID',how = 'left')
Test = pd.merge(test_transaction,test_identity,on = 'TransactionID',how = 'left')

Train.info()

train_identity_columns = list(train_identity.columns)
print(train_identity_columns)

del train_transaction,train_identity,test_transaction,test_identity

# The test identity columns use dashes (id-01) while train uses underscores (id_01),
# so normalise the names before combining the frames
Test.columns = [col.replace('-','_') for col in Test.columns]

Test['isFraud'] = np.nan

Full_Frame = pd.concat([Train,Test],ignore_index = True)  # DataFrame.append is deprecated; concat does the same here

del Train,Test

# Let's first label-encode the object (string) columns
for Col in Full_Frame.select_dtypes('object'):
    Full_Frame[Col] = pd.factorize(Full_Frame[Col])[0]
    # factorize marks missing values as -1; restore them as NaN
    Full_Frame.loc[Full_Frame[Col] == -1,Col] = np.nan
    Full_Frame[Col] = Full_Frame[Col].astype(np.float64)
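
# Optional memory saver (a sketch): most of these columns are unlikely to need
# 64-bit precision, so downcasting every float column to float32 roughly halves
# the frame's memory footprint
for Col in Full_Frame.select_dtypes(np.float64):
    Full_Frame[Col] = Full_Frame[Col].astype(np.float32)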

# Let's look at the distribution of the integer columns
Labels = Full_Frame.select_dtypes(np.int64).columns
print(Labels)
Full_Frame.select_dtypes(np.int64).nunique().value_counts().sort_index().plot(kind = 'bar',color = 'red',figsize = (8,8))
plt.xlabel('Unique Values')
plt.ylabel('Number of Columns')
plt.title('Unique Value Distribution')


Full_Frame['isFraud'].value_counts().plot(kind = 'bar',figsize = (5,5),edgecolor = 'black') 
# Highly Imbalanced Class


from collections import OrderedDict

def Plot_Distribution(Train,Keyword):
    Card_Columns = Train.columns[Train.columns.str.startswith(Keyword)]
    plt.figure(figsize = (20,16))
    Color = OrderedDict({0 : 'red',1 : 'blue'})
    Labels = OrderedDict({0 : 'Not Fraud', 1 : 'Fraud'})
    for i,Columns in enumerate(Card_Columns):
        ax = plt.subplot(int(np.ceil(len(Card_Columns)/3)),3,i+1)  # subplot expects integer positions
        for Key,color in Color.items():
            sns.kdeplot(Train.loc[Train['isFraud'] == Key,Columns].dropna(),color = color,ax = ax,label = Labels[Key],bw = 1.5)
        plt.title(f'{Columns.capitalize()} Distribution')
        plt.xlabel(f'{Columns}')
        plt.ylabel('Density')


# Let's first see the distribution of the card columns
Plot_Distribution(Full_Frame,'card')


Plot_Distribution(Full_Frame,'addr')


# Let's look at the M attributes
Plot_Distribution(Full_Frame,'M')



# Let's see the distribution of the id attributes
Plot_Distribution(Full_Frame,'id_0')
Plot_Distribution(Full_Frame,'id_1')
Plot_Distribution(Full_Frame,'id_2')
Plot_Distribution(Full_Frame,'id_3')



# Distribution of dist attribute
Plot_Distribution(Full_Frame,'dist')

Plot_Distribution(Full_Frame,'Transaction')


# Let's check the null values
Null_Values = pd.DataFrame(Full_Frame.isnull().sum()).rename(columns = {0:'Total'})
Null_Values['Percentage'] = Null_Values['Total'] / len(Full_Frame)
print(Null_Values.sort_values('Percentage',ascending = False).head(20))


# Let's drop attributes that are more than 95% null, since they add little information
Attributes = ['id_24','id_25','id_07','id_08','id_21','id_26','id_27','id_23','id_22']
Full_Frame.drop(Attributes,axis = 1,inplace = True)
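
# Quick check of how much memory the combined frame actually uses before DFS
Full_Frame.info(memory_usage = 'deep')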




# Let's do feature engineering with Featuretools
import featuretools as ft
es = ft.EntitySet(id = 'IEEE')
# TransactionID is needed later as the index of the identity entity, so create a surrogate index here
es = es.entity_from_dataframe(entity_id = 'FullFrame', dataframe = Full_Frame,
                              index = 'Transaction_id', make_index = True)

es


# Remove the dropped high-null attributes (and the merge key itself) from the identity column list
for V in Attributes:
    train_identity_columns.remove(V)

train_identity_columns.remove('TransactionID')

es = es.normalize_entity(base_entity_id = 'FullFrame',new_entity_id = 'identity',index = 'TransactionID',make_time_index = 'TransactionDT',
                        additional_variables = train_identity_columns,copy_variables = ['TransactionDT'])


es


es['identity'].df.head(10)


es['FullFrame'].variables


feature_matrix,feature_defs = ft.dfs(entityset = es,target_entity = 'FullFrame')

After this line, memory goes BOOM!! and everything stops. What should I do? Please tell me:

  1. Best practices for working with a dataset this large (one possible direction is sketched below).
  2. Where should I make changes in this code, and why?
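
For reference, a minimal sketch of a less memory-hungry DFS call, assuming the documented featuretools options features_only, agg_primitives/trans_primitives, max_depth and chunk_size (the primitive list and chunk size here are illustrative, not tested on the full data):

# Build feature definitions only, with a restricted primitive set and no stacked features
features = ft.dfs(entityset = es,target_entity = 'FullFrame',
                  agg_primitives = ['mean','max','min','sum'],
                  trans_primitives = [],
                  max_depth = 1,
                  features_only = True)

# Then compute the matrix in chunks (~5% of rows at a time) instead of all at once
feature_matrix = ft.calculate_feature_matrix(features,entityset = es,chunk_size = 0.05)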

Thanks in advance.
