Databricks koalas does not recognize a column name in a function
0 votes
/ 3 March 2020

I am having a problem with a user-defined function (UDF) applied to a Koalas DataFrame in Python. The column in question is recognized by the other UDFs in the script, but one particular UDF, term_frq_fun, appears to fail with:

raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: 'Cannot resolve column name 
"`term_frq_fun(customer_comment)`" among 
(__index_level_0__, customer_comment, 
advisor_comment, vmi_comment, customer_comment_nopuncs, 
advisor_comment_nopuncs, vmi_comment_nopuncs, customer_comment_no_ws, 
advisor_comment_no_ws, vmi_comment_no_ws, 
customer_comment_lc, advisor_comment_lc, 
vmi_comment_lc, customer_comment_ns, advisor_comment_ns, 
vmi_comment_ns, customer_comment_sing, advisor_comment_sing, 
vmi_comment_sing, customer_comment_cleaned, advisor_comment_cleaned, 
vmi_comment_cleaned, customer_comment_lem, advisor_comment_lem, vmi_comment_lem, 
customer_comment_tokens, advisor_comment_tokens, vmi_comment_tokens, 
customer_comment_sntmnt, advisor_comment_sntmnt, vmi_comment_sntmnt);

I have tested the code, and I get no error from helper_funcs_test.text_preprocess_func or helper_funcs_test.ngram_text_metrics; the failure only occurs when I call helper_funcs_test.tf_text_metrics.

Here is a small subset of the data I am using:

data = {'customer_comment': {0: 'PDI,PERFORM', 1: 'PERFORM TIRE INSPECTION FOR TREAD WEAR, CHECK AND INFLATE TIRES AS PER MANUFACTURERS SPECIFICATIONS', 2: 'INSTALL FRONT & REAR LICENSE PLATE FRAMES/INSERTS & TAMPER PROOF SECURED SCREWS. AS PER DEALER REQUEST'}, 'advisor_comment': {0: 'PDI,PERFORM', 1: "PERFORM TIRE INSPECTION FOR TREAD WEAR, CHECK AND INFLATE TIRES AS PER MANUFACTURER'S SPECIFICATIONS", 2: "PERFORM TIRE INSPECTION FOR TREAD WEAR, CHECK AND INFLATE TIRES AS PER MANUFACTURER'S SPECIFICATIONS"}, 'vmi_comment': {0: 'nan', 1: 'nan', 2: 'nan'}}

ds = pd.DataFrame.from_dict(data)  #the outer keys are the column names, so the default orient='columns' is what we want here
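
For reference, a quick check in plain pandas (no koalas involved) confirms the frame comes out with the three expected comment columns:

print(ds.columns.tolist())
#['customer_comment', 'advisor_comment', 'vmi_comment']
print(ds.shape)
#(3, 3)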

This is the code being used, along with the associated libraries:

from nltk.corpus import stopwords
import string
from textblob import TextBlob
from textblob import Word
import pandas as pd
import numpy as np
import swifter 
from nltk import word_tokenize
from nltk import bigrams,trigrams
from databricks.koalas import pandas_wraps
import databricks.koalas as ks

#make sure all comment fields are string
ds['customer_comment'] = ds['customer_comment'].astype(str)
ds['advisor_comment'] = ds['advisor_comment'].astype(str)
ds['vmi_comment'] = ds['vmi_comment'].astype(str)

#now perform basic pre-processing steps for each of the comment columns
#as well as the more advanced pre-processing including
#lemmatization, bigram, trigram formation, and tf calculation

#import helper functions
import helper_funcs_test

k_data = ks.from_pandas(ds)

processed_data = helper_funcs_test.text_preprocess_func(k_data)
processed_data.columns

#bigram_df,trigram_df = helper_funcs_test.additional_text_metrics(processed_data_sub)
bigram_df,trigram_df = helper_funcs_test.ngram_text_metrics(processed_data)

tf_cust, tf_advisor, tf_vmi = helper_funcs_test.tf_text_metrics(processed_data)
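
For what it's worth, tf_text_metrics starts from the raw customer_comment column, so the failure can presumably be reproduced in isolation with a single call:

tf_cust = helper_funcs_test.term_frq_fun(k_data['customer_comment'])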

This is the code contained in the imported helper_funcs_test file:

#create list of the English stopwords
stop = stopwords.words('english')

#extend the list 'stop' to include additional words
extras = ["customer", "vehicle", "please", "states", "mercedes", "benz", "would", "advise", "goes", "a", "on", "client", "guest", "cust", "c", "mb", "reports", "r", "f", "shows", "check","mbusa"]
stop.extend(extras)

#set list of numbers from 0 to 1000, as strings
numbers = [*range(1001)]
numbers = [str(x) for x in numbers]

#get punctuation marks from 'string' module
puncs = list(string.punctuation)

#download all data in nltk including the corpora; this is needed for the lemmatization process
#technically you could download a single corpus, but I want all the data 'nltk'
#has to offer
#THIS CAN TAKE A WHILE IF YOU HAVEN'T ALREADY DONE SO

#nltk.download('all')

#write a function to replace a set of multiple substrings with a new string in a main string
def replaceMultiple(mainString, toBeReplaced, newString):
    #iterate over the substrings to be replaced
    for elem in toBeReplaced:
        #check if the substring occurs in the main string
        if elem in mainString:
            #replace it
            mainString = mainString.replace(elem, newString)

    return mainString
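
#for example, replaceMultiple("pdi,perform!", [",", "!"], " ")
#returns "pdi perform " (each matching substring becomes a space)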

#write a series of text preprocessing functions making use of koalas decorator function '@pandas_wraps'
@pandas_wraps(return_col = str)
def punc_fun(col1):
    return col1.apply(lambda x: replaceMultiple(x, puncs, " "))

@pandas_wraps(return_col = str)
def ws_fun(col1):
    return col1.str.strip()

@pandas_wraps(return_col = str)
def lc_fun(col1):
    return col1.apply(lambda x: " ".join(x.lower() for x in x.split()))

@pandas_wraps(return_col = str)
def sw_fun(col1):
    return col1.apply(lambda x: " ".join(x for x in x.split() if x not in stop))

@pandas_wraps(return_col = str)
def dws_fun(col1):
    return col1.str.replace("  "," ")

@pandas_wraps(return_col = str)
def txt_remove_fun(col1):
    return col1.str.replace("pre delivery inspection","pdi")

@pandas_wraps(return_col = str)
def lem_fun(col1):
    return col1.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

@pandas_wraps(return_col = str)
def tok_fun(col1):
    return col1.apply(lambda x: word_tokenize(x))

@pandas_wraps(return_col = float)
def sent_fun(col1):
    return col1.apply(lambda x: TextBlob(x).sentiment[0])
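
#note: judging from the AnalysisException above, koalas seems to name each
#wrapped result "<function>(<input column>)", e.g. "punc_fun(customer_comment)",
#before it is assigned back into the frame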

def text_preprocess_func(df):

    #replace the punctuation characters
    df['customer_comment_nopuncs'] = punc_fun(df['customer_comment'])
    df['advisor_comment_nopuncs'] = punc_fun(df['advisor_comment'])
    df['vmi_comment_nopuncs'] = punc_fun(df['vmi_comment'])

    #strip leading/trailing white space
    df['customer_comment_no_ws'] = ws_fun(df['customer_comment_nopuncs'])
    df['advisor_comment_no_ws'] = ws_fun(df['advisor_comment_nopuncs'])
    df['vmi_comment_no_ws'] = ws_fun(df['vmi_comment_nopuncs'])

    #convert all text to lowercase so that the same word in different cases
    #isn't recognized as two different words
    df['customer_comment_lc'] = lc_fun(df['customer_comment_no_ws'])
    df['advisor_comment_lc'] = lc_fun(df['advisor_comment_no_ws'])
    df['vmi_comment_lc'] = lc_fun(df['vmi_comment_no_ws'])

    #remove all stopwords
    df['customer_comment_ns'] = sw_fun(df['customer_comment_lc'])
    df['advisor_comment_ns'] = sw_fun(df['advisor_comment_lc'])
    df['vmi_comment_ns'] = sw_fun(df['vmi_comment_lc'])

    #change double white spaces into single white spaces
    df['customer_comment_sing'] = dws_fun(df['customer_comment_ns'])
    df['advisor_comment_sing'] = dws_fun(df['advisor_comment_ns'])
    df['vmi_comment_sing'] = dws_fun(df['vmi_comment_ns'])

    #change the text 'pre delivery inspection' to 'pdi'
    df['customer_comment_cleaned'] = txt_remove_fun(df['customer_comment_sing'])
    df['advisor_comment_cleaned'] = txt_remove_fun(df['advisor_comment_sing'])
    df['vmi_comment_cleaned'] = txt_remove_fun(df['vmi_comment_sing'])

    #perform lemmatization on the text to get the root words
    df['customer_comment_lem'] = lem_fun(df['customer_comment_cleaned'])
    df['advisor_comment_lem'] = lem_fun(df['advisor_comment_cleaned'])
    df['vmi_comment_lem'] = lem_fun(df['vmi_comment_cleaned'])

    #tokenize the various comment fields
    df['customer_comment_tokens'] = tok_fun(df['customer_comment_cleaned'])
    df['advisor_comment_tokens'] = tok_fun(df['advisor_comment_cleaned'])
    df['vmi_comment_tokens'] = tok_fun(df['vmi_comment_cleaned'])

    #get basic sentiment of comments; values closer to +1 indicate positive sentiment
    #values closer to -1 indicate negative sentiment
    df['customer_comment_sntmnt'] = sent_fun(df['customer_comment_cleaned'])
    df['advisor_comment_sntmnt'] = sent_fun(df['advisor_comment_cleaned'])
    df['vmi_comment_sntmnt'] = sent_fun(df['vmi_comment_cleaned'])

    df.drop(columns = ['customer_comment_sing', 'vmi_comment_ns', 'vmi_comment_lc', 'vmi_comment_no_ws', 'vmi_comment_nopuncs'])
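    #(note: the drop above is not assigned back to df, which matches the dropped
    #columns still showing up in the AnalysisException column list)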

    return df

@pandas_wraps(return_col = str)
def bigram_fun(col1):
    return col1.apply(lambda x: list(bigrams(x)))

@pandas_wraps(return_col = str)
def trigram_fun(col1):
    return col1.apply(lambda x: list(trigrams(x)))

@pandas_wraps(return_col = int)
def term_frq_fun(col1):
    #return col1.apply(lambda x: x.split(" ")).value_counts().sum()
    return col1.apply(lambda x: x.split()).value_counts().sum(axis = 0).reset_index()

@pandas_wraps(return_col = str)
def char_replace_fun(col1):
    return col1.str.replace("*",'').str.replace("+",'').str.replace("?",'').str.replace("\\","")
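
#note: pandas Series.str.replace treats the pattern as a regular expression by
#default, so bare "*", "+" and "?" may need escaping (or regex=False) to be
#taken literally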

def ngram_text_metrics(df):

    #create bigrams and trigrams
    cust_bigrams = bigram_fun(df['customer_comment_tokens'])
    advisor_bigrams = bigram_fun(df['advisor_comment_tokens'])
    vmi_bigrams = bigram_fun(df['vmi_comment_tokens'])

    cust_trigrams = trigram_fun(df['customer_comment_tokens'])
    advisor_trigrams = trigram_fun(df['advisor_comment_tokens'])
    vmi_trigrams = trigram_fun(df['vmi_comment_tokens'])

    #create bigrams df
    bigram_df = ks.concat([cust_bigrams,advisor_bigrams,vmi_bigrams])
    bigram_df.columns = ['customer', 'advisor', 'vmi'] 

    #create trigrams df
    trigram_df = ks.concat([cust_trigrams,advisor_trigrams,vmi_trigrams])
    trigram_df.columns = ['customer', 'advisor', 'vmi'] 

    return bigram_df, trigram_df

def tf_text_metrics(df):

    #create term frequency dataframes for each of the comment fields
    tf_cust = term_frq_fun(df['customer_comment'])
    tf_cust.columns = ['words','tf']
    tf_cust['words'] = char_replace_fun(tf_cust['words'])

    tf_advisor = term_frq_fun(df['advisor_comment'])
    tf_advisor.columns = ['words','tf']
    tf_advisor['words'] = char_replace_fun(tf_advisor['words'])

    tf_vmi = term_frq_fun(df['vmi_comment'])
    tf_vmi.columns = ['words','tf']
    tf_vmi['words'] = char_replace_fun(tf_vmi['words'])

    return tf_cust, tf_advisor, tf_vmi
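
For context, this is the kind of term-frequency table tf_text_metrics is meant to produce. As a plain-pandas illustration of the intended output (run on the sample ds above; this is just a sketch, not the distributed version):

#one word per row, then count occurrences (Series.explode needs pandas >= 0.25)
tf = ds['customer_comment'].str.split().explode().value_counts().reset_index()
tf.columns = ['words', 'tf']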

Does anyone see why the customer_comment field is recognized in some of the functions but not in term_frq_fun? Any help on why this error might occur would be greatly appreciated.

Thanks, Curtis
