I have a problem with a user-defined function (UDF) applied to a Koalas DataFrame in Python. The column in question is recognized by the other UDFs in the script, but one particular UDF, term_frq_fun, appears to fail with:
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: 'Cannot resolve column name
"`term_frq_fun(customer_comment)`" among
(__index_level_0__, customer_comment,
advisor_comment, vmi_comment, customer_comment_nopuncs,
advisor_comment_nopuncs, vmi_comment_nopuncs, customer_comment_no_ws,
advisor_comment_no_ws, vmi_comment_no_ws,
customer_comment_lc, advisor_comment_lc,
vmi_comment_lc, customer_comment_ns, advisor_comment_ns,
vmi_comment_ns, customer_comment_sing, advisor_comment_sing,
vmi_comment_sing, customer_comment_cleaned, advisor_comment_cleaned,
vmi_comment_cleaned, customer_comment_lem, advisor_comment_lem, vmi_comment_lem,
customer_comment_tokens, advisor_comment_tokens, vmi_comment_tokens,
customer_comment_sntmnt, advisor_comment_sntmnt, vmi_comment_sntmnt);
I have tested the code and get no error from helper_funcs_test.text_preprocess_func or helper_funcs_test.ngram_text_metrics; it only happens when I call helper_funcs_test.tf_text_metrics.
Here is a small subset of the data I am working with:
data = {'customer_comment': {0: 'PDI,PERFORM', 1: 'PERFORM TIRE INSPECTION FOR TREAD WEAR, CHECK AND INFLATE TIRES AS PER MANUFACTURERS SPECIFICATIONS', 2: 'INSTALL FRONT & REAR LICENSE PLATE FRAMES/INSERTS & TAMPER PROOF SECURED SCREWS. AS PER DEALER REQUEST'}, 'advisor_comment': {0: 'PDI,PERFORM', 1: "PERFORM TIRE INSPECTION FOR TREAD WEAR, CHECK AND INFLATE TIRES AS PER MANUFACTURER'S SPECIFICATIONS", 2: "PERFORM TIRE INSPECTION FOR TREAD WEAR, CHECK AND INFLATE TIRES AS PER MANUFACTURER'S SPECIFICATIONS"}, 'vmi_comment': {0: 'nan', 1: 'nan', 2: 'nan'}}
ds = pd.DataFrame.from_dict(data)
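With the default orient='columns', the outer keys of data become the column names, which is what the rest of the script expects:
ds.columns
#Index(['customer_comment', 'advisor_comment', 'vmi_comment'], dtype='object')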
This is the code being used, along with the associated libraries:
from nltk.corpus import stopwords
import string
from textblob import TextBlob
from textblob import Word
import pandas as pd
import numpy as np
import swifter
from nltk import word_tokenize
from nltk import bigrams,trigrams
from databricks.koalas import pandas_wraps
import databricks.koalas as ks
#make sure all comment fields are string
ds['customer_comment'] = ds['customer_comment'].astype(str)
ds['advisor_comment'] = ds['advisor_comment'].astype(str)
ds['vmi_comment'] = ds['vmi_comment'].astype(str)
#now perform basic pre-processing steps for each of the comment columns
#as well as the more advanced pre-processing including
#lemmatization, bigram, trigram formation, and tf calculation
#import helper functions
import helper_funcs_test
k_data = ks.from_pandas(ds)
processed_data = helper_funcs_test.text_preprocess_func(k_data)
processed_data.columns
#bigram_df,trigram_df = helper_funcs_test.additional_text_metrics(processed_data_sub)
bigram_df,trigram_df = helper_funcs_test.ngram_text_metrics(processed_data)
tf_cust, tf_advisor, tf_vmi = helper_funcs_test.tf_text_metrics(processed_data)
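The call that ultimately raises the error reduces to the single decorated UDF inside tf_text_metrics, so it can be isolated like this (a sketch; tf_cust_direct is just an illustrative name):
#calling the decorated UDF directly on the Koalas column
tf_cust_direct = helper_funcs_test.term_frq_fun(k_data['customer_comment'])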
And this is the code contained in the imported helper_funcs_test file:
#create list of the English stopwords
stop = stopwords.words('english')
#extend the list 'stop' to include additional words
extras = ["customer", "vehicle", "please", "states", "mercedes", "benz", "would", "advise", "goes", "a", "on", "client", "guest", "cust", "c", "mb", "reports", "r", "f", "shows", "check","mbusa"]
stop.extend(extras)
#set list of digits
numbers = [*range(1001)]
numbers = [str(x) for x in numbers]
#get punctuation marks from 'string' module
puncs = list(string.punctuation)
#download all data in nltk including the corpora; this is needed for the lemmatization process
#technically you could download a single corpus, but I want all the data 'nltk'
#has to offer
#THIS CAN TAKE A WHILE IF YOU HAVEN'T ALREADY DONE SO
#nltk.download('all')
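#a lighter-weight alternative (assuming stopwords, punkt, and wordnet are the
#only nltk resources this script touches) would be to download just those:
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')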
#Write a function to replace a set of substrings with a new string in the main string
def replaceMultiple(mainString, toBeReplaced, newString):
    # Iterate over the strings to be replaced
    for elem in toBeReplaced:
        # Check if the string is in the main string
        if elem in mainString:
            # Replace the string
            mainString = mainString.replace(elem, newString)
    return mainString
#write a series of text preprocessing functions making use of the koalas decorator '@pandas_wraps'
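#(as I understand it, '@pandas_wraps' lets a function written against a pandas
#Series run on a Koalas column: the wrapped function receives pandas Series
#chunks and is expected to hand back a Series of the same length)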
@pandas_wraps(return_col = str)
def punc_fun(col1):
    return col1.apply(lambda x: replaceMultiple(x, puncs, " "))
@pandas_wraps(return_col = str)
def ws_fun(col1):
    return col1.str.strip()
@pandas_wraps(return_col = str)
def lc_fun(col1):
    return col1.apply(lambda x: " ".join(x.lower() for x in x.split()))
@pandas_wraps(return_col = str)
def sw_fun(col1):
    return col1.apply(lambda x: " ".join(x for x in x.split() if x not in stop))
@pandas_wraps(return_col = str)
def dws_fun(col1):
    return col1.str.replace("  "," ")
@pandas_wraps(return_col = str)
def txt_remove_fun(col1):
    return col1.str.replace("pre delivery inspection","pdi")
@pandas_wraps(return_col = str)
def lem_fun(col1):
    return col1.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
@pandas_wraps(return_col = str)
def tok_fun(col1):
    return col1.apply(lambda x: word_tokenize(x))
@pandas_wraps(return_col = float)
def sent_fun(col1):
    return col1.apply(lambda x: TextBlob(x).sentiment[0])
def text_preprocess_func(df):
    #replace the punctuation characters
    df['customer_comment_nopuncs'] = punc_fun(df['customer_comment'])
    df['advisor_comment_nopuncs'] = punc_fun(df['advisor_comment'])
    df['vmi_comment_nopuncs'] = punc_fun(df['vmi_comment'])
    #strip leading/trailing white space
    df['customer_comment_no_ws'] = ws_fun(df['customer_comment_nopuncs'])
    df['advisor_comment_no_ws'] = ws_fun(df['advisor_comment_nopuncs'])
    df['vmi_comment_no_ws'] = ws_fun(df['vmi_comment_nopuncs'])
    #convert all text to lowercase so that the same word in different cases
    #isn't recognized as different words
    df['customer_comment_lc'] = lc_fun(df['customer_comment_no_ws'])
    df['advisor_comment_lc'] = lc_fun(df['advisor_comment_no_ws'])
    df['vmi_comment_lc'] = lc_fun(df['vmi_comment_no_ws'])
    #remove all stopwords
    df['customer_comment_ns'] = sw_fun(df['customer_comment_lc'])
    df['advisor_comment_ns'] = sw_fun(df['advisor_comment_lc'])
    df['vmi_comment_ns'] = sw_fun(df['vmi_comment_lc'])
    #change double white spaces into single white spaces
    df['customer_comment_sing'] = dws_fun(df['customer_comment_ns'])
    df['advisor_comment_sing'] = dws_fun(df['advisor_comment_ns'])
    df['vmi_comment_sing'] = dws_fun(df['vmi_comment_ns'])
    #change the text 'pre delivery inspection' to 'pdi'
    df['customer_comment_cleaned'] = txt_remove_fun(df['customer_comment_sing'])
    df['advisor_comment_cleaned'] = txt_remove_fun(df['advisor_comment_sing'])
    df['vmi_comment_cleaned'] = txt_remove_fun(df['vmi_comment_sing'])
    #perform lemmatization on the text to get the root words
    df['customer_comment_lem'] = lem_fun(df['customer_comment_cleaned'])
    df['advisor_comment_lem'] = lem_fun(df['advisor_comment_cleaned'])
    df['vmi_comment_lem'] = lem_fun(df['vmi_comment_cleaned'])
    #tokenize the various comment fields
    df['customer_comment_tokens'] = tok_fun(df['customer_comment_cleaned'])
    df['advisor_comment_tokens'] = tok_fun(df['advisor_comment_cleaned'])
    df['vmi_comment_tokens'] = tok_fun(df['vmi_comment_cleaned'])
    #get basic sentiment of comments; values closer to +1 indicate positive sentiment,
    #values closer to -1 indicate negative sentiment
    df['customer_comment_sntmnt'] = sent_fun(df['customer_comment_cleaned'])
    df['advisor_comment_sntmnt'] = sent_fun(df['advisor_comment_cleaned'])
    df['vmi_comment_sntmnt'] = sent_fun(df['vmi_comment_cleaned'])
    df.drop(columns = ['customer_comment_sing', 'vmi_comment_ns', 'vmi_comment_lc', 'vmi_comment_no_ws', 'vmi_comment_nopuncs'])
    return df
@pandas_wraps(return_col = str)
def bigram_fun(col1):
    return col1.apply(lambda x: list(bigrams(x)))
@pandas_wraps(return_col = str)
def trigram_fun(col1):
    return col1.apply(lambda x: list(trigrams(x)))
@pandas_wraps(return_col = int)
def term_frq_fun(col1):
    #return col1.apply(lambda x: x.split(" ")).value_counts().sum()
    return col1.apply(lambda x: x.split()).value_counts().sum(axis = 0).reset_index()
@pandas_wraps(return_col = str)
def char_replace_fun(col1):
    return col1.str.replace("*",'').str.replace("+",'').str.replace("?",'').str.replace("\\","")
def ngram_text_metrics(df):
    #create bigrams and trigrams
    cust_bigrams = bigram_fun(df['customer_comment_tokens'])
    advisor_bigrams = bigram_fun(df['advisor_comment_tokens'])
    vmi_bigrams = bigram_fun(df['vmi_comment_tokens'])
    cust_trigrams = trigram_fun(df['customer_comment_tokens'])
    advisor_trigrams = trigram_fun(df['advisor_comment_tokens'])
    vmi_trigrams = trigram_fun(df['vmi_comment_tokens'])
    #create bigrams df
    bigram_df = ks.concat([cust_bigrams,advisor_bigrams,vmi_bigrams])
    bigram_df.columns = ['customer', 'advisor', 'vmi']
    #create trigrams df
    trigram_df = ks.concat([cust_trigrams,advisor_trigrams,vmi_trigrams])
    trigram_df.columns = ['customer', 'advisor', 'vmi']
    return bigram_df, trigram_df
def tf_text_metrics(df):
    #create term frequency dataframes for each of the comment fields
    tf_cust = term_frq_fun(df['customer_comment'])
    tf_cust.columns = ['words','tf']
    tf_cust['words'] = char_replace_fun(tf_cust['words'])
    tf_advisor = term_frq_fun(df['advisor_comment'])
    tf_advisor.columns = ['words','tf']
    tf_advisor['words'] = char_replace_fun(tf_advisor['words'])
    tf_vmi = term_frq_fun(df['vmi_comment'])
    tf_vmi.columns = ['words','tf']
    tf_vmi['words'] = char_replace_fun(tf_vmi['words'])
    return tf_cust, tf_advisor, tf_vmi
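For comparison, this is the shape of output I am after from term_frq_fun, expressed in plain pandas against the sample ds above (a sketch; tf_pandas is just an illustrative name):
#split each comment into tokens, flatten, and count occurrences of each word
tokens = ds['customer_comment'].str.split().explode()
tf_pandas = tokens.value_counts().reset_index()
tf_pandas.columns = ['words', 'tf']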
Do you see why the customer_comment field is recognized in some functions but not in the term_frq_fun function? Any help on why this error might be occurring would be greatly appreciated.
Thanks, Curtis