Конструктор DataFrame неправильно вызывается при выводе в CSV-файл - PullRequest
0 голосов
/ 08 ноября 2018

Привет! Я новичок в машинном обучении и получаю ошибку «DataFrame constructor not properly called» всякий раз, когда пытаюсь записать результаты программы в CSV-файл:

# Vectorizer settings.
# NOTE(review): MAX_FEATURES is not referenced anywhere in the visible script.
MAX_FEATURES = 500
# Passed as `min_df` to the TfidfVectorizer built further down.
MIN_DF = 3
# English stop-word list; concatenated with extra tokens when the vectorizer
# is built. Presumably NLTK's `stopwords` corpus -- the import is not shown.
sw = stopwords.words('english')

# Load the raw event log from 'sample.csv'.
# low_memory=False: see the pandas read_csv docs (affects how column dtypes
# are inferred on large files).
df = pd.read_csv('sample.csv',  low_memory=False)
# Independent copy of the id/timestamp columns, taken before `df` is modified.
# NOTE(review): df1 is not used in the visible part of the script.
df1 = df[['RECIPIENT_ID', 'EVENT_TS']].copy()
#raise SystemExit

# Characters that may appear in an event name that gets collapsed to 'PAINTID':
# uppercase ASCII letters, digits, '-' and ','. Built once instead of on every
# call; a frozenset gives O(1) membership tests per character.
_PAINT_ID_CHARS = frozenset(string.ascii_uppercase + string.digits + '-,')


def _is_paint_id(value):
    """Return True when *value* is a string made only of _PAINT_ID_CHARS.

    Non-string cells (e.g. the float NaN pandas uses for missing values) are
    reported as False so they pass through unchanged instead of raising
    ``TypeError: 'float' object is not iterable``.
    An empty string yields True, as it did with the original ``all([...])``.
    """
    return isinstance(value, str) and all(ch in _PAINT_ID_CHARS for ch in value)


# Replace every event name that consists solely of the characters above with
# the single token 'PAINTID'; all other values are kept as they are.
df['EVENT_NAME'] = np.where(df['EVENT_NAME'].apply(_is_paint_id),
                            'PAINTID', df['EVENT_NAME'])

# Collapse every event name starting with 'paint&spMailing' to plain 'paint'.
df['EVENT_NAME'] = df['EVENT_NAME'].replace(r'^paint&spMailing.*', 'paint', regex=True)

# Reduce REFERRER_NAME to a known keyword.
# The pattern is one big alternation with one capture group per keyword and is
# matched case-insensitively (re.I). Because there are several groups, the
# result has one column per group (it is consumed row-wise with axis=1 below);
# for a given row only the group of the alternative that matched is non-null.
rn_extract = df['REFERRER_NAME'].str.extract(r'(google)|(sample|sale)|(persona.email)|(welcome)|(bing)|(yahoo)|(inspiration)|(interior)|(exterior)|(pathtopurchase)|(newsletter)|(memorial day)|(color of the month)|(fathers day)|(excite)|(designer ?story)|(ask jeeves)|(webcrawler)|(color.?clinic)|(personaexp|personalexp)|(rebate)|(rwb)|(pins to palettes)|(trend)|(20 days of color)|(4th of july)|(canada week)|(behr box 2.0)|(labor day)|(colorfullybehr)|(holiday collection)|(mindful moments)|(earthlink)|(dogpile)|(quick dry)|(swipes intro)|(swipes nationwide)|(event day)|(color discovery tool)|(cnet search)|(myway)|(civic day)|(summer savings)', flags=re.I, expand=False)
# Collapse the per-group columns into a single string per row:
#   1. fill the unmatched groups with a '|' placeholder so every cell is a str,
#   2. concatenate the cells of each row,
#   3. remove the placeholders and surrounding whitespace.
# Rows with no match end up as the empty string.
rn_extract = rn_extract.fillna('|').apply(''.join, axis=1).replace(r'\|', '', regex=True).str.strip()
# `.values` assigns positionally (no index alignment).
df['REFERRER_NAME'] = rn_extract.values

# Columns whose values are concatenated into the per-event 'text' field.
# The names in the trailing comments are candidates that are currently disabled.
str_columns = ['EVENT_NAME', 'SITE_NAME'] # 'EVENT_URL', 'REFERRER_URL', 'EVENT_TYPE_NAME',
#'SESSION_LEAD_SOURCE', 'REFERRER_TYPE', 'REFERRER_NAME','REFERRER_KEYWORDS',

# Columns removed from `df` before the text field is built.
drop_columns = ['DATABASE_ID', 'VISITOR_KEY', 'SESSION_KEY', 'SESSION_START_TS',
                'REFERRER_MAILING_ID', 'EVENT_ID', 'EVENT_TYPE_CODE',
                'EVENT_HYPERLINK_ID', 'PAGE_ID', 'PAGE_PARENT_ID',
                'PAGE_PARENT_NAME', 'SITE_DOMAIN_ID', 'SITE_ID',
                'SITE_TYPE_CODE', 'SITE_URL', 'Email']

# NOTE(review): timestamp_columns is not used in the visible part of the script.
timestamp_columns = ['EVENT_TS']

# Raises KeyError if any of the listed columns is missing from the CSV.
df = df.drop(drop_columns, axis=1)
# Build one '|'-separated text field per event from the string columns.
# astype(str) renders missing values as the literal 'nan'. The regex removes
# those placeholders, but only when 'nan' is a WHOLE field (at the start of
# the string or right after a '|', and followed by '|' or the end). The
# previous pattern r'(nan\|)|(nan$)' had no left boundary and therefore also
# truncated real values that merely end in "nan" (e.g. 'conan|site' -> 'cosite').
df['text'] = (
    df[str_columns]
    .astype(str)
    .apply('|'.join, axis=1)
    .replace(r'(?:^|(?<=\|))nan(?:\||$)', '', regex=True)
)
# The source columns are no longer needed once they are folded into 'text'.
df = df.drop(str_columns, axis=1)
# One row per recipient: join all of a recipient's event texts with '|',
# lowercase and trim them. The result is a one-column frame ('text') indexed
# by RECIPIENT_ID.
df = df.groupby('RECIPIENT_ID')['text'].apply('|'.join).str.lower().str.strip().to_frame()

# Single-step pipeline: TF-IDF over the 'text' column.
#   * token_pattern keeps runs of 2-20 lowercase letters/digits as tokens;
#   * min_df=MIN_DF is the minimum document frequency for a term to be kept;
#   * the stop-word list `sw` is extended with 'behr', 'www', 'com', 'http'.
# NOTE(review): MAX_FEATURES is defined at the top of the script but is never
# passed as max_features -- confirm whether that was intended.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=1.0, min_df=MIN_DF,
                              token_pattern=r'\b[a-z0-9]{2,20}\b',
                              stop_words=sw + ['behr', 'www', 'com', 'http'])),
])

# Fit the vectorizer and transform the text. The result is a SciPy sparse
# matrix, which pd.DataFrame() cannot consume directly -- passing it as-is is
# what raises "DataFrame constructor not properly called".
X = pipeline.fit_transform(df['text'])

# Recover the column labels from the fitted vocabulary (term -> column index).
# Sorting by index works on every scikit-learn version, unlike
# get_feature_names() / get_feature_names_out().
tfidf = pipeline.named_steps['tfidf']
feature_names = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

# Densify and label the matrix: rows keep the index of `df`, columns are the
# vocabulary terms.
# NOTE(review): toarray() materialises the full matrix in memory; fine for a
# small vocabulary, reconsider for very large ones.
tfidf_frame = pd.DataFrame(X.toarray(), index=df.index, columns=feature_names)

# to_csv() returns None when given a path; `File` is kept only so that any
# later code referring to the name does not break.
File = tfidf_frame.to_csv('grouby_Pipeline.csv')
...