Вот таблица данных:
Я следую этому уроку по ML и соответствующим образом адаптировал код под свои нужды. Цель состоит в том, чтобы использовать несколько предикторов для прогнозирования метки, которая является мультиклассовой. Я также создал фиктивные (dummy) переменные на основе столбца 'label', как в учебнике.
# Load the final dataset. low_memory=False makes pandas parse the file in one
# go instead of in chunks, which avoids mixed-type column inference.
df = pd.read_csv(
    directory_data + final_data_file,
    encoding='utf-8',
    low_memory=False,
)
# Text cleaning
def clean_text(text: str) -> str:
    """Return a normalised copy of *text*.

    The text is lower-cased, every occurrence of "what's" is expanded to
    "what is " and leading/trailing space characters are removed.  Only the
    plain space character is stripped; tabs and newlines at the edges are
    kept.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Expand the contraction; the replacement's trailing space is kept as-is.
    text = re.sub(r"what's", "what is ", text)
    text = text.strip(' ')
    return text
# Run clean_text over every free-text column, then cast the minimum age
# column to integers.
for text_column in ('study_title', 'study_desc', 'condition'):
    df[text_column] = df[text_column].map(clean_text)
df['min_age'] = df['min_age'].astype(int)
# Split data into train and test sets.
# The split is made per study id rather than per row: the unique study ids
# are shuffled with a fixed seed, the leading share (percent_test) becomes the
# test ids, the remainder the train ids, and each row is then routed to a set
# by its study id.
# NOTE(review): presumably one study can span several rows, which is why a
# plain row-level split is avoided -- confirm against the data.
rand_seed = 888
percent_test = 0.50
unique_study_id_list = cf.unique(df, 'study_id')
random.seed(rand_seed)
random.shuffle(unique_study_id_list)
# Index of the first study id that belongs to the training set.
n_test_studies = int(len(unique_study_id_list) * percent_test)
test_study_id = unique_study_id_list[:n_test_studies]
train_study_id = unique_study_id_list[n_test_studies:]
study_ids = df['study_id']
train = df[study_ids.isin(train_study_id)]
test = df[study_ids.isin(test_study_id)]
# Specify what to train with: only the study description column is used as
# the predictor for both sets.
X_train, X_test = train['study_desc'], test['study_desc']
# ML pipelines: each one chains a TF-IDF text feature extractor with a
# classifier wrapped in OneVsRestClassifier.
def _text_pipeline(classifier):
    """Return a Pipeline of a TF-IDF vectorizer followed by *classifier*."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('clf', classifier),
    ])


NB_pipeline = _text_pipeline(
    OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)))
LogReg_pipeline = _text_pipeline(
    OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1))
SVC_pipeline = _text_pipeline(
    OneVsRestClassifier(LinearSVC(), n_jobs=1))
#######################
#######################
#######################
# Testing outputs
# One column per distinct label value is expected in train/test (the dummy
# variables); each is used below as a separate binary target.
categories = cf.unique(df, 'label')
# 'test' prints the label distribution, 'test2' fits and scores the Naive
# Bayes pipeline per category, any other value does nothing.
output_switch = 'test2'
if output_switch == 'test':
    print(df['label'].value_counts())
elif output_switch == 'test2':
    for category in categories:
        print('... Processing {}'.format(category))
        # Fit the pipeline on the training text against this category's column.
        NB_pipeline.fit(X_train, train[category])
        # Predict on the held-out text and report ROC AUC for this category.
        # NOTE(review): roc_auc_score is given hard class predictions here,
        # not probabilities or decision scores -- confirm this is intended.
        prediction = NB_pipeline.predict(X_test)
        print('Test auc-score is {}'.format(roc_auc_score(test[category], prediction)))
Однако я не понял, как изменить код, чтобы включить несколько предикторов. В настоящее время я использую только 'study_desc', но как я могу также включить 'study_title' и 'min_age' в качестве предикторов?
Я пробовал следующее, но получил ошибки:
# Failing attempt, kept for reference: train['a', 'b', 'c'] passes one tuple
# key, which is looked up as a single column label -- hence the KeyError shown
# below.  Selecting several columns needs a list: train[['a', 'b', 'c']].
X_train = train['study_desc', 'study_title', 'min_age']
X_test = test['study_desc', 'study_title', 'min_age']
KeyError: ('study_desc', 'study_title', 'min_age')
И
# Failing attempt, kept for reference: the column selection itself is valid,
# but the single-text pipelines start with TfidfVectorizer.
# NOTE(review): the vectorizer iterates over its input, and iterating a
# DataFrame yields its column labels, which presumably explains the sample
# count of 3 in the ValueError shown below -- confirm.
X_train = train[['study_desc', 'study_title', 'min_age']]
X_test = test[['study_desc', 'study_title', 'min_age']]
ValueError: Found input variables with inconsistent numbers of samples: [3, 3649]
Правка: пробую решение по предложенной ссылке:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks one entry out of its input by key.

    ``transform`` returns ``data[key]`` for whatever container is passed in,
    so the input must support ``[]`` lookup with ``key``.  The selector learns
    nothing during ``fit``.

    Args:
        key: The key handed to ``data[key]`` in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``.

        Raises:
            KeyError: Propagated from the lookup when ``data_dict`` has no
                entry for ``self.key``.
        """
        return data_dict[self.key]
# Multi-predictor variant of the Naive Bayes pipeline: a FeatureUnion builds
# one feature block per input column and stacks them side by side before the
# one-vs-rest classifier.
# The pipeline has to be fitted on an input that can be indexed by all three
# column names, e.g. train[['study_desc', 'study_title', 'min_age']].
# NOTE(review): a KeyError: 'min_age' presumably means the pipeline was given
# an input without that column (such as the single 'study_desc' Series) --
# confirm at the call site.
NB_pipeline_multi = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            # A list key makes the selector return a 2-D (n_samples, 1)
            # frame.  A plain 'min_age' key would yield a 1-D Series, which
            # cannot be stacked next to the 2-D tf-idf matrices.
            ('min_age', Pipeline([
                ('selector', ItemSelector(key=['min_age'])),
            ])),
            # Text columns: select the column, then vectorise it with TF-IDF.
            ('study_title', Pipeline([
                ('selector', ItemSelector(key='study_title')),
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
            ])),
            ('study_desc', Pipeline([
                ('selector', ItemSelector(key='study_desc')),
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
            ])),
        ],
    )),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])
KeyError: 'min_age'