Python - Как превратить n-граммовый CountVectorizer в конвейер с преобразователем, извлекающим столбцы - PullRequest
0 голосов
/ 28 июня 2018

У меня есть следующий код:

import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.grid_search import GridSearchCV

# Read only the two columns of interest from the Latin-1 ("ANSI") encoded file.
df = pd.read_csv(
    r'c:/papf.txt',
    encoding='latin1',
    usecols=['LAST_NAME', 'RACE'],
)

# Normalise the surnames: lower-case them, then strip out every space.
df['LAST_NAME'] = df['LAST_NAME'].str.lower().str.replace(' ', '')

# Keep only the rows whose race is African, White or Asian.
df = df[df['RACE'].isin(['African', 'White', 'Asian'])]

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Feature extractor: mean word length of every entry in ``LAST_NAME``.

    ``transform`` looks up the ``LAST_NAME`` key of whatever it is given and
    returns the result of applying ``average_word_length`` to each entry.
    """

    def __init__(self):
        pass

    def fit(self, df, y=None):
        """Nothing is learned from the data; the instance is returned as-is."""
        return self

    def average_word_length(self, name):
        """Return the mean length of the whitespace-separated words in ``name``."""
        word_lengths = [len(word) for word in name.split()]
        return np.mean(word_lengths)

    def transform(self, df, y=None):
        """Map each ``LAST_NAME`` value to its average word length."""
        last_names = df['LAST_NAME']
        return last_names.apply(self.average_word_length)

# Split into train and test sets with 20% used for testing.
# NOTE(review): the features passed here are the LAST_NAME column on its own
# (a Series), while AverageWordLengthExtractor.transform indexes its input
# with ['LAST_NAME'] -- it looks like the 'ave' branch expects a dataframe.
# Confirm, and add a column-extraction step in front of the vectorizer if so.
data_train, data_test, y_train, y_true = \
    train_test_split(df['LAST_NAME'], df['RACE'], test_size=0.2)

# Build the pipeline: character 1- to 4-grams plus the average word length
# are combined by the FeatureUnion and fed to a linear SVM.
ngram_count_pipeline = Pipeline([
    ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='char'))
])
pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('ngram', ngram_count_pipeline), # can pass in either a pipeline
        #('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='char')),
        ('ave', AverageWordLengthExtractor()) # or a transformer
    ])),
    ('clf', LinearSVC())  # classifier
])

# Train the classifier. The labels must be passed along with the features:
# the final LinearSVC step is supervised and cannot be fitted without them.
# (The pipeline already owns its LinearSVC, so no separate instance is needed.)
model = pipeline.fit(data_train, y_train)

# Test the classifier; y_test holds the predictions for the held-out names.
y_test = model.predict(data_test)

# Print the accuracy (fraction of held-out names classified correctly).
print(accuracy_score(y_true, y_test))
#one = ngram_counter.transform('chapman')
#print(model.predict(one))

Я написал этот код на основе этой отличной записи в блоге Мишель Фуллвуд.

Однако сообщение в блоге не раскрывает следующую часть:

Обратите внимание, что первый элемент в FeatureUnion это ngram_count_pipeline. Это просто Pipeline, созданный из преобразователя, извлекающего столбцы, и CountVectorizer (средство извлечения столбцов необходимо сейчас, когда мы работаем с кадром данных Pandas, а не напрямую отправляем список имен дорог через конвейер).

Итак, мой вопрос: как я могу добавить n-грамм CountVectorizer в качестве конвейера и как мне сделать часть извлечения столбцов?

Кроме того, как бы я использовал модель, чтобы делать предсказания для фамилии Чепмен?

Было бы также здорово получить точность и вероятности для каждого выходного класса.

Мои входные данные — это, по сути, фамилия, а выходные — раса.

Я также получаю следующие предупреждения, которые я не знаю, как решить:

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.  DeprecationWarning)

Я обновился до последней версии Anaconda (Python 3.6.5 | packaged by conda-forge | (default, Apr 6 2018, 16:13:16) [MSC v.1900 32 bit (Intel)] on win32), но предупреждения не исчезли.

Пример данных CSV:

LAST_NAME,RACE
Ramaepadi,African
Motsamai,African
Van Rooyen,White
Khan,Asian
Du Plessis,White
Singh,Asian
Madlanga,African
Janse van Rensburg,

1 Ответ

0 голосов
/ 29 июня 2018

Ответ нашёлся в той части блога, которую я пропустил:

http://nbviewer.jupyter.org/github/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

# Based on the following which has more examples:
# http://nbviewer.jupyter.org/github/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb
# http://michelleful.github.io/code-blog//2015/06/18/classifying-roads/
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# https://stackoverflow.com/questions/49466193/how-to-add-a-feature-to-a-vectorized-data-set/49501769#49501769

# Read only the two columns of interest from the Latin-1 ("ANSI") encoded file.
df = pd.read_csv(
    r'c:/race.txt',
    encoding='latin1',
    usecols=['LAST_NAME', 'RACE'],
)

# Lower-case every surname. Spaces are left in place (the space-stripping
# step is disabled); the word-based features further down split on whitespace.
df['LAST_NAME'] = df['LAST_NAME'].str.lower()

# Keep only the rows whose race is African, Coloured, White or Indian.
df = df[df['RACE'].isin(['African', 'Coloured', 'White', 'Indian'])]

# Transformer that pulls one named column out of a dataframe as a string array.
class TextExtractor(BaseEstimator, TransformerMixin):
    """Select a single column and pass it on as a numpy array of strings.

    Adapted from code by @zacstewart
    https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
    Also see Zac Stewart's excellent blogpost on pipelines:
    http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
    """

    def __init__(self, column_name):
        # Key of the column that transform() will look up.
        self.column_name = column_name

    def fit(self, *_):
        """Nothing to learn; all arguments are ignored and self is returned."""
        return self

    def transform(self, df):
        """Return ``df[self.column_name]`` converted to a numpy ``str`` array."""
        column = df[self.column_name]
        as_array = np.asarray(column)
        return as_array.astype(str)

class Apply(BaseEstimator, TransformerMixin):
    """Apply a scalar function element-wise to the numpy array it is given.

    Parameters
    ----------
    fn : callable
        Function taking a single element and returning a single value.
    """

    def __init__(self, fn):
        # Store the callable exactly as passed. scikit-learn's get_params() /
        # clone() rebuild an estimator from its constructor arguments and
        # expect each one to be kept unmodified under the same attribute name;
        # wrapping it here would make the estimator impossible to clone
        # (e.g. inside GridSearchCV or cross-validation helpers).
        self.fn = fn

    def transform(self, data):
        """Return ``fn`` applied to every element, as a single-column 2-d array."""
        vectorized_fn = np.vectorize(self.fn)
        # Note: reshaping is necessary because otherwise sklearn
        # interprets the 1-d array as a single sample.
        return vectorized_fn(data.reshape(data.size, 1))

    def fit(self, *_):
        """Nothing to learn; all arguments are ignored and self is returned."""
        return self

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Transformer yielding the average word length of each ``LAST_NAME`` entry.

    The input to ``transform`` is indexed with ``'LAST_NAME'`` and every
    entry is passed through ``average_word_length``.
    """

    def __init__(self):
        pass

    def fit(self, df, y=None):
        """No fitting is required, so simply hand back the instance."""
        return self

    def average_word_length(self, name):
        """Compute the mean length of the whitespace-separated words of ``name``."""
        lengths = [len(word) for word in name.split()]
        return np.mean(lengths)

    def transform(self, df, y=None):
        """Apply ``average_word_length`` to every value in ``LAST_NAME``."""
        names = df['LAST_NAME']
        return names.apply(self.average_word_length)

# Seed the generator so the same random 10% of the rows is drawn on every run.
random.seed(1965)
sample_size = int(len(df) / 10)
sampled_labels = random.sample(list(df.index.values), sample_size)
train_test_set = df.loc[sampled_labels]

# Features stay a one-column dataframe; the target is the race label.
X = train_test_set[['LAST_NAME']]
y = train_test_set['RACE']

vect = CountVectorizer(ngram_range=(1,4), analyzer='char')
clf = LinearSVC()

pipeline = Pipeline([
    # Pull the names out of the dataframe as an array of strings.
    ('name_extractor', TextExtractor('LAST_NAME')),
    ('text_features', FeatureUnion([
        # Character n-grams of each name.
        ('vect', vect),
        # Number of whitespace-separated words in each name.
        ('num_words', Apply(lambda s: len(s.split()))),
        # Mean length of those words.
        ('ave_word_length', Apply(lambda s: np.mean([len(w) for w in s.split()]))),
    ])),
    # Classify on the combined feature matrix.
    ('clf' , clf),
])

def run_experiment(X, y, pipeline, num_expts=100):
    """Fit and score ``pipeline`` on ``num_expts`` random train/test splits.

    Every round draws a fresh split with ``train_test_split`` (using its
    default proportions), refits ``pipeline`` on the training part and
    measures accuracy on the held-out part. The mean accuracy over all
    rounds is printed and also returned.

    Args:
        X: Feature data accepted by ``train_test_split`` and ``pipeline``.
        y: Target labels aligned with ``X``.
        pipeline: Estimator exposing ``fit(X, y)`` (returning the fitted
            model) and ``predict(X)``. It is refitted in place each round.
        num_expts: Number of independent splits to average over (>= 1).

    Returns:
        The mean accuracy across the ``num_expts`` rounds.

    Raises:
        ValueError: If ``num_expts`` is smaller than 1, since there would be
            no scores to average.
    """
    if num_expts < 1:
        raise ValueError(
            "num_expts must be at least 1, got {!r}".format(num_expts))

    scores = []
    for _ in range(num_expts):
        X_train, X_test, y_train, y_true = train_test_split(X, y)
        model = pipeline.fit(X_train, y_train)  # Train the classifier.
        y_pred = model.predict(X_test)          # Predict the held-out rows.
        # accuracy_score takes the true labels first, predictions second.
        scores.append(accuracy_score(y_true, y_pred))

    mean_score = sum(scores) / num_expts
    print(mean_score)
    return mean_score

# Average the accuracy over 100 independent random train/test splits.
run_experiment(X, y, pipeline, num_expts=100)
...