NLTK resource loading error: "Please use the NLTK Downloader to obtain the resource: \n\n"
0 votes
February 6, 2020

I adapted the following code from a post by Susan Li, but I get an error when the code tries to tokenize the text using NLTK resources (or perhaps something is wrong with the "keyed vectors" loaded from the web). The error occurs in the 5th code block (see below; loading from the web can take a while):

## 1. load packages and data

import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
%matplotlib inline

df = pd.read_csv('https://www.dropbox.com/s/b2w7iqi7c92uztt/stack-overflow-data.csv?dl=1')
df = df[pd.notnull(df['tags'])]

my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']

## 2. cleaning

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):

    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    return text
    
df['post'] = df['post'].apply(clean_text)

## 3. train test split

X = df.post
y = df.tags
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

## 4. load keyed vectors from the web: will take a while to load

import gensim
word2vec_path = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
wv = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
wv.init_sims(replace=True)


## 5. this is where it goes wrong

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train, test = train_test_split(df, test_size=0.3, random_state = 42)

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values
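# NOTE: word_averaging_list() is not defined anywhere in this snippet; it comes
# from Susan Li's original post. A minimal sketch of the idea (hypothetical
# helpers, assuming `wv` supports `word in wv` and `wv[word]` lookups):
def word_averaging(wv, words):
    # average the vectors of the tokens that exist in the word2vec vocabulary
    vectors = [wv[word] for word in words if word in wv]
    if not vectors:
        # no known tokens: fall back to a zero vector of the right size
        return np.zeros(wv.vector_size)
    return np.array(vectors).mean(axis=0)

def word_averaging_list(wv, text_list):
    # stack one averaged vector per tokenized document
    return np.vstack([word_averaging(wv, tokens) for tokens in text_list])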

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)


## 6. perform logistic regression test

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.tags))
print(classification_report(test.tags, y_pred,target_names=my_tags))

Update to part 5 (following @luigigi's comments)

## 5. download nltk and use apply() function without using lambda

import nltk
nltk.download('punkt')  # the punkt resource is what sent_tokenize/word_tokenize need
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

train, test = train_test_split(df, test_size=0.3, random_state=42)

test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values

X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

## now run the test

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, train['tags'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, test.tags))
print(classification_report(test.tags, y_pred,target_names=my_tags))

This should work.

1 Answer

1 vote
February 6, 2020

The nltk tokenizer expects the punkt resource, so you need to download it first:

nltk.download('punkt')

Also, you don't need a lambda expression to apply your tokenizer function. You can simply use:

test_tokenized = test['post'].apply(w2v_tokenize_text).values
train_tokenized = train['post'].apply(w2v_tokenize_text).values
...