NLP, document clustering
1 vote
/ 02 February 2020

I am using the HDBSCAN algorithm to create clusters from the documents I have. To build the feature matrix from the words, however, I am using tf-idf, and I want to switch to GloVe or Word2vec (tf-idf is based on bag-of-words, so it cannot capture semantics).

Which method should I use for clustering the text: GloVe, Word2vec, or something else entirely? And how can I implement it?

Any help will be highly appreciated!
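What I have in mind is roughly the following: train Word2vec on the cleaned tweets, average the word vectors of each document, and pass those dense vectors to HDBSCAN instead of the tf-idf matrix. A rough sketch (assuming gensim 4.x; the parameter values are placeholders, and synopses is the cleaned list built by my code further down):

import numpy as np
import hdbscan
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# tokenize the already cleaned documents
tokenized_docs = [word_tokenize(doc.lower()) for doc in synopses]

# train a small Word2vec model on the corpus itself
w2v = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5,
               min_count=2, workers=4)

def doc_vector(tokens, model):
    # average the vectors of the in-vocabulary tokens; zero vector if none are known
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not vecs:
        return np.zeros(model.wv.vector_size)
    return np.mean(vecs, axis=0)

doc_vectors = np.vstack([doc_vector(toks, w2v) for toks in tokenized_docs])

clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(doc_vectors)
print(clusterer.labels_)

Is this a sensible direction, or is there a better way to get document vectors for clustering? My current tf-idf based pipeline is below: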

import csv
import re
import string
import sys

import nltk
import pandas as pd
import hdbscan
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')  # required by sent_tokenize/word_tokenize

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)  # csvreader.next() was Python 2 only

    # extracting each data row one by one
    duplicates = 0
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                    line, flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = re.sub(
            r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", line,
            flags=re.MULTILINE)  # strip mentions, stray punctuation and URLs
        line = ''.join(filter(lambda x: x in string.printable,
                              line))  # keep only printable ASCII characters
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")


stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(
        text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


totalvocab_stemmed = []
totalvocab_tokenized = []

for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)

# print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"


# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                min_df=0.0, stop_words='english',
                                use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))

#CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() was removed in newer scikit-learn


c = hdbscan.HDBSCAN(min_cluster_size=5)
#PASS TFIDF_MATRIX TO HDBSCAN
c.fit(tfidf_matrix)
print(c.labels_)
sys.exit()
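For GloVe the averaging step from the sketch above would stay the same; only the model would come from pretrained vectors instead of being trained on the tweets (again only a sketch, assuming gensim 4.x and a locally downloaded glove.6B.100d.txt, neither of which is part of my current setup):

import numpy as np
from gensim.models import KeyedVectors

# pretrained GloVe vectors stored as a plain-text file (no header line)
glove = KeyedVectors.load_word2vec_format("glove.6B.100d.txt",
                                          binary=False, no_header=True)

def doc_vector_glove(tokens):
    # same averaging idea, but looking words up in the pretrained vectors
    vecs = [glove[t] for t in tokens if t in glove]
    return np.mean(vecs, axis=0) if vecs else np.zeros(glove.vector_size)

doc_vectors = np.vstack([doc_vector_glove(toks) for toks in tokenized_docs])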

...