Я разговаривал со Стивеном Рассеттом о Chronic. Я предложил пример на Python после того, как он предложил токенизацию.
Вот пример на Python. Его вывод вы передаёте в Chronic.
import nltk
import MySQLdb
import time
import string
import re
# Extract the date-like words from a free-form sentence: tokenize it, tag
# each token with its part of speech, keep only approved tags plus a small
# white list, and print the result at every stage.

# Tokenize and POS-tag the sentence.
sentence = 'Available June 9 -- August first week'
tokens = nltk.word_tokenize(sentence)
parts_of_speech = nltk.pos_tag(tokens)
print(parts_of_speech)

# Words that are kept regardless of their POS tag.
white_list = ['first']

# POS tags that are kept. NOTE: despite the variable name these are not
# prepositions -- in the Penn Treebank tag set (presumably what nltk.pos_tag
# emits here) NNP is a proper noun and CD is a cardinal number.
approved_prepositions = ['NNP', 'CD']

filtered = []
for word in parts_of_speech:
    # Each entry is indexed as word[0] = token text, word[1] = its tag.
    # Both checks are substring matches, so e.g. 'NNP' also accepts 'NNPS'.
    if any(x in word[1] for x in approved_prepositions):
        filtered.append(word[0])
    elif any(x in word[0] for x in white_list):
        # Token matches the white list: keep it even though its tag is
        # not approved.
        filtered.append(word[0])
print(filtered)

# Join the kept tokens with spaces and collapse any whitespace followed by
# a run of non-word characters into a single space.
normalized = re.sub(r'\s\W+', ' ', ' '.join(filtered))
# Print the normalized string (the original printed `filtered` again, so
# the result of the substitution was never shown).
print(normalized)