# You can try building a solution based on the code below.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt
# Sample corpus: four short passages about natural history.
Doc1 = (
    'Natural history is a domain of inquiry involving organisms including '
    'animals, fungi and plants in their environment; leaning more towards '
    'observational than experimental methods of study. A person who studies '
    'natural history is called a naturalist or natural historian'
)
Doc2 = 'Natural history encompasses scientific research but is not limited to it.'
Doc3 = 'It involves the systematic study of any category of natural objects or organisms'
Doc4 = (
    "So while it dates from studies in the ancient Greco-Roman world and the "
    "mediaeval Arabic world, through to European Renaissance naturalists "
    "working in near isolation, today's natural history is a cross discipline "
    "umbrella of many specialty sciences"
)

# Two-word phrases whose per-document frequency is of interest.
my_vocabulary = [
    'natural history',
    'natural historian',
    'scientific research',
    'natural objects',
    'European Renaissance',
]

# Collect every document, lower-cased, in Doc1..Doc4 order.
docs = list(map(str.lower, (Doc1, Doc2, Doc3, Doc4)))
# Learn the feature set from the phrase list rather than from the documents,
# so that only the phrases of interest are counted. With ngram_range=(2, 3)
# each phrase contributes its word bigrams (and trigrams, for phrases of
# three or more words).
vectorizer = CountVectorizer(ngram_range=(2, 3))
vectorizer.fit(my_vocabulary)

# Count how often each learned phrase occurs in every document.
term_f = vectorizer.transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per phrase.
df_freqs = pd.DataFrame(term_f.toarray(), columns=feature_names)
# Number the rows from 1 so the labels line up with Doc1..Doc4.
df_freqs.index = ['Doc{}'.format(i) for i in range(1, len(docs) + 1)]
# Draw one bar per document showing the count in the "natural history"
# column of the frequency table, then label the figure.
df_freqs['natural history'].plot(kind='bar')
plt.title('Frequency of "natural history" in Docs')
plt.ylabel('Frequency')