Вы можете отфильтровать значения None:
# Pull the full dataset into a pandas DataFrame.
# NOTE(review): `d` is presumably a Spark DataFrame (it exposes
# `select`/`toPandas`) — confirm the data fits in driver memory.
data = d.select("*").toPandas()

# Hold out 20% of the rows. The seed is spelled as the integer 1: the
# previous `True` only worked because bool is an int subclass (True == 1),
# so the resulting split is unchanged but the intent is now explicit.
train, test = train_test_split(data, test_size=0.20, random_state=1)

# Tag every row with the split it belongs to. `assign` returns new frames
# instead of writing into the objects handed back by the split, which avoids
# pandas' SettingWithCopyWarning on versions where those objects are flagged
# as derived from `data`.
train = train.assign(set='train')
test = test.assign(set='test')

# Stack both splits back into a single frame with a fresh 0..n-1 index; the
# 'set' column keeps track of which rows are train and which are test.
data = pd.concat([train, test], ignore_index=True)
def clean_text(text):
    """Lower-case *text* and remove punctuation characters from it.

    Args:
        text: The string to normalise, or ``None`` for a missing value.

    Returns:
        The lower-cased text with every character found in ``punctuation``
        dropped; an empty string when *text* is ``None``.
    """
    # Guard before touching the value: the earlier version tested
    # `text is not None` inside the comprehension, i.e. only after
    # `text.lower()` had already raised AttributeError for None.
    if text is None:
        return ""
    return "".join(c for c in text.lower() if c not in punctuation)
# Normalise the raw text column. Missing values (None/NaN) are replaced with
# an empty string first, so neither the cleaning step nor the vectorizer ever
# receives a non-string value.
data['text_cleaned'] = data['text'].fillna('').map(clean_text)

# Fit the vectorizer on the cleaned text.
# NOTE(review): presumably scikit-learn's TfidfVectorizer; it is fitted on
# every row of `data` — confirm that including any held-out rows is intended.
tfidf = TfidfVectorizer()
tfidf.fit(data['text_cleaned'])