可以将人名从文本中删除，或者将人名视为普通单词保留在文本中。代码实现如下：
# Example: strip person names from a text before fitting a BERTopic model.
import spacy
from bertopic import BERTopic
from nltk.tokenize import word_tokenize

# Single source of truth for the sample sentence, so the NER step and the
# tokenisation step always operate on the same text.
text = "John Smith is a software engineer"

# Collect the text of every entity that spaCy labels as PERSON.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
print(names)  # Output: ['John Smith']

# An entity such as "John Smith" spans several tokens, so comparing each
# single token against the whole entity string never matches and nothing
# would be removed. Break the names into their own tokens (with the same
# tokenizer used on the text) and filter against that set instead.
name_tokens = {part for name in names for part in word_tokenize(name)}

tokens = word_tokenize(text)
cleaned_tokens = [token for token in tokens if token not in name_tokens]
cleaned_text = " ".join(cleaned_tokens)

topic_model = BERTopic(language="english")
# NOTE(review): only one document is passed here; topic models are normally
# fitted on a corpus of many documents, so this call is presumably
# illustrative only -- confirm it runs with a single-element list.
topic_model.fit_transform([cleaned_text])