1、代码
def clean_text(text, remove_stopwords=False):
"""
数据清洗
"""
text = BeautifulSoup(text, 'html.parser').get_text()
text = re.sub(r'[^a-zA-Z]', ' ', text)
words = text.lower().split()
if remove_stopwords:
words = [w for w in words if w not in eng_stopwords]
return wordsdef to_review_vector(review):
"""
获取词向量
"""
global word_vec review = clean_text(review, remove_stopwords=True)
#print (review)
#words = nltk.word_tokenize(review)
word_vec = np.zeros((1,300))
for word in review:
#word_vec = np.zeros((1,300))
if word in model:
word_vec += np.array([model[word]])
#print (word_vec.mean(axis = 0))
return pd.Series(word_vec.mean(axis = 0))