NLP Notes
- Author: chaihahaha
- Date:
- Category: AI Notes
Variable-Length Word Vector Input (Deprecated)
In Keras, when initializing an LSTM layer via model.add(), setting input_shape=(None, dim), where dim is the dimension of each vector in the input sequence, allows sequences of arbitrary length: the number of time steps can vary, so there is no need to zero-pad every sequence to a common length.
Example
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np

model = Sequential()
# input_shape=(None, 2): the time dimension is unspecified, so each
# batch may contain sequences of any length
model.add(LSTM(32, return_sequences=True, input_shape=(None, 2)))
model.add(LSTM(8))  # no return_sequences: one state vector per sequence
model.add(Dense(1, activation='sigmoid'))
model.summary(line_length=90)
model.compile(loss='mean_squared_error', optimizer='adam')

# two samples with different numbers of time steps (6 vs. 3)
X1 = np.array([[[0.25908799, 0.96578602],
                [0.22886421, 0.16556086],
                [0.82094901, 0.69984487],
                [0.97888577, 0.76304284],
                [0.28470417, 0.11232793],
                [0.23395936, 0.14732181]]])
Y1 = np.array([[1.0]])
X2 = np.array([[[0.41400308, 0.48925297],
                [0.99921471, 0.6069814 ],
                [0.61261462, 0.7192767 ]]])
Y2 = np.array([[0.5]])

# sequences of unequal length must be fed in separate batches
model.fit(X1, Y1, epochs=10, verbose=1)
model.fit(X2, Y2, epochs=10, verbose=1)
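Because the time dimension is left as None, the trained model accepts sequences of any length at inference time, e.g.:
# any number of time steps works as long as the feature dimension is 2
print(model.predict(np.random.rand(1, 11, 2)).shape)  # (1, 1)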
However, I personally do not recommend this, because with variable-length input data, model.predict_generator() runs into the following bug at prediction time.
ValueError Traceback (most recent call last)
<ipython-input-17-11e3da5819e6> in <module>()
----> 1 model.predict_generator(generatorX(X,wv_model),steps=10, verbose=1)
E:\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in predict_generator(self, generator, steps, max_queue_size, workers, use_multiprocessing, verbose)
2296 workers=workers,
2297 use_multiprocessing=use_multiprocessing,
-> 2298 verbose=verbose)
2299
2300 def _get_callback_model(self):
E:\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training_generator.py in predict_generator(model, generator, steps, max_queue_size, workers, use_multiprocessing, verbose)
435 return all_outs[0][0]
436 else:
--> 437 return np.concatenate(all_outs[0])
438 if steps_done == 1:
439 return [out[0] for out in all_outs]
ValueError: all the input array dimensions except for the concatenation axis must match exactly
This bug has been reported on GitHub, but it was never fixed in the old versions.
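The error comes from predict_generator calling np.concatenate on the per-batch outputs; when the model emits one output per time step, those arrays have mismatched time dimensions. A minimal workaround sketch, assuming you keep variable-length inputs: predict each sequence separately and keep the results in a plain Python list (sequences below is a hypothetical list of arrays of shape (timesteps_i, 2)).
# hypothetical workaround: one predict() call per sequence, so outputs
# of different lengths never need to be concatenated
preds = [model.predict(seq[np.newaxis, ...]) for seq in sequences]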
Using NLTK's Lemmatizer to Normalize English Word Tense and Number
When processing English text, stemming and lemmatization should be applied before word embedding to reduce the vocabulary size, e.g. mapping the inflected forms comes and horses to the stems come and hors.
Example
import nltk

ps = nltk.stem.PorterStemmer()
print(ps.stem("comes"))   # come
print(ps.stem("horses"))  # hors
If you only want to convert plural forms to singular, you can install the Pattern3 library with the following command:
pip install Pattern3
If you then hit the following error when using it:
File "/home/xxx/anaconda3/lib/python3.7/site-packages/pattern3/text/tree.py", line 37
except:
^
IndentationError: expected an indented block
edit the file /home/xxx/anaconda3/lib/python3.7/site-packages/pattern3/text/tree.py and change line 34 from:
from itertools import chain
try:
except:
    izip = zip  # Python 3
to:
try:
    from itertools import chain
except:
    izip = zip  # Python 3
If PyPI downloads are slow from inside China, create a file pip.conf under ~/.pip/ with the following content:
[global]
index-url = https://mirrors.aliyun.com/pypi/simple/
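Alternatively, the mirror can be specified for a single install only:
pip install -i https://mirrors.aliyun.com/pypi/simple/ Pattern3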
Example
from pattern3.en import singularize
print(singularize("comes"))   # come
print(singularize("horses"))  # horse
A better option is to lemmatize with NLTK, which normalizes not only number but also tense:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()  # requires the WordNet corpus: nltk.download('wordnet')
words = ['gave', 'went', 'going', 'dating', 'comes', 'horses']
for word in words:
    print(word + "-->" + lemmatizer.lemmatize(word, 'v'))
# gave-->give
# went-->go
# going-->go
# dating-->date
# comes-->come
# horses-->horse
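Note that the second argument is the WordNet part of speech ('v' for verb above). With the default pos='n' the words are treated as nouns, so verb forms pass through unchanged; a quick illustration:
lem = WordNetLemmatizer()
print(lem.lemmatize('dating'))       # dating -- default pos is 'n' (noun)
print(lem.lemmatize('dating', 'v'))  # date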
Shrinking a Word2vec Model
Pretrained word2vec models are often extremely heavy. GoogleNews-vectors-negative300.bin, for example, occupies nearly 4 GB of RAM after being loaded with gensim.models.KeyedVectors.load_word2vec_format(), which easily triggers a MemoryError during training. Our own corpus rarely needs that much vocabulary; most of the words in the model are never used, so it is worth shrinking the model down to just the words we need. The code below does exactly that.
import gensim
import nltk
import csv
import numpy as np

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

def find_minimal_vocab(X):
    # collect every token appearing in the two sentence columns
    vocab = set()
    for Xi in X:
        for j in range(2):
            vocab = vocab.union(set(tokenizer.tokenize(Xi[j])))
    return vocab
def restrict_w2v(w2v, restricted_word_set):
    # shrink the KeyedVectors in place, keeping only the words in
    # restricted_word_set (uses gensim 3.x attribute names)
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    for i in range(len(w2v.vocab)):
        word = w2v.index2entity[i]
        vec = w2v.vectors[i]
        vocab = w2v.vocab[word]
        if word in restricted_word_set:
            vocab.index = len(new_index2entity)
            new_index2entity.append(word)
            new_vocab[word] = vocab
            new_vectors.append(vec)
    w2v.vocab = new_vocab
    w2v.vectors = np.array(new_vectors)
    w2v.index2entity = new_index2entity
    w2v.index2word = new_index2entity
wv_model = gensim.models.KeyedVectors.load_word2vec_format(
    input("Path of the model file to shrink: "), binary=True)
def load_split(prompt):
    # read a tab-separated file; columns 5:7 hold the sentence pair,
    # column 4 the label (layout of this particular dataset)
    rows = []
    with open(input(prompt), newline='\n', encoding='utf8') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\t')
        for row in reader:
            rows.append(row)
    return [i[5:7] for i in rows], [i[4] for i in rows]

X, Y = load_split("Path of the training set: ")
Xd, Yd = load_split("Path of the dev set: ")
Xt, Yt = load_split("Path of the test set: ")
# union of the vocabularies of all three splits
v1 = find_minimal_vocab(X)
v2 = find_minimal_vocab(Xd)
v3 = find_minimal_vocab(Xt)
vocab = v1.union(v2, v3)
del v1, v2, v3

restrict_w2v(wv_model, vocab)
wv_model.save_word2vec_format(input("Output path for the shrunk model: "), binary=True)
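A quick sanity check, assuming the shrunk model was saved as small.bin (a hypothetical file name): reload it and confirm the vocabulary is now a subset of the corpus words.
# hypothetical check: the shrunk file loads fast and contains only
# words that were collected from the three splits
small = gensim.models.KeyedVectors.load_word2vec_format('small.bin', binary=True)
print(len(small.vocab))  # gensim 3.x vocabulary size
assert set(small.index2word) <= vocab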