NLP15-基于keras的中文情感挖掘试探
发布时间:2020-12-14 04:55:04 所属栏目:大数据 来源:网络整理
导读:摘要:keras开发,tf为后端;采用了两个样本(ChnSentiCorp_htl_ba_2000与imdb),三个神经网络的试探性运行(全连接的一般神经网络NN,LSTM,CNN),感觉keras比tf写代码更简单。对于NN只要参数足够的多,会拟合得很好,不过这样产生了过拟合;LSTM比CNN运
摘要:keras开发,tf为后端;采用了两个样本(ChnSentiCorp_htl_ba_2000与imdb),三个神经网络的试探性运行(全连接的一般神经网络NN,LSTM,CNN),感觉keras比tf写代码更简单。对于NN只要参数足够的多,会拟合得很好,不过这样产生了过拟合;LSTM比CNN运行的效果好很多。 keras的中文文档: http://keras-cn.readthedocs.io/en/latest/ NN下载数据 # 探索一下数据情况
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import imdb
from keras.layers import Embedding,Flatten,Dense
from keras.models import Sequential
from keras.preprocessing import sequence
## EDA
# Load the IMDB dataset (downloaded on first use from
# https://s3.amazonaws.com/text-datasets/imdb_full.pkl); each review is
# already encoded as a list of integer word ids.
(x_train, y_train), (x_test, y_test) = imdb.load_data()
# Look at the distribution of review lengths.
lens = list(map(len, x_train))
avg_len = np.mean(lens)
print(avg_len)
plt.hist(lens, bins=range(min(lens), max(lens) + 50, 50))
plt.show()
# Reviews have different lengths; pad/truncate them all to a common length.
m = max(max(map(len, x_train)), max(map(len, x_test)))
print('m=%d' % m)
maxword = min(400, m)
x_train = sequence.pad_sequences(x_train, maxlen=maxword)
x_test = sequence.pad_sequences(x_test, maxlen=maxword)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
# Vocabulary size = largest word id + 1.
# BUG FIX: the original scanned only x_train; a word id that appears only in
# x_test would then be out of range for the Embedding layer at validation time.
vocab_siz = max(int(np.max(x_train)), int(np.max(x_test))) + 1
print('vocab_siz=%d' % vocab_siz)
print('x_train.shape=[%d,%d]' % (x_train.shape[0], x_train.shape[1]))
# Build the model.
model = Sequential()
# Embedding layer: a vocab_siz x 64 lookup table.
model.add(Embedding(vocab_siz, 64, input_length=maxword))
# Flatten each review to a (maxword * 64)-dim vector.
model.add(Flatten())
# Stack of fully connected layers.
model.add(Dense(2000, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(50, activation='relu'))
# Sigmoid output in (0, 1), like logistic regression.
model.add(Dense(1, activation='sigmoid'))
# Compile with binary cross-entropy for the 0/1 sentiment label.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
print(type(x_train))
# Train (nb_epoch is the old Keras 1.x keyword, consistent with this file).
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          batch_size=100, nb_epoch=20, verbose=1)
score = model.evaluate(x_test, y_test)
print(score)
控制台显示: ________________________
Layer (type) Output Shape Param # Connected to ====================================================================================================
embedding_1 (Embedding) (None,400,64) 5669568 embedding_input_1[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten) (None,25600) 0 embedding_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None,2000) 51202000 flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense) (None,500) 1000500 dense_1[0][0]
____________________________________________________________________________________________________
dense_3 (Dense) (None,200) 100200 dense_2[0][0]
____________________________________________________________________________________________________
dense_4 (Dense) (None,50) 10050 dense_3[0][0]
____________________________________________________________________________________________________
dense_5 (Dense) (None,1) 51 dense_4[0][0] ====================================================================================================
Total params: 57,982,369
Trainable params: 57,982,369
Non-trainable params: 0
Epoch 1/20
25000/25000 [==============================] - 452s - loss: 0.4248 - acc: 0.7779 - val_loss: 0.2961 - val_acc: 0.8768
Epoch 2/20
25000/25000 [==============================] - 458s - loss: 0.0779 - acc: 0.9730 - val_loss: 0.4230 - val_acc: 0.8503
Epoch 3/20
25000/25000 [==============================] - 450s - loss: 0.0050 - acc: 0.9985 - val_loss: 0.7284 - val_acc: 0.8522
Epoch 4/20
25000/25000 [==============================] - 452s - loss: 0.0031 - acc: 0.9990 - val_loss: 0.9187 - val_acc: 0.8420
Epoch 5/20
25000/25000 [==============================] - 449s - loss: 0.0052 - acc: 0.9982 - val_loss: 1.0336 - val_acc: 0.8362
LSTM语料的样子,网上找到的情感分析语料“ChnSentiCorp_htl_ba_2000”。上面的例子中,显示出来的数据都是一个整数,这个整数是词的id,上面把构建id的过程省略了,这个LSTM的例子把词转id的过程补回来;同时,把对这个语料的处理过程也补上:首先把pos与neg各自的文件分别合并成一个文档,然后再把这两个合成一个文档,分词。把分好词的文档运用Gensim构建词典,形成id与词的一个映射,转成以整数为id的向量矩阵。探索句子长度的数据,选择合适的维度,如果是长句进行截断,如果是短句进行填充。处理完就可以构建LSTM了,训练,评估。。。。 代码: # -*- coding:utf-8-*-
import os
import re
import numpy as np
import matplotlib.pyplot as plt
# 分词
from pprint import pprint
import jieba
from bs4 import BeautifulSoup
from gensim import corpora
from keras.layers import Embedding,LSTM,Dense,Activation,Dropout
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.cross_validation import train_test_split
def cutPhase(inFile, outFile):
    """Segment each line of inFile with jieba, drop stop words, and write
    the space-joined tokens to outFile (one output line per input line).

    HTML markup is stripped from every line before segmentation.
    """
    # If you have a custom user dictionary, load it here:
    # jieba.load_userdict("dict_all.txt")
    # Load stop words into a dict for O(1) membership tests.
    with open('data/stopword.txt', 'r', encoding='utf-8') as f_stop:
        stoplist = {}.fromkeys(line.strip() for line in f_stop)
    count = 0
    # BUG FIX: use `with` so both files are closed even on error (the
    # original relied on explicit close() calls with no try/finally).
    with open(inFile, encoding='utf-8') as f_in, \
            open(outFile, 'w+', encoding='utf-8') as f_out:
        for line in f_in:
            # Strip any HTML tags, keeping only the text content.
            text = BeautifulSoup(line, "lxml").text
            # Segment into words (accurate mode).
            segs = jieba.cut(text, cut_all=False)
            # Filter out stop words and whitespace-only tokens.
            # BUG FIX: the original tested `word.lstrip() is not None`,
            # which is always true for a string, so it filtered nothing.
            segs = [word for word in segs
                    if word.strip() and word.strip() not in stoplist]
            # One space-separated line of tokens per input line.
            f_out.write(" ".join(segs))
            # BUG FIX: the original wrote the literal character 'n'
            # (a mangled '\n'); a real newline is intended.
            f_out.write('\n')
            count += 1
            if count % 100 == 0:
                print(count)
def load_data(out_pos_name='data/pos.txt', out_neg_name='data/neg.txt'):
    """Collect the ChnSentiCorp_htl_ba_2000 corpus into two flat files.

    Walks the pos/ and neg/ directories, reads each GBK-encoded review
    file and appends it as a single whitespace-normalized line to the
    corresponding UTF-8 output file. Unreadable files are skipped.
    """
    def do_load(file_name, dir_name):
        # Write one output line per successfully read source file.
        c = 0
        # BUG FIX: the output file must be opened for writing — the
        # original opened it in the default read mode and then wrote to it.
        with open(file_name, 'w', encoding='utf-8') as f_out:
            for root, _, files in os.walk(dir_name):
                for f_name in files:
                    p = os.path.join(root, f_name)
                    try:
                        with open(p, mode='r', encoding='gbk') as f_read:
                            c += 1
                            txt = f_read.read()
                            # Collapse every run of whitespace (including
                            # newlines) to a single space so the whole
                            # review fits on one line.
                            # BUG FIX: pattern was the mangled r's+'.
                            txt = re.subn(r'\s+', ' ', txt)[0]
                            # BUG FIX: '%sn' wrote a literal 'n'; a real
                            # newline terminator is intended.
                            f_out.write('%s\n' % txt)
                    except Exception:
                        # Some corpus files are not valid GBK — report and
                        # skip them (deliberate best-effort behavior).
                        print('p:', p)
    print('加载pos!!!')
    do_load(out_pos_name, 'data/ChnSentiCorp_htl_ba_2000/pos')
    print('加载neg!!!')
    do_load(out_neg_name, 'data/ChnSentiCorp_htl_ba_2000/neg')
def combine_data():
    """Merge data/pos.cut and data/neg.cut into data/train.cut.

    Each output line is '<label>\\t<segmented sentence>' where the label
    is 1 for positive reviews and 0 for negative ones. Prints the running
    line count after each source file, matching the original behavior.
    """
    c = 0
    # BUG FIX: the merged file must be opened for writing — the original
    # opened it in read mode and then called write() on it.
    with open('data/train.cut', 'w', encoding='utf-8') as f_w:
        with open('data/pos.cut', encoding='utf-8') as f_pos:
            for line in f_pos:
                c += 1
                # BUG FIX: '%dt%s' wrote a literal 't'; a tab separator
                # between label and sentence is intended.
                f_w.write('%d\t%s' % (1, line))
        print(c)
        with open('data/neg.cut', encoding='utf-8') as f_neg:
            for line in f_neg:
                c += 1
                f_w.write('%d\t%s' % (0, line))
        print(c)
if __name__ == '__main__':
    # One-off preprocessing pipeline — uncomment to rebuild the corpus:
    # load_data(out_pos_name='data/pos.txt', out_neg_name='data/neg.txt')
    # cutPhase(inFile='data/pos.txt', outFile='data/pos.cut')
    # cutPhase(inFile='data/neg.txt', outFile='data/neg.cut')
    # combine_data()

    # Each line of train.cut is '<label>\t<space-separated words>'.
    Y = []
    x = []
    for line in open('data/train.cut', encoding='utf-8'):
        # BUG FIX: split on a real tab — the garbled source split on 't',
        # which would cut words containing that letter.
        label, sentence = line.split('\t')
        Y.append(int(label))
        x.append(sentence.split())
    print('#构建字典')
    # Build the word <-> id mapping from the tokenized sentences.
    dic = corpora.Dictionary(x)
    # Encode every sentence as a list of integer word ids.
    X = [[dic.token2id[w_i] for w_i in row] for row in x]
    X = np.array(X)
    Y = np.array(Y)
    # Sentences differ in length (avg ~38.18, max 337); truncate long ones
    # and pad short ones to a common length.
    m = max(list(map(len, X)))
    print('m=%d' % m)
    maxword = min(100, m)
    X = sequence.pad_sequences(X, maxlen=maxword)
    print(X.shape)
    ## Train/test split.
    # BUG FIX: the original unpacked only three names from
    # train_test_split, which returns four arrays and would raise.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42)
    # Build the model: embedding -> LSTM -> sigmoid output.
    model = Sequential()
    # +1 because gensim ids start at 0 and padding uses id 0 as well.
    model.add(Embedding(len(dic) + 1, 128, input_length=maxword))
    model.add(LSTM(128, dropout_W=0.2))
    model.add(Dense(1))
    model.add(Activation("sigmoid"))
    # Binary cross-entropy for the 0/1 sentiment label.
    model.compile(loss="binary_crossentropy", optimizer="adam",
                  metrics=["accuracy"])
    print(model.summary())
    # Train.
    # BUG FIX: the original fit() call was garbled and syntactically
    # invalid; reconstructed to match the CNN section's parallel call.
    model.fit(x_train, y_train, batch_size=100, nb_epoch=10,
              validation_data=(x_test, y_test))
    ## Evaluate on the held-out split.
    score, acc = model.evaluate(x_test, y_test, batch_size=100)
    print("score: %.3f,accuracy: %.3f" % (score, acc))
    # Example prediction on new sentences (uncomment to use).
    # NOTE(review): words missing from the dictionary would raise KeyError
    # here — handling OOV words is left as a future improvement.
    # my_sentences = ['讨厌 房间']
    # my_x = []
    # for s in my_sentences:
    #     my_x.append([dic.token2id[w_j] for w_j in s.split()])
    # my_X = sequence.pad_sequences(np.array(my_x), maxlen=maxword)
    # labels = [int(round(p[0])) for p in model.predict(my_X)]
    # for i in range(len(my_sentences)):
    #     print('%s:%s' % ('正面' if labels[i] == 1 else '负面', my_sentences[i]))
# 这里面没有考虑到字典没有的词,这个作为下一个版本改进点。
LSTM模型:
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to ====================================================================================================
embedding_1 (Embedding) (None,100,64) 748800 embedding_input_1[0][0]
____________________________________________________________________________________________________
lstm_1 (LSTM) (None,128) 98816 embedding_1[0][0]
____________________________________________________________________________________________________
lstm_2 (LSTM) (None,64) 49408 lstm_1[0][0]
____________________________________________________________________________________________________
lstm_3 (LSTM) (None,32) 12416 lstm_2[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None,1) 33 lstm_3[0][0]
____________________________________________________________________________________________________
activation_1 (Activation) (None,1) 0 dense_1[0][0] ====================================================================================================
Total params: 909,473
Trainable params: 909,473
score: 0.572,accuracy: 0.859
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to ====================================================================================================
embedding_1 (Embedding) (None,128) 98816 embedding_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None,1) 129 lstm_1[0][0]
____________________________________________________________________________________________________
activation_1 (Activation) (None,1) 0 dense_1[0][0] ====================================================================================================
Total params: 847,745
Trainable params: 847,745
Non-trainable params: 0
score: 0.302,accuracy: 0.871
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to ====================================================================================================
embedding_1 (Embedding) (None,128) 1497600 embedding_input_1[0][0]
____________________________________________________________________________________________________
lstm_1 (LSTM) (None,128) 131584 embedding_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None,1) 0 dense_1[0][0] ====================================================================================================
Total params: 1,629,313
Trainable params: 1,629,313
Non-trainable params: 0
score: 0.386,accuracy: 0.874
CNN对上面的LSTM模型结构考虑使用CNN,主要修改神经网络模型。 # 构建模型CNN
# Build the CNN model: embedding -> two conv/pool/dropout stages -> MLP head.
model = Sequential()
model.add(Embedding(len(dic) + 1, 128, input_length=maxword))
# First 1-D convolution over the word-embedding sequence.
model.add(Conv1D(nb_filter=128, filter_length=5, border_mode='same',
                 activation='relu'))
model.add(MaxPooling1D(pool_length=2))
model.add(Dropout(0.25))
# Second, narrower convolution stage.
model.add(Conv1D(nb_filter=128, filter_length=3, activation='relu'))
model.add(MaxPooling1D(pool_length=2))
model.add(Dropout(0.25))
model.add(Flatten())
# Fully connected classification head.
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
# BUG FIX: binary cross-entropy needs a probability in (0, 1); the
# original used 'relu' here, whose unbounded / zero-clamped output makes
# the loss ill-defined and can stall training entirely.
model.add(Dense(1, activation='sigmoid'))
# Compile and train.
model.compile(loss="binary_crossentropy", optimizer="rmsprop",
              metrics=["accuracy"])
print(model.summary())
model.fit(x_train, y_train, batch_size=100, nb_epoch=20,
          validation_data=(x_test, y_test))
## Evaluate on the held-out split.
score, acc = model.evaluate(x_test, y_test, batch_size=100, verbose=1)
print("score: %.3f,accuracy: %.3f" % (score, acc))
____________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to ====================================================================================================
embedding_1 (Embedding) (None,128) 1497600 embedding_input_1[0][0]
____________________________________________________________________________________________________
convolution1d_1 (Convolution1D) (None,128) 82048 embedding_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_1 (MaxPooling1D) (None,50,128) 0 convolution1d_1[0][0]
____________________________________________________________________________________________________
dropout_1 (Dropout) (None,128) 0 maxpooling1d_1[0][0]
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D) (None,128) 49280 dropout_1[0][0]
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D) (None,25,128) 0 convolution1d_2[0][0]
____________________________________________________________________________________________________
dropout_2 (Dropout) (None,128) 0 maxpooling1d_2[0][0]
____________________________________________________________________________________________________
flatten_1 (Flatten) (None,3200) 0 dropout_2[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (None,64) 204864 flatten_1[0][0]
____________________________________________________________________________________________________
dense_2 (Dense) (None,32) 2080 dense_1[0][0]
____________________________________________________________________________________________________
dense_3 (Dense) (None,1) 33 dense_2[0][0] ====================================================================================================
Total params: 1,835,905
Trainable params: 1,835,905
Non-trainable params: 0
____________________________________________________________________________________________________
这里主要是为了测试,效果不怎么样。没有前面的LSTM好,考虑研究改进,或考虑学习CNNText… [happyprince,http://blog.csdn.net/ld326/article/details/78670821] (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |