
Chinese Sentiment Analysis with GloVe + LSTM



I recently experimented with sentiment analysis for Chinese text.

The approach mainly uses GloVe word vectors and an LSTM. The corpus is the ChnSentiCorp Chinese hotel review dataset.

1. First, train GloVe to obtain word vectors (300-dimensional here), using jieba for word segmentation and Chinese Wikipedia as the training corpus (a sketch of this step follows below).

2. Clean and segment the hotel review corpus, then convert the tokens into their word-vector representations.

3. Train an LSTM network on the vectorized reviews.

The final validation accuracy is around 91%.
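Step 1 is not part of the script below. As a minimal sketch of that step (the file names zhwiki.txt and zhwiki_seg.txt and the GloVe toolkit invocation are assumptions, not details from the original post), segmenting a plain-text Chinese Wikipedia dump with jieba so that GloVe can train on space-separated tokens might look like this:

import jieba

# Hypothetical paths: zhwiki.txt is a plain-text dump of Chinese Wikipedia;
# zhwiki_seg.txt receives one jieba-segmented, space-separated line per input line.
with open('zhwiki.txt', encoding='utf-8') as fin, \
     open('zhwiki_seg.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(' '.join(jieba.cut(line.strip())) + '\n')

The segmented file can then be fed to the Stanford GloVe toolkit (for example via its demo.sh, with the corpus path and VECTOR_SIZE=300 set accordingly) to produce the 300d vectors loaded below.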

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 30 13:52:23 2018

@author: xyli
Process the hotel review corpus:
segment it with jieba and convert the tokens to GloVe vectors.
"""
import os
import re
import jieba
import numpy as np

from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential
# from Attention_layer import Attention_layer  # optional custom layer, only used in the commented-out lines below


def loadGLoveModel(filename):
    """Load pre-trained GloVe vectors from a text file into a dict."""
    embeddings_index = {}
    f = open(filename, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index
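# Each line of the GloVe text file has the form "<word> <v1> ... <v300>"
# (space-separated), the standard plain-text output of the GloVe toolkit.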

def word2Glovec(List, model):
    """Map a list of tokens to GloVe vectors; unknown tokens get a zero vector."""
    vec = []
    insert = np.zeros(300, dtype='float32')  # 300 is the dimensionality of the vectors
    for w in List:
        v = model.get(w)
        if v is None:
            vec.append(insert)
        else:
            vec.append(v)
    return vec

def clean_str(string):
    """
    String cleaning for the dataset: strip backslashes, quotes,
    line breaks, and common ASCII/fullwidth punctuation.
    """
#    string = string.decode('utf-8')
    string = re.sub(r"\\", "", string)
    string = re.sub(r"'", "", string)
    string = re.sub(r"\"", "", string)
    string = re.sub(r"\r\n", "", string)
    string = re.sub(r"\r", "", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"\.", "", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"。", "", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"(", "", string)
    string = re.sub(r")", "", string)
    string = re.sub(r"“", "", string)
    string = re.sub(r"”", "", string)
    return string.strip()
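# Example: clean_str('酒店(很好)。') returns '酒店很好'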

def fitList(List, n):
    """Pad or truncate a token list to exactly n tokens."""
    L = len(List)
    insert = '!'  # padding token; word2Glovec maps it to the zero vector if it is absent from GloVe
    if L < n:
        d = n - L
        appList = [insert for i in range(d)]
        List += appList
    else:
        if L > n:
            List = List[0:n]
    return List

def readData(filename):
    """Read one review file, clean it, and segment it with jieba."""
    with open(filename, 'rb') as f:
        data = f.read()
        data = data.decode('gb18030', 'ignore')
        data = clean_str(data)
        seg_list = jieba.cut(data)  # accurate mode by default
    segList = []
    for s in seg_list:
        s = clean_str(s)
        segList.append(s)
    return segList
        
def loadData():
    """Load the ChnSentiCorp reviews; negative -> [0,1], positive -> [1,0]."""
    Corpus_DIR = "data/ChnSentiCorp_htl_unba_10000"
    DIR = ['/neg', '/pos']
    commentList = []
    rootdir = Corpus_DIR + DIR[0]
    filelist = os.listdir(rootdir)  # list everything in the folder
    labelList = [[0.0, 1.0] for i in range(0, len(filelist))]
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            templist = readData(path)
            commentList.append(templist)

    rootdir = Corpus_DIR + DIR[1]
    filelist = os.listdir(rootdir)  # list everything in the folder
    labelList2 = [[1.0, 0.0] for i in range(0, len(filelist))]
    for i in range(0, len(filelist)):
        path = os.path.join(rootdir, filelist[i])
        if os.path.isfile(path):
            templist = readData(path)
            commentList.append(templist)
    labelList += labelList2
    return commentList, labelList

if __name__ == '__main__':
    List, labelList = loadData()  # load the corpus
    gloveModel = loadGLoveModel('model/zhs_wiki_glove.vectors.300d.txt')  # load the GloVe vectors
    countList = []
    commentVecList = []
    n = 100  # pad/truncate every review to 100 tokens
    for c in List:
        countList.append(len(c))
        glovec = word2Glovec(fitList(c, n), gloveModel)
        commentVecList.append(glovec)

    VALIDATION_SPLIT = 0.2

    commentVecList = np.array(commentVecList)
    labelList = np.array(labelList)
    indices = np.arange(commentVecList.shape[0])
    np.random.shuffle(indices)
    data = commentVecList[indices]
    labels = labelList[indices]

    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]

    model = Sequential()
    model.add(LSTM(120, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
#    model.add(Activation('relu'))  # activation layer
#    model.add(Attention_layer())
    model.add(Bidirectional(LSTM(60, return_sequences=True)))
#    model.add(Attention_layer())
#    model.add(Activation('relu'))  # activation layer
    model.add(Dropout(0.3))  # randomly drop units for regularization
    model.add(Bidirectional(LSTM(30, return_sequences=False)))
    model.add(Dropout(0.3))  # randomly drop units for regularization
    model.add(Dense(y_train.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=25, batch_size=200)
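
    # Sketch: classify one new review with the trained model, reusing clean_str,
    # fitList and word2Glovec from above; the sample sentence is a made-up example.
    sample = '房间很干净,服务也很好'
    tokens = [clean_str(t) for t in jieba.cut(clean_str(sample))]
    vec = np.array([word2Glovec(fitList(tokens, n), gloveModel)])  # shape (1, 100, 300)
    pred = model.predict(vec)[0]
    print('positive' if pred[0] > pred[1] else 'negative')  # positive label is [1.0, 0.0]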
   
    

This post is still a work in progress...


Original post: https://www.cnblogs.com/xyli09/p/9183354.html
