# NOTE: adapted from an online example; the original author reported that
# every sentiment score came out as zero.
import jieba
import numpy as np
def _load_words(path):
    """Load a one-word-per-line lexicon file into a set.

    BUG FIX: the original used ``open(...).readlines()``, which keeps the
    trailing ``'\n'`` on every entry, so membership tests like
    ``word in posdict`` could never match a jieba token — this is why all
    scores came out zero.  Stripping each line fixes the lookups, a set makes
    them O(1), and ``with`` closes the file handle (the original leaked it).
    """
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}


posdict = _load_words('C:/.../中文情感词典/正面.txt')            # positive sentiment words
negdict = _load_words('C:/.../中文情感词典/负面.txt')            # negative sentiment words
deny_word = _load_words('C:/.../中文情感词典/否定词.txt')        # negation words ("不", "没" ...)
mostdict = _load_words("C:/.../中文情感词典/extreme.txt")        # strongest degree adverbs (x2.0)
verydict = _load_words("C:/.../中文情感词典/very.txt")           # strong degree adverbs (x1.5)
moredict = _load_words("C:/.../中文情感词典/more.txt")           # moderate degree adverbs (x1.25)
ishdict = _load_words("C:/.../中文情感词典/ish.txt")             # slight degree adverbs (x1.2)
insufficientdict = _load_words("C:/.../中文情感词典/insufficiently.txt")  # weakening adverbs (x0.8)
inversedict = _load_words("C:/.../中文情感词典/over.txt")        # inverting adverbs (x-1)
degree_word = _load_words("C:/.../中文情感词典/程度级别词语.txt")  # all degree words combined
def match(word, sentiment_value):
    """Scale ``sentiment_value`` by the degree class that ``word`` belongs to.

    The lexicons are checked in priority order (strongest first); only the
    first match applies, mirroring an if/elif chain.  A word in no lexicon
    leaves the value unchanged.
    """
    scale_table = (
        (mostdict, 2.0),           # extreme intensifier
        (verydict, 1.5),           # strong intensifier
        (moredict, 1.25),          # moderate intensifier
        (ishdict, 1.2),            # slight intensifier
        (insufficientdict, 0.8),   # weakener
        (inversedict, -1),         # polarity inverter
    )
    for lexicon, factor in scale_table:
        if word in lexicon:
            sentiment_value *= factor
            break
    return sentiment_value
def judgeodd(num):
    """Return 'odd'/'even' for ``num`` negation words.

    BUG FIX: this helper was called by the original code but never defined
    anywhere in the file (it never raised only because, with the newline bug
    in the lexicons, the calling branch was unreachable).  An odd number of
    stacked negations flips the sentiment; an even number cancels out.
    """
    return 'odd' if num % 2 == 1 else 'even'


def sentiment_score_list(dataset):
    """Score one review, sentence by sentence.

    Splits ``dataset`` on the Chinese full stop, segments each sentence with
    jieba, and walks the tokens: each sentiment word's base score is scaled
    by the degree adverbs and negations found between it and the previous
    sentiment word.  Returns a nested list — one ``[[pos, neg]]`` entry per
    sentence — shaped for ``sentiment_score``.
    """
    seg_sentence = dataset.split('。')
    count1 = []
    count2 = []
    for sen in seg_sentence:  # iterate over each sentence of the review
        segtmp = jieba.lcut(sen, cut_all=False)  # tokenize into a word list
        print(segtmp)
        i = 0  # position of the word currently being scanned
        a = 0  # position just after the previous sentiment word
        poscount = 0   # positive word's initial score
        poscount2 = 0  # positive score after negation flip
        poscount3 = 0  # final positive score (incl. exclamation bonus)
        negcount = 0   # negative word's initial score
        negcount2 = 0  # negative score after negation flip
        negcount3 = 0  # final negative score (incl. exclamation bonus)
        for word in segtmp:
            if word in posdict:  # positive sentiment word
                poscount += 1
                c = 0
                for w in segtmp[a:i]:  # scan modifiers before the sentiment word
                    if w in mostdict:
                        poscount *= 2.0
                    elif w in verydict:
                        poscount *= 1.5
                    elif w in moredict:
                        poscount *= 1.25
                    elif w in ishdict:
                        poscount *= 1.2
                    elif w in insufficientdict:  # BUG FIX: was `word`, which tested the sentiment word itself
                        poscount *= 0.8
                    elif w in inversedict:  # BUG FIX: was `word`
                        poscount *= -1
                    elif w in deny_word:
                        c += 1
                if judgeodd(c) == 'odd':  # odd number of negations flips the sign
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                a = i + 1  # next modifier window starts after this sentiment word
            elif word in negdict:  # negative sentiment word; mirrors the branch above
                negcount += -1
                d = 0
                for w in segtmp[a:i]:
                    if w in mostdict:
                        negcount *= -2.0
                    elif w in verydict:
                        negcount *= -1.5
                    elif w in moredict:
                        negcount *= -1.25
                    elif w in ishdict:
                        negcount *= -1.2
                    elif w in insufficientdict:  # BUG FIX: was `word`
                        negcount *= -0.8  # BUG FIX: was clobbering `poscount`
                    elif w in deny_word:  # BUG FIX: was `degree_word`; `d` must count negations for judgeodd
                        d += 1
                if judgeodd(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                a = i + 1
            elif word == '!' or word == '!':  # exclamation mark (full- or half-width)
                # Look back for a sentiment word; if found, add an emphasis
                # bonus of +2 and stop scanning.
                for w2 in segtmp[::-1]:
                    # BUG FIX: was `w2 in posdict or negdict`, which is always
                    # truthy because a non-empty lexicon is truthy.
                    if w2 in posdict or w2 in negdict:
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scan position
        # Normalize so neither side of the pair ends up negative.
        pos_count = 0
        neg_count = 0
        if poscount3 < 0 and negcount3 > 0:
            neg_count += negcount3 - poscount3
            pos_count = 0
        elif negcount3 < 0 and poscount3 > 0:
            pos_count = poscount3 - negcount3
            neg_count = 0
        elif poscount3 < 0 and negcount3 < 0:
            neg_count = -poscount3
            pos_count = -negcount3
        else:
            pos_count = poscount3
            neg_count = negcount3
        count1.append([pos_count, neg_count])
        count2.append(count1)  # one [[pos, neg]] wrapper per sentence
        count1 = []
    return count2
def sentiment_score(senti_score_list):
    """Summarize per-sentence [pos, neg] pairs into one row per review.

    Each row is ``[Pos, Neg, AvgPos, AvgNeg, StdPos, StdNeg]``: column sums,
    then means and standard deviations rounded to one decimal place.
    """
    def one_decimal(value):
        # Round via the same "%.1f" string round-trip the original used.
        return float(f"{value:.1f}")

    score = []
    for review in senti_score_list:
        arr = np.array(review)
        pos_col = arr[:, 0]  # positive scores of every sentence
        neg_col = arr[:, 1]  # negative scores of every sentence
        score.append([
            np.sum(pos_col),
            np.sum(neg_col),
            one_decimal(np.mean(pos_col)),
            one_decimal(np.mean(neg_col)),
            one_decimal(np.std(pos_col)),
            one_decimal(np.std(neg_col)),
        ])
    return score
# Placeholder reviews — replace with real text before running.
data = '...'
data1 = '...'
for review_text in (data, data1):
    print(sentiment_score(sentiment_score_list(review_text)))