# 2021/6/20 20:26:54
"""Dictionary-based sentiment scoring of travel notes plus word-cloud output.

Pipeline (see ``main``): load notes from an Excel sheet, cut a context window
around every keyword hit, score each window with positive / negative /
negation word lists, append a summary to the output file and finally render a
word cloud from the collected windows.

Original author's note: "process two hundred at a time" (no batch limit is
enforced anywhere in this file).
"""
import configparser  # config file loading
import difflib  # similarity comparison
import functools
import os
import re  # special-character filtering
from collections import Counter

import emoji  # emoji handling
import imageio  # images
import jieba  # word segmentation
import jieba.analyse  # keyword weights
import jieba.posseg as pseg  # part-of-speech tagging
import matplotlib.pyplot as plt  # canvas
import numpy as np  # helpers
import pandas as pd  # data handling
from wordcloud import WordCloud  # word cloud

# Everything except CJK ideographs, ASCII digits and ASCII letters.
_NON_WORD_RE = re.compile(
    "[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]"
)
# Characters of context kept on each side of a keyword hit.
_CONTEXT_CHARS = 20
# Windows more similar than this to an already kept window are dropped.
_SIMILARITY_THRESHOLD = 0.85


@functools.lru_cache(maxsize=1)
def cfg():
    """Return the ``filePaths`` section of ``config.ini`` as a dict.

    The file is parsed once and the result is cached, because this function
    is called many times during a run. Callers must treat the returned dict
    as read-only.

    Note that ``ConfigParser`` lower-cases option names, so every key of the
    returned dict is lower case regardless of how it is spelled in the file.

    Raises:
        configparser.NoSectionError: if ``config.ini`` is missing or has no
            ``filePaths`` section.
    """
    conf = configparser.ConfigParser()
    conf.read("config.ini", encoding="utf-8")
    return dict(conf.items("filePaths"))


def wts_dict():
    """Load the keyword list used to cut context windows out of each note.

    Returns:
        list[str]: one keyword per non-empty line of ``wts_dict_path``.
    """
    # utf-8-sig also reads BOM-less files and keeps a BOM out of keyword #1.
    with open(cfg()["wts_dict_path"], encoding="utf-8-sig") as f:
        stripped = (line.replace("\n", "").replace("\r", "") for line in f)
        # An empty keyword would match at every position of every note.
        return [keyword for keyword in stripped if keyword]


def dict_load(path):
    """Read a word-list file and return its non-blank lines, stripped.

    Args:
        path: path of a UTF-8 (optionally BOM-prefixed) text file.

    Returns:
        list[str]: the stripped, non-empty lines in file order.
    """
    print("文件加载!")
    with open(path, encoding="utf-8-sig") as f:
        return [line.strip() for line in f if line.strip() != ""]


def sents(sent, negdict, posdict, nodict):
    """Score one tokenised sentence.

    For every token found in ``negdict`` or ``posdict`` (``negdict`` wins if
    a token is in both) the two preceding tokens are checked against
    ``nodict``:

    * exactly one of them is a negation word -> the polarity is flipped,
      weight 1;
    * both are negation words -> polarity kept, weight 1;
    * neither is -> polarity kept, weight 1.5 when at least two tokens
      precede the word, otherwise weight 1.

    Args:
        sent: sequence of tokens.
        negdict: container of negative words.
        posdict: container of positive words.
        nodict: container of negation words.

    Returns:
        tuple: ``(pos, neg)`` accumulated positive and negative scores.
    """
    pos = 0
    neg = 0
    for i, token in enumerate(sent):
        if token in negdict:
            is_negative = True
        elif token in posdict:
            is_negative = False
        else:
            continue

        negations = sum(1 for prev in sent[max(0, i - 2):i] if prev in nodict)
        # A word at index 0 used to fall through every branch and was
        # silently ignored; it is now counted like any un-negated word.
        weight = 1.5 if (negations == 0 and i > 1) else 1
        if negations == 1:
            is_negative = not is_negative

        if is_negative:
            neg = neg + weight
        else:
            pos = pos + weight
    return pos, neg


def filter_emoji(test_str):
    """Round-trip ``test_str`` through ``emoji.demojize`` / ``emoji.emojize``.

    NOTE(review): despite the name this does not remove anything, it only
    converts emoji to their text aliases and back. Left unchanged; confirm
    whether actual removal was intended.
    """
    return emoji.emojize(emoji.demojize(test_str))


def pretreatment():
    """Load the notes from the Excel sheet as a list of strings.

    Reads ``excel_path``, keeps the ``Title`` and ``Notes`` columns, drops
    rows with a missing value in either column and rows whose ``Notes`` value
    repeats an earlier one.

    Returns:
        list[str]: the ``Notes`` values passed through ``filter_emoji``.
    """
    excel = pd.read_excel(cfg()["excel_path"])
    notes = (
        excel[["Title", "Notes"]]
        .dropna(how="any")
        .drop_duplicates(subset="Notes")
    )
    return [filter_emoji(str(note)) for note in notes["Notes"]]


def first_Load():
    """Load the three sentiment word lists.

    Returns:
        tuple: ``(pos_dict, neg_dict, no_dict)`` - positive words, negative
        words and negation words, each as a list of strings.
    """
    pos_dict = dict_load(cfg()["pos_dict_path"])
    neg_dict = dict_load(cfg()["neg_dict_path"])
    no_dict = dict_load(cfg()["no_dict_path"])
    return pos_dict, neg_dict, no_dict


def comment_base_split(wts_lst, comment_base):
    """Cut a context window around every keyword occurrence in a note.

    Args:
        wts_lst: iterable of keywords (matched literally, not as regexes).
        comment_base: the note text.

    Returns:
        list[str]: the windows in order of discovery; a window whose
        similarity to an already kept window exceeds 0.85 is dropped.
    """
    windows = []
    for keyword in wts_lst:
        if not keyword or keyword not in comment_base:
            continue
        # Escape so keywords containing regex metacharacters match literally.
        for match in re.finditer(re.escape(keyword), comment_base):
            start, end = match.span()
            # Clamp at 0: a negative start would wrap around in the slice.
            start = max(0, start - _CONTEXT_CHARS)
            # Kept from the original: hits ending within the first ten
            # characters get a fixed end offset instead of end + context.
            end = _CONTEXT_CHARS if end < 10 else end + _CONTEXT_CHARS
            windows.append(comment_base[start:end])

    # Build a new list instead of removing from the list being iterated,
    # which skipped elements and deleted windows that had no duplicate.
    kept = []
    for window in windows:
        if all(
            get_equal_rate_1(window, other) <= _SIMILARITY_THRESHOLD
            for other in kept
        ):
            kept.append(window)
    return kept


def get_equal_rate_1(str1, str2):
    """Return ``SequenceMatcher.quick_ratio()`` for the two strings.

    ``quick_ratio`` is an upper bound on the true similarity ratio.
    """
    return difflib.SequenceMatcher(None, str1, str2).quick_ratio()


def batchProcessing():
    """Score every note and append the aggregated result to the output file.

    Each cleaned context window is also appended, one per line, to
    ``juzi_path`` for the later word-cloud step.
    """
    # Wutai Mountain keyword dictionary.
    wts_lst = wts_dict()
    pos_dict, neg_dict, no_dict = first_Load()
    # Sets give O(1) membership tests inside the scoring loop.
    pos_words = frozenset(pos_dict)
    neg_words = frozenset(neg_dict)
    no_words = frozenset(no_dict)

    # Pre-set so an empty input no longer raises NameError further down.
    comment_base = ""
    seen = set()
    unique_splits = []
    for raw in pretreatment():
        comment_base = raw.replace("\n", "").replace("\r", "").replace(" ", "")
        pieces = comment_base_split(wts_lst, comment_base)
        key = tuple(pieces)
        if key not in seen:
            seen.add(key)
            unique_splits.append(pieces)

    pos_lst = []
    neg_lst = []
    # Opened once instead of once per window.
    with open(cfg()["juzi_path"], "a", encoding="utf-8") as f:
        for pieces in unique_splits:
            for piece in pieces:
                sub_str = _NON_WORD_RE.sub("", "".join(piece.split()))
                sent = jieba.lcut(sub_str)
                # Arguments now follow the declared parameter order. The
                # original swapped them here and again in the outputFile
                # call, which cancelled out; the written values are the same.
                pos, neg = sents(sent, neg_words, pos_words, no_words)
                pos_lst.append(pos)
                neg_lst.append(neg)
                f.write(sub_str + "\n")

    # NOTE(review): only the last note's text is written as the record
    # header while the scores cover all notes - kept from the original,
    # confirm intent.
    outputFile([comment_base], pos_lst, neg_lst)


def outputFile(commentStr, pos, neg):
    """Append one result record to the ``out`` file.

    Args:
        commentStr: iterable of strings written verbatim as the record header.
        pos: iterable of positive scores.
        neg: iterable of negative scores.
    """
    pos_total = sum(pos)
    neg_total = sum(neg)
    # Difference, not sum: both totals are non-negative, so with "+" the
    # negative branch below could never be reached.
    num = pos_total - neg_total
    if num > 0:
        verdict = "情感倾向:积极"
    elif num < 0:
        verdict = "情感倾向:消极"
    else:
        verdict = "情感倾向:中性"

    with open(cfg()["out"], "a", encoding="utf-8") as f:
        f.writelines(commentStr)
        f.write("\n积极倾向值:{}".format(pos_total) + "\n")
        f.write("消极倾向值:{}".format(neg_total) + "\n")
        f.write(verdict + "\n")
        f.write("-" * 100 + "\n")


def toWordCloud():
    """Render a word cloud from the text file at ``path``.

    Words of length 1 and stop words are ignored. The 101 most frequent
    words are printed, the image is written to ``wordcloud_ioPath`` and drawn
    on a matplotlib figure (the figure is not shown by this function).
    """
    with open(cfg()["cn_stopwords"], "r", encoding="utf-8") as fr:
        # "\ufeff" is the BOM; the original "\ufeef" never matched anything.
        stop_words = {line.replace("\ufeff", "").strip() for line in fr}

    # The file is written with utf-8-sig, so read it the same way to keep
    # the BOM out of the first word.
    with open(cfg()["path"], "r", encoding="utf-8-sig") as f:
        words = f.read()

    kept = [
        word
        for line in words.split("\n")
        for word in jieba.lcut(line)
        if len(word) > 1 and word not in stop_words
    ]
    word_list = " ".join(kept)

    # Sorted by frequency, descending; ties keep first-seen order.
    sort_words = Counter(kept).most_common()
    print(sort_words[0:101])

    wc = WordCloud(
        background_color="white",
        max_words=1000,
        font_path=cfg()["simsun_path"],
        min_font_size=20,
        max_font_size=500,
        random_state=42,
        collocations=False,  # avoid repeated word pairs
        width=1600,
        height=1200,
        margin=10,
    )
    wc.generate(word_list)
    # ConfigParser lower-cases option names, so the option written as
    # "wordcloud_ioPath" in config.ini is stored under this key.
    wc.to_file(cfg()["wordcloud_iopath"])

    plt.figure(dpi=100)
    plt.imshow(wc, interpolation="catrom", vmax=1000)
    plt.axis("off")


def main():
    """Run the sentiment batch, then build the word cloud from its windows."""
    batchProcessing()
    with open(cfg()["juzi_path"], encoding="utf-8-sig") as f:
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        unique_lines = list(dict.fromkeys(f))
    with open(cfg()["path"], "a", encoding="utf-8-sig") as f2:
        f2.writelines(unique_lines)
    toWordCloud()


if __name__ == "__main__":
    main()
