Python 对文章词频的统计
2021/8/13 17:06:00
本文主要介绍如何用 Python 统计文章中的词频,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
import os
import re
from collections import Counter

from nltk import ne_chunk, pos_tag, word_tokenize
import nltk
from docx import Document
import langid
import pandas as pd

# Folder scanned by readWord() when no directory is given.
_DEFAULT_ROOT_DIR = 'C:\\Users\\Administrator\\Desktop\\一季度'

# CSV file written by get_entities() when no path is given.
_DEFAULT_OUTPUT_CSV = 'c4i.csv'

# Penn Treebank tags for singular and plural proper nouns.
_PROPER_NOUN_TAGS = frozenset({"NNP", "NNPS"})

# Tokens that get tagged as proper nouns but are punctuation or stray
# fragments. An exact-match set is used instead of a regex: the earlier
# pattern was a non-raw string, so every "\b" was a backspace character and
# the filter could never match.
_NOISE_TOKENS = frozenset({
    "’", "”", "—", "[", "]", "…", "@", "/", "s", "|",
    "P", "•", "II", "R", "A", "“", "‘", "–",
})


def _is_docx_file(path):
    """Return True if *path* is a regular ``.docx`` file that is not a Word lock file."""
    name = os.path.basename(path)
    return (
        os.path.isfile(path)
        and name.lower().endswith(".docx")
        and not name.startswith("~$")
    )


def _count_proper_nouns(tagged_tokens):
    """Count proper-noun words in an iterable of ``(word, tag)`` pairs.

    A word is counted when its tag is NNP or NNPS and the word itself is not
    one of the known noise tokens.
    """
    return Counter(
        word
        for word, tag in tagged_tokens
        if tag in _PROPER_NOUN_TAGS and word not in _NOISE_TOKENS
    )


def readWord(rootdir=_DEFAULT_ROOT_DIR):
    """Return the English paragraphs of every ``.docx`` file in *rootdir*.

    Each paragraph whose text ``langid`` classifies as ``'en'`` is kept and
    terminated with a newline; all kept paragraphs are concatenated in the
    order they are read.

    Args:
        rootdir: Directory to scan (not recursive). Defaults to the folder
            the script was originally written for.

    Returns:
        The concatenated English text, or ``""`` if nothing qualified.
    """
    english_paragraphs = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        # Document() raises on anything that is not a .docx package, so skip
        # sub-directories, other file types and Word's "~$" lock files.
        if not _is_docx_file(path):
            continue
        print(path)
        document = Document(path)
        for paragraph in document.paragraphs:
            if langid.classify(paragraph.text)[0] == 'en':
                english_paragraphs.append(paragraph.text + "\n")
    # Join once instead of growing a string with "+=" for every paragraph.
    return "".join(english_paragraphs)


def get_entities(output_path=_DEFAULT_OUTPUT_CSV):
    """Count proper nouns in the text from ``readWord()`` and write them to CSV.

    The text is tokenized and POS-tagged with NLTK; every NNP/NNPS token that
    is not a noise token is counted. The result is written with the columns
    ``word`` and ``num``.

    Args:
        output_path: Destination CSV path. Defaults to ``'c4i.csv'``.
    """
    sentence = readWord()
    # Named-entity chunking only groups tokens into subtrees and leaves the
    # (word, tag) pairs unchanged, so the flat tagger output is counted
    # directly. Iterating the chunk tree previously skipped two-token
    # entities and counted only the first token of the others.
    counts = _count_proper_nouns(pos_tag(word_tokenize(sentence)))

    # One row per word so pandas can write the counts as a table.
    rows = [{"word": word, "num": num} for word, num in counts.items()]
    # Explicit columns keep the CSV header present even when rows is empty.
    frame = pd.DataFrame(rows, columns=["word", "num"])
    frame.to_csv(output_path, encoding='utf_8_sig')


if __name__ == '__main__':
    get_entities()
使用的依赖库如下所示(代码中还导入了 nltk、langid 和 pandas,原文未给出它们的版本号):
python-docx==0.8.11
这篇关于 Python 对文章词频的统计的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2024-12-24 Python编程入门指南
- 2024-12-24 Python编程基础入门
- 2024-12-24 Python编程基础:变量与数据类型
- 2024-12-23 使用 Python 部署一个 USDT 合约,部署自己的 USDT 稳定币
- 2024-12-20 Python编程入门指南
- 2024-12-20 Python编程基础与进阶
- 2024-12-19 Python基础编程教程
- 2024-12-19 Python 文件的后缀名是什么?怎么运行一个 Python 文件?-icode9专业技术文章分享
- 2024-12-19 使用 Python 把 docx 转为 pdf 文件有哪些方法?-icode9专业技术文章分享
- 2024-12-19 Python 怎么更换 pip 的源镜像?-icode9专业技术文章分享