2021/10/30 23:19:09
# 得到 wordnet 本身:
from nltk.corpus import wordnet

# 获得一个词的所有 sense,包括词语的各种变形的 sense:
wordnet.synsets('published')
>>> [Synset('print.v.01'), Synset('publish.v.02'), Synset('publish.v.03'), Synset('published.a.01'), Synset('promulgated.s.01')]

# 得到一个 sense 的所有 lemma(在 NLTK 3.x 中 lemmas 是方法,需要加括号调用):
wordnet.synsets('publish')[0].lemmas()
>>> [Lemma('print.v.01.print'), Lemma('print.v.01.publish')]

# 得到 Lemma 出现的次数:
wordnet.synsets('publish')[0].lemmas()[1].count()

# 在 wordnet 中,名词和动词被组织成了完整的层次式分类体系,因此可以计算两个 sense 在分类树中的距离,这个距离反映了它们的语义相似度:
x = wordnet.synsets('recommended')[-1]
y = wordnet.synsets('suggested')[-1]
x.shortest_path_distance(y)
>>> 0

# 形容词和副词的相似度计算方法:
# 形容词和副词没有被组织成分类体系,所以不能用 path_distance。
a = wordnet.synsets('beautiful')[0]
b = wordnet.synsets('good')[0]
a.shortest_path_distance(b)
>>> -1

# 形容词和副词最有用的关系是 similar to。
a = wordnet.synsets('glorious')[0]
a.similar_tos()
>>> [Synset('incandescent.s.02'), Synset('divine.s.06'), ……]
(1)独立起始概念(Unique Beginner)
如果有一同义词集合(即概念)没有上位同义词集合(即上位概念),则称之为独立起始概念(Unique Beginner)。在WordNet名词体系中,共有25个独立起始概念。其他名词通过上位/下位关系与这25个独立起始概念构成25个独立的层次结构。也就是说,标识着某个起始概念特点的属性将被它的所有下位概念所继承,而这个起始概念就可以看作是该语义领域内所有概念(同义词集合)的一个原始语义元素。
(2) 词典编撰ID(Lexicographer ID)
每一个同义词集合(synonymy set)均有惟一的一个编号,这个编号就称为词典编撰ID(Lexicographer ID)。
(3) 概念链(Concept Chain)
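在WordNet名词体系中,我们定义概念链(Concept Chain)如下:概念链是由上位/下位关系依次连接起来的一串概念(同义词集合),记作 $C_1 < C_2 < \cdots < C_n$,其中 $C_i < C_{i+1}$ 表示 $C_{i+1}$ 是 $C_i$ 的上位概念;$C_1$ 称为概念链的末端,$C_n$ 称为概念链的首端。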
概念链的首端对应的就是WordNet中的独立起始概念。比如:概念链ch1可以表示为:(3255461)<(2681909)<(3289024)<(3174243)<(3443493)<(19244)<(2645)<(16236)<(1740)。其中(3255461)作为概念链的末端代表的是词“football”的一个义项,而(1740)是WordNet中的独立起始概念,成为概念链的首端。概念“game equipment”(3289024)是概念“ball”(2681909)的上层概念,表达的语义更抽象。
1、在 nltk.corpus 中,几乎所有的语料库对象(包括 wordnet)都是 LazyCorpusLoader 类的实例。
2、关于Class LazyCorpusLoader
import gc
import re
import types

import nltk

# When True, __load() looks for the zipped corpus before the unpacked one.
TRY_ZIPFILE_FIRST = False


class LazyCorpusLoader:
    """
    Stand-in for a corpus reader that defers loading until first use.

    The first access to an attribute that this proxy does not define
    triggers ``__load()``, which locates the corpus with
    ``nltk.data.find``, builds the real reader and then turns this
    object into that reader by replacing its own ``__dict__`` and
    ``__class__``.

    :param name: The name of the corpus
    :type name: str
    :param reader_cls: The specific CorpusReader class, e.g.
        PlaintextCorpusReader, WordListCorpusReader
    :type reader_cls: nltk.corpus.reader.api.CorpusReader
    :param nltk_data_subdir: The subdirectory where the corpus is stored
        (keyword only; defaults to ``"corpora"``).
    :type nltk_data_subdir: str
    :param args: Any other non-keyword arguments that `reader_cls`
        might need.
    :param kwargs: Any other keyword arguments that `reader_cls`
        might need.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        """Record what is needed to build the reader later; load nothing."""
        from nltk.corpus.reader.api import CorpusReader

        assert issubclass(reader_cls, CorpusReader)
        self.__name = self.__name__ = name
        self.__reader_cls = reader_cls
        # "nltk_data_subdir" is consumed here so that it is not forwarded
        # to reader_cls; without it the corpus is looked up in "corpora".
        self.subdir = kwargs.pop("nltk_data_subdir", "corpora")
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        """
        Locate the corpus, build the reader and become that reader.

        :raises LookupError: if neither the unpacked nor the zipped
            location is found; the error of the first attempt is the
            one that is re-raised.
        """
        # Find the corpus root directory.  For a name such as "wordnet"
        # the zipped location is "wordnet.zip/wordnet/".
        zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name)
        plain_path = f"{self.subdir}/{self.__name}"
        zip_path = f"{self.subdir}/{zip_name}"
        if TRY_ZIPFILE_FIRST:
            first_choice, second_choice = zip_path, plain_path
        else:
            first_choice, second_choice = plain_path, zip_path

        try:
            root = nltk.data.find(first_choice)
        except LookupError as first_error:
            try:
                root = nltk.data.find(second_choice)
            except LookupError:
                # Report the failure of the preferred location.
                raise first_error

        # Load the corpus.
        corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)

        # This is where the magic happens!  Transform ourselves into
        # the corpus by modifying our own __dict__ and __class__ to
        # match that of the corpus.

        # Keep the constructor arguments in local variables first: once
        # __dict__ is replaced they are no longer reachable through self,
        # and the _unload closure below needs them.
        args, kwargs = self.__args, self.__kwargs
        name, reader_cls = self.__name, self.__reader_cls

        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

        # _unload support: assign __dict__ and __class__ back, then do GC.
        # after reassigning __dict__ there shouldn't be any references to
        # corpus data so the memory should be deallocated after gc.collect()
        def _unload(self):
            lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
            self.__dict__ = lazy_reader.__dict__
            self.__class__ = lazy_reader.__class__
            gc.collect()

        self._unload = _make_bound_method(_unload, self)

    def __getattr__(self, attr):
        """Load the corpus on first use, then delegate the lookup to it."""
        # Fix for inspect.isclass under Python 2.6
        # (see http://bugs.python.org/issue1225107).
        # Without this fix tests may take extra 1.5GB RAM
        # because all corpora gets loaded during test collection.
        if attr == "__bases__":
            raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")

        self.__load()
        # This looks circular, but its not, since __load() changes our
        # __class__ to something new:
        return getattr(self, attr)

    def __repr__(self):
        """Describe the not-yet-loaded corpus without triggering a load."""
        # Show the subdirectory actually used for the lookup rather than
        # a hard-coded "corpora", so custom nltk_data_subdir values are
        # reported correctly.
        return "<{} in {!r} (not loaded yet)>".format(
            self.__reader_cls.__name__,
            f".../{self.subdir}/{self.__name}",
        )

    def _unload(self):
        """Do nothing: there is nothing to unload before the corpus is loaded."""
        # If an exception occurs during corpus loading then
        # '_unload' method may be unattached, so __getattr__ can be called;
        # we shouldn't trigger corpus loading again in this case.
        pass


def _make_bound_method(func, self):
    """
    Bind the plain function *func* to the instance *self* (used for _unload).

    :param func: function taking the instance as its first argument
    :param self: object the function is bound to
    :return: a bound method that calls ``func(self, ...)``
    """
    return types.MethodType(func, self)
- 2025-01-10 2025 蛇年,J 人直播带货内容审核团队必备的办公软件有哪 6 款?
- 2025-01-10 高效运营背后的支柱:文档管理优化指南
- 2025-01-10 年末压力山大?试试优化你的文档管理
- 2025-01-10 跨部门协作中的进度追踪重要性解析
- 2025-01-10 总结 JavaScript 中的变体函数调用方式
- 2025-01-10 HR团队如何通过数据驱动提升管理效率?6个策略
- 2025-01-10 WBS实战指南:如何一步步构建高效项目管理框架?
- 2025-01-10 实现精准执行:团队协作新方法
- 2025-01-10 如何使用工具提升活动策划团队的工作效率?几个必备工具推荐
- 2025-01-10 WiX 标签使用介绍:打造专业安装程序的利器