node2vec之小黑尝试
2022/1/10 17:13:24
本文主要是介绍node2vec之小黑尝试,对大家解决编程问题具有一定的参考价值,需要的程序猿们随着小编来一起学习吧!
参数设定
import warnings import random warnings.filterwarnings('ignore') import argparse import numpy as np import networkx as nx #import node2vec from gensim.models import Word2Vec import random np.random.seed(1) def parse_args(): ''' Parses the node2vec arguments. ''' # 使用parser加载信息 parser = argparse.ArgumentParser(description="Run node2vec.") # 输入文件 parser.add_argument('--input', nargs='?', default='../graph/karate.edgelist', help='Input graph path') # 输出文件 parser.add_argument('--output', nargs='?', default='../emb/karate.emb', help='Embeddings path') # embedding维度 parser.add_argument('--dimensions', type=int, default=128, help='Number of dimensions. Default is 128.') # 节点序列长度 parser.add_argument('--walk-length', type=int, default=80, help='Length of walk per source. Default is 80.') # 随机游走的次数 parser.add_argument('--num-walks', type=int, default=10, help='Number of walks per source. Default is 10.') # word2vec窗口大小,word2vec参数 parser.add_argument('--window-size', type=int, default=10, help='Context size for optimization. Default is 10.') # SGD优化时epoch数量,word2vec参数 parser.add_argument('--iter', default=10, type=int, help='Number of epochs in SGD') # 并行化核数,word2vec参数 parser.add_argument('--workers', type=int, default=8, help='Number of parallel workers. Default is 8.') # 参数p parser.add_argument('--p', type=float, default=1, help='Return hyperparameter. Default is 1.') # 参数q parser.add_argument('--q', type=float, default=1, help='Inout hyperparameter. Default is 1.') # 权重 parser.add_argument('--weighted', dest='weighted', action='store_true', help='Boolean specifying (un)weighted. Default is unweighted.') parser.add_argument('--unweighted', dest='unweighted', action='store_false') parser.set_defaults(weighted=False) # 有向无向 parser.add_argument('--directed', dest='directed', action='store_true', help='Graph is (un)directed. Default is undirected.') parser.add_argument('--undirected', dest='undirected', action='store_false') parser.set_defaults(directed=False) return parser.parse_args(args=[]) # return parser.parse_known_args() def read_graph(file): graph_x = nx.read_edgelist(file,nodetype = int,create_using = nx.DiGraph()) for edge in graph_x.edges(): graph_x[edge[0]][edge[1]]['weight'] = 1 # 无向操作 if args.undirected: graph_x = graph_x.to_undirected() return graph_x args = parse_args() nx_G = read_graph(args.input) nx.draw(nx_G,with_labels = True) list(nx_G.neighbors(25))
[32, 28, 26]
采样算法
def alias_setup(probs): smaller = [] larger = [] Q = np.zeros(len(probs),dtype = int) P = np.zeros(len(probs)) probs = [prob / sum(probs) for prob in probs] for t,prob in enumerate(probs): P[t] = prob * len(probs) if P[t] > 1: larger.append(t) else: smaller.append(t) # 准备开始采样 while larger and smaller: large = larger.pop() small = smaller.pop() Q[small] = large P[large] -= (1 - P[small]) if P[large] < 1: smaller.append(large) else: larger.append(large) return Q,P #J,q = alias_setup([1/2,1/3,1/12,1/12]) def alias_draw(J, q): length = len(J) PP = int(np.floor(np.random.rand() * length)) QQ = int(np.floor(np.random.rand())) if QQ <= q[PP]: return PP else: return J[QQ]
定义模型的Graph类
class Graph(object): def __init__(self,nx_G,is_directed, p, q): self.nx_G = nx_G self.is_directed = is_directed self.p = p self.q = q def get_alias_edge(self, src, dst): nx_G = self.nx_G p = self.p q = self.q unnormalized_probs = [] for node in sorted(nx_G.neighbors(dst)): if node == src: prob = nx_G[dst][node]['weight'] / p elif nx_G.has_edge(node,src): prob = nx_G[dst][node]['weight'] else: prob = nx_G[dst][node]['weight'] / q unnormalized_probs.append(prob) normalized_probs = [float(prob) / sum(unnormalized_probs) for prob in unnormalized_probs] #print(normalized_probs) return alias_setup(normalized_probs) def preprocess_transition_probs(self): is_directed = self.is_directed nx_G = self.nx_G # 结点 的采样映射 alias_nodes = {} for node in nx_G.nodes(): unnormalized_probs = [nx_G[node][neighbor]['weight'] for neighbor in sorted(nx_G.neighbors(node))] probs = [prob / sum(unnormalized_probs) for prob in unnormalized_probs] alias_nodes[node] = alias_setup(probs) # 边的采样映射 alias_edges = {} for src,tgt in nx_G.edges(): unnormalized_probs = self.get_alias_edge(src,tgt) alias_edges[(src,tgt)] = unnormalized_probs if not is_directed: unnormalized_probs = self.get_alias_edge(tgt,src) alias_edges[(tgt,src)] = unnormalized_probs self.alias_nodes = alias_nodes self.alias_edges = alias_edges return alias_nodes,alias_edges def node2vec_walk(self, walk_length, start_node): nx_G = self.nx_G alias_nodes = self.alias_nodes alias_edges = self.alias_edges walk = [start_node] while len(walk) < walk_length: cur = walk[-1] neighbors = sorted(nx_G.neighbors(cur)) if neighbors: if len(walk) == 1: # 只有一个结点的话就对点采样 q,p = alias_nodes[cur] sample_index = alias_draw(q,p) walk.append(neighbors[sample_index]) else: # 超过两个点,就进行边采样 pre = walk[-2] q,p = alias_edges[(pre,cur)] sample_index = alias_draw(q,p) walk.append(neighbors[sample_index]) else: break return walk def simulate_walks(self, num_walks, walk_length): nx_G = self.nx_G nodes = list(nx_G.nodes()) walks = [] for t in range(num_walks): random.shuffle(nodes) #print('epoch {}'.format(t)) for node in nodes: walks.append(self.node2vec_walk(walk_length,node)) return walks
word2vec训练部分代码
def learn_embeddings(walks): # 将node的类型int转化为string walk_lol = [] for walk in walks: tmp = [] for node in walk: tmp.append(str(node)) walk_lol.append(tmp) model = Word2Vec(walk_lol, size = 2, window = args.window_size, min_count = 0, sg = 1, workers = args.workers, iter = args.iter) model.wv.save_word2vec_format(args.output) return model
聚类
# k-means聚类 from sklearn import cluster from sklearn.metrics import adjusted_rand_score from sklearn.model_selection import train_test_split import pandas as pd def draw_cluster(p,q,pos): g = Graph(nx_G,False,p,q) g.preprocess_transition_probs() walks = g.simulate_walks(40,40) # 训练node2vec模型 model = learn_embeddings(walks) # 导入节点名称,获取embedding embedding_node=[] for i in range(1,35): j=str(i) embedding_node.append(model[j]) embedding_node=np.matrix(embedding_node).reshape((34,-1)) y_pred = cluster.KMeans(n_clusters=4).fit_predict(embedding_node) # 调用 test_RandomForestClassifier print(y_pred) pos = nx.spring_layout(nx_G) nx.draw_networkx_nodes(nx_G,pos,node_color = y_pred,label = True) nx.draw_networkx_edges(nx_G,pos,nodelist = nx_G.edges())
实验效果
# p小 q大,偏向宽度优先搜索,模型更具有同构(注意黄色点,起着桥接的属性!!!) p = 0.1 q = 20 draw_cluster(p,q)
[2 2 3 2 1 1 1 2 3 3 1 2 2 2 0 0 1 2 0 2 0 2 0 0 0 0 0 0 3 0 3 3 0 0]
# p大 q小,偏向深度优先搜索,模型更具有社群属性 p = 20 q = 0.1 draw_cluster(p,q)
[0 0 3 0 2 2 2 0 3 3 2 0 0 0 1 1 2 0 1 0 1 0 1 1 1 1 1 1 3 1 3 3 1 1]
这篇关于node2vec之小黑尝试的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!
- 2025-01-10Rakuten 乐天积分系统从 Cassandra 到 TiDB 的选型与实战
- 2025-01-09CMS内容管理系统是什么?如何选择适合你的平台?
- 2025-01-08CCPM如何缩短项目周期并降低风险?
- 2025-01-08Omnivore 替代品 Readeck 安装与使用教程
- 2025-01-07Cursor 收费太贵?3分钟教你接入超低价 DeepSeek-V3,代码质量逼近 Claude 3.5
- 2025-01-06PingCAP 连续两年入选 Gartner 云数据库管理系统魔力象限“荣誉提及”
- 2025-01-05Easysearch 可搜索快照功能,看这篇就够了
- 2025-01-04BOT+EPC模式在基础设施项目中的应用与优势
- 2025-01-03用LangChain构建会检索和搜索的智能聊天机器人指南
- 2025-01-03图像文字理解,OCR、大模型还是多模态模型?PalliGema2在QLoRA技术上的微调与应用