Graph Algorithms: HITS & PageRank
From the column 數據分析俠
Graphs play an important role in relationship networks: searching for high-quality content, finding tightly connected groups of people, and more.
Python 3 has a good third-party graph package; you only need to install python-graph-core.
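As a quick orientation before the two scripts, here is a minimal sketch of the pygraph calls they rely on (the three-node graph and its node names are only placeholders for illustration): digraph() builds a directed graph, neighbors() returns a node's outgoing links, and incidents() returns its incoming links.

# Minimal sketch of the pygraph API used below
# (assumes python-graph-core is installed: pip install python-graph-core)
from pygraph.classes.digraph import digraph

g = digraph()
g.add_nodes(["A", "B", "C"])   # nodes must be registered before edges
g.add_edge(("A", "B"))         # directed edge A -> B
g.add_edge(("C", "B"))         # directed edge C -> B

print(g.neighbors("A"))   # outgoing links of A, i.e. ["B"]
print(g.incidents("B"))   # incoming links of B, i.e. "A" and "C"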
Python code:
HITS:
# First install the package: pip install python-graph-core
# encoding: utf8
from pygraph.classes.digraph import digraph
import math


class HITSIterator:
    __doc__ = "Compute the hub and authority value of every node in a graph"

    def __init__(self, dg):
        self.max_iterations = 100  # maximum number of iterations
        self.min_delta = 0.0001    # threshold that decides when the iteration stops
        self.graph = dg
        self.hub = {}
        self.authority = {}
        for node in self.graph.nodes():
            self.hub[node] = 1
            self.authority[node] = 1

    def hits(self):
        """Compute the hub and authority value of every page."""
        if not self.graph:
            return
        flag = False
        for i in range(self.max_iterations):
            change = 0.0  # total change in this round
            norm = 0      # normalization factor

            # compute the authority value of every page
            tmp = self.authority.copy()
            for node in self.graph.nodes():
                self.authority[node] = 0
                for incident_page in self.graph.incidents(node):  # iterate over all incoming pages
                    self.authority[node] += self.hub[incident_page]
                norm += pow(self.authority[node], 2)
            # normalize
            norm = math.sqrt(norm)
            for node in self.graph.nodes():
                self.authority[node] /= norm
                change += abs(tmp[node] - self.authority[node])

            # compute the hub value of every page
            norm = 0
            tmp = self.hub.copy()
            for node in self.graph.nodes():
                self.hub[node] = 0
                for neighbor_page in self.graph.neighbors(node):  # iterate over all outgoing pages
                    self.hub[node] += self.authority[neighbor_page]
                norm += pow(self.hub[node], 2)
            # normalize
            norm = math.sqrt(norm)
            for node in self.graph.nodes():
                self.hub[node] /= norm
                change += abs(tmp[node] - self.hub[node])

            if change < self.min_delta:
                flag = True
                break

        if flag:
            print("finished in %s iterations!" % (i + 1))
        else:
            print("finished out of 100 iterations!")
        print("The best authority page: ", max(self.authority.items(), key=lambda x: x[1]))
        print("The best hub page: ", max(self.hub.items(), key=lambda x: x[1]))


if __name__ == "__main__":
    dg = digraph()
    dg.add_nodes(["A", "B", "C", "D", "E"])
    dg.add_edge(("A", "C"))  # A -> C
    dg.add_edge(("A", "D"))  # A -> D
    dg.add_edge(("B", "D"))
    dg.add_edge(("C", "E"))
    dg.add_edge(("D", "E"))
    dg.add_edge(("B", "E"))  # B -> E
    dg.add_edge(("E", "A"))
    hits = HITSIterator(dg)
    hits.hits()
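For reference, the loop above implements the standard HITS updates: a node's authority score is the sum of the hub scores of the pages that link to it, its hub score is the sum of the authority scores of the pages it links to, and both score vectors are L2-normalized after every pass:

a(p) = \sum_{q \to p} h(q), \qquad h(p) = \sum_{p \to q} a(q), \qquad a \leftarrow \frac{a}{\lVert a \rVert_2}, \quad h \leftarrow \frac{h}{\lVert h \rVert_2}

The iteration stops once the total change of both vectors in one pass falls below min_delta, or after max_iterations passes.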
PageRank:
# -*- coding: utf-8 -*-
"""2018-03"""
from pygraph.classes.digraph import digraph


class PRIterator:
    __doc__ = "Compute the PageRank value of every node in a graph"

    def __init__(self, dg):
        self.damping_factor = 0.85  # damping factor, i.e. alpha
        self.max_iterations = 100   # maximum number of iterations
        self.min_delta = 0.00001    # threshold that decides when the iteration stops, i.e. epsilon
        self.graph = dg

    def page_rank(self):
        # first, give every node without outgoing links an edge to every node
        for node in self.graph.nodes():
            if len(self.graph.neighbors(node)) == 0:
                for node2 in self.graph.nodes():
                    self.graph.add_edge((node, node2))

        nodes = self.graph.nodes()
        graph_size = len(nodes)
        if graph_size == 0:
            return {}

        page_rank = dict.fromkeys(nodes, 1.0 / graph_size)        # initial PR value of every node
        damping_value = (1.0 - self.damping_factor) / graph_size  # the (1 - alpha) / N term of the formula

        flag = False
        for i in range(self.max_iterations):
            change = 0
            for node in nodes:
                rank = 0
                for incident_page in self.graph.incidents(node):  # iterate over all incoming pages
                    rank += self.damping_factor * (page_rank[incident_page] /
                                                   len(self.graph.neighbors(incident_page)))
                rank += damping_value
                change += abs(page_rank[node] - rank)  # absolute difference
                page_rank[node] = rank

            print("This is NO.%s iteration" % (i + 1))
            print(page_rank)

            if change < self.min_delta:
                flag = True
                break

        if flag:
            print("finished in %s iterations!" % (i + 1))
        else:
            print("finished out of 100 iterations!")
        return page_rank


if __name__ == "__main__":
    dg = digraph()
    dg.add_nodes(["A", "B", "C", "D", "E"])
    dg.add_edge(("A", "B"))
    dg.add_edge(("A", "C"))
    dg.add_edge(("A", "D"))
    dg.add_edge(("B", "D"))
    dg.add_edge(("C", "E"))
    dg.add_edge(("D", "E"))
    dg.add_edge(("B", "E"))
    dg.add_edge(("E", "A"))
    pr = PRIterator(dg)
    page_ranks = pr.page_rank()
    print("The final page rank is\n", page_ranks)