分享一份我寫的後綴樹Ukkonen構造演算法代碼

上一篇寫後綴數組DC3的文章,我太監了。生活真的很累。我遠還沒有達到對自己的預期,還在努力奮鬥,沒有什麼時間去做很詳細的解釋。這篇也同樣。下班之後的所有時間都在寫C++,看分散式的書籍和課程。各位有什麼這方面的捷徑和優秀資源,也請告訴我,萬分感謝!我最近在看的是「Distributed Systems Principles and Paradigms」,感謝俄國人!

from typing import Dict # 經過上百次實踐, Python 3 的 typing 還是很難用啊!# 因為主要目的是 Demo, 這裡用一個全局變數表示當前已插入的 stringcurr_str = class STNode: default_suffix_link: STNode = None # 之後將會設置為 root def __init__(self): self.content_: Dict[str, STNode] = {} # 通往子節點的字典 self.op: int = None self._ed: int = None self.suffix_link: STNode = STNode.default_suffix_link self.parent: STNode = None # 需要 parent 在 split 的時候來更新跳轉關係(坑了我2天) def __getitem__(self, key: str): return self.content_[key] def __setitem__(self, key: str, value: STNode): self.content_[key] = value def __contains__(self, key: str): return key in self.content_ def __iter__(self): # 只有自動測試的時候用 yield from curr_str[self.op:self.ed] @property def ed(self): return self._ed if self._ed != :ed else len(curr_str) @ed.setter def ed(self, val): self._ed = val @property def is_root(self): return self.op is None and self.ed is None @property def is_inner(self): return not self.is_root and self.content_ @property def is_leaf(self): return not self.is_root and not self.content_class SuffixTree: def __init__(self): self.root = STNode() STNode.default_suffix_link = self.root self.remainder = 0 self.cursor = 0 self.act_node = self.root self.act_direct = 0 self.act_offset = 0 def insert_char(self, char: str): global curr_str curr_str += char self.remainder += 1 def case_root(): if char not in self.root: leaf_node = STNode() leaf_node.parent = self.root leaf_node.op = self.cursor leaf_node.ed = :ed # 為了更符合我的直覺, 我 ed 用一個特殊標記, 其實可以換成真實 string 的長度 self.root[char] = leaf_node self.remainder -= 1 else: edge_node = self.root[char] self.act_direct = edge_node.op self.act_offset += 1 assert self.act_offset == 1 if self.act_node.is_root and self.act_offset == 0: case_root() else: edge_node = self.act_node[curr_str[self.act_direct]] # 節約資源, 我用 node 表示 edge if edge_node.op + self.act_offset == edge_node.ed and char in edge_node: self.act_node = edge_node self.act_direct = edge_node[char].op self.act_offset = 1 elif char == curr_str[edge_node.op + self.act_offset]: self.act_offset += 1 else: prev_inner_node: STNode = None # 非常重要! 更新 suffix link def split_grow(): nonlocal prev_inner_node leaf_node = STNode() leaf_node.op = self.cursor leaf_node.ed = :ed self.remainder -= 1 if (edge_node.is_leaf or edge_node.ed - edge_node.op > 1) and edge_node.op + self.act_offset != edge_node.ed: inner_node = STNode() if prev_inner_node: prev_inner_node.suffix_link = inner_node prev_inner_node = inner_node inner_node.op = edge_node.op inner_node.ed = inner_node.op + self.act_offset inner_node.parent = edge_node.parent inner_node.parent[curr_str[inner_node.op]] = inner_node edge_node.op = inner_node.ed edge_node.parent = inner_node inner_node[curr_str[edge_node.op]] = edge_node inner_node[curr_str[leaf_node.op]] = leaf_node leaf_node.parent = inner_node else: if prev_inner_node: prev_inner_node.suffix_link = edge_node prev_inner_node = edge_node edge_node[curr_str[leaf_node.op]] = leaf_node leaf_node.parent = edge_node def overflow_fix(): # suffix link 跳轉之後, act_offset 會溢出 nonlocal edge_node edge_node = self.act_node[curr_str[self.act_direct]] supply = edge_node.ed - edge_node.op if self.act_offset > supply: self.act_node = edge_node self.act_direct += supply self.act_offset -= supply return overflow_fix() while self.remainder > 0: split_grow() if not self.act_node.is_inner: self.act_offset -= 1 self.act_direct += 1 if self.act_offset > 0: overflow_fix() else: case_root() break else: self.act_node = self.act_node.suffix_link overflow_fix() if edge_node.op + self.act_offset == edge_node.ed and char in edge_node: self.act_node = edge_node self.act_direct = edge_node[char].op self.act_offset = 1 if prev_inner_node: prev_inner_node.suffix_link = self.act_node break elif char == curr_str[edge_node.op + self.act_offset]: break self.cursor += 1 def __contains__(self, item: str): edge_node = self.root.content_.get(item[0]) if edge_node is None: return False i = 0 while True: for exist_char in edge_node: if exist_char != item[i]: return False else: i += 1 if i == len(item): return True edge_node = edge_node.content_[item[i]]if __name__ == __main__: from random import choice st = SuffixTree() def bundle_test(test_data): global curr_str curr_str = for i, char in enumerate(test_data): st.insert_char(char) for start in range(i + 1): assert test_data[start:i + 1] in st st.__init__() alphabet = (A, B, C, D, E) for _ in range(10000): test_d = for _ in range(20): test_d += choice(alphabet) print(test_d) bundle_test(test_d)

會Python的童鞋如果看了我最後的測試應該就知道這份代碼的正確性是毫無疑問的。

我自己沒時間寫解釋,但已經有大神寫了傳說級答案。這裡是傳送門,Ukkonens suffix tree algorithm in plain English?

後綴樹理解起來容易,但寫起來真是一把辛酸淚,希望大家少踩坑。

推薦閱讀:

030 Substring with Concatenation of All Words[H]
Q-learning 和 DQN
番外篇 · 補充說明
018 4Sum[M]
發現交互之美丨妙筆生花

TAG:演算法 | 演算法與數據結構 | 演算法設計 |