|
@@ -0,0 +1,250 @@
|
|
|
|
|
+#!/usr/bin/env python
|
|
|
|
|
+# -*- coding:utf-8 -*-
|
|
|
|
|
+# ToolGood.Words.WordsSearch.py
|
|
|
|
|
+# 2020, Lin Zhijun, https://github.com/toolgood/ToolGood.Words
|
|
|
|
|
+# Licensed under the Apache License 2.0
|
|
|
|
|
+# 更新日志
|
|
|
|
|
+# 2020.04.06 第一次提交
|
|
|
|
|
+# 2020.05.16 修改,支持大于0xffff的字符
|
|
|
|
|
+
|
|
|
|
|
+__all__ = ['WordsSearch']
|
|
|
|
|
+__author__ = 'Lin Zhijun'
|
|
|
|
|
+__date__ = '2020.05.16'
|
|
|
|
|
+
|
|
|
|
|
class TrieNode():
    """Mutable trie node used while the keyword automaton is being built.

    All fields are plain public attributes; ``WordsSearch.SetKeywords``
    reads and writes them directly while constructing the trie.
    """

    def __init__(self):
        self.Index = 0       # slot in the flattened node list; filled in by the builder
        self.Layer = 0       # depth in the trie; 0 for the root / not yet assigned
        self.End = False     # True once at least one result has been recorded
        self.Char = ''       # key under which the parent stores this node
        self.Results = []    # keyword indexes recorded via SetResults
        self.m_values = {}   # child nodes keyed by the value passed to Add
        self.Failure = None  # failure link; assigned by the builder
        self.Parent = None   # node whose Add call created this one

    def Add(self, c):
        """Return the child stored under ``c``, creating it when missing.

        A newly created child records this node as its ``Parent`` and
        ``c`` as its ``Char``.
        """
        if c in self.m_values:
            return self.m_values[c]
        child = TrieNode()
        child.Parent = self
        child.Char = c
        self.m_values[c] = child
        return child

    def SetResults(self, index):
        """Mark the node as terminal and append ``index`` to ``Results``.

        No de-duplication is done here; every call appends.
        """
        self.End = True
        self.Results.append(index)
|
|
|
|
|
+
|
|
|
|
|
class TrieNode2():
    """Search-time automaton node: transitions, results and a key range.

    ``minflag`` / ``maxflag`` track the smallest and largest key passed to
    ``Add`` so that ``TryGetValue`` can reject out-of-range keys before
    touching the transition table.
    """

    def __init__(self):
        self.End = False     # True once at least one result has been recorded
        self.Results = []    # keyword indexes, without duplicates
        self.m_values = {}   # transitions: key -> target node
        # Start from an inverted (empty) range.  The lower bound begins at
        # the highest Unicode code point rather than 0xffff, so that it still
        # tightens to the real minimum when every key is above 0xffff.
        self.minflag = 0x10ffff
        self.maxflag = 0

    def Add(self, c, node3):
        """Register (or overwrite) the transition ``c -> node3``."""
        if c < self.minflag:
            self.minflag = c
        if c > self.maxflag:
            self.maxflag = c
        self.m_values[c] = node3

    def SetResults(self, index):
        """Mark the node as terminal and record ``index`` once."""
        self.End = True
        if index not in self.Results:
            self.Results.append(index)

    def HasKey(self, c):
        """Return True when a transition for ``c`` exists."""
        return c in self.m_values

    def TryGetValue(self, c):
        """Return the node reached via ``c``, or None when there is none."""
        if self.minflag <= c <= self.maxflag:
            return self.m_values.get(c)
        return None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class WordsSearch():
    """Multi-keyword text search backed by a trie with failure links.

    Call ``SetKeywords`` to build the automaton, then use ``FindFirst``,
    ``FindAll``, ``ContainsAny`` or ``Replace`` on any number of texts.
    Before ``SetKeywords`` is called the search methods simply find nothing.
    """

    def __init__(self):
        # An empty node (not a dict) so that searching before SetKeywords
        # yields "no match" instead of raising AttributeError.
        self._first = TrieNode2()
        self._keywords = []
        self._indexs = []

    def SetKeywords(self, keywords):
        """Build the search automaton for ``keywords``.

        ``keywords`` must be an indexable sequence of strings; it is stored
        by reference and the positions in it are what the ``"Index"`` field
        of a match reports.  Empty keywords are ignored: they can never
        produce a meaningful match.
        """
        self._keywords = keywords
        self._indexs = list(range(len(keywords)))

        # Phase 1: plain trie.  Nodes are grouped by the depth at which they
        # were first created.
        root = TrieNode()
        layers = {}
        for i, word in enumerate(keywords):
            nd = root
            for depth, ch in enumerate(word, 1):
                nd = nd.Add(ord(ch))
                if nd.Layer == 0:
                    nd.Layer = depth
                    layers.setdefault(depth, []).append(nd)
            # An empty keyword would mark the root as terminal, and the
            # failure pass below would copy that zero-length result into
            # every depth-1 node, producing bogus matches.
            if nd is not root:
                nd.SetResults(i)

        # Phase 2: flatten breadth-first (root first, then by depth) so that
        # a node's failure target is always processed before the node itself.
        all_nodes = [root]
        for depth in sorted(layers):
            all_nodes.extend(layers[depth])

        # Phase 3: failure links.  root.Failure stays None during this loop
        # so that walking a failure chain terminates at the root.
        for i, nd in enumerate(all_nodes):
            if nd is root:
                continue
            nd.Index = i
            c = nd.Char
            r = nd.Parent.Failure
            while r is not None and c not in r.m_values:
                r = r.Failure
            nd.Failure = root if r is None else r.m_values[c]
            # Keywords ending at the failure target also end here.
            for inherited in nd.Failure.Results:
                nd.SetResults(inherited)
        root.Failure = root

        # Phase 4: convert to search nodes.  Each node receives its own
        # transitions plus those reachable through its failure chain (the
        # root's transitions excluded; the search falls back to the root
        # explicitly), so matching never has to follow failure links.
        compact = [TrieNode2() for _ in all_nodes]
        for old, new in zip(all_nodes, compact):
            for key, child in old.m_values.items():
                new.Add(key, compact[child.Index])
            for item in old.Results:
                new.SetResults(item)

            fail = old.Failure
            while fail is not root:
                for key, child in fail.m_values.items():
                    if not new.HasKey(key):
                        new.Add(key, compact[child.Index])
                for item in fail.Results:
                    new.SetResults(item)
                fail = fail.Failure

        self._first = compact[0]

    def _scan(self, text):
        """Yield ``(index, node)`` for each position where a keyword ends."""
        first = self._first
        ptr = None
        for index, ch in enumerate(text):
            t = ord(ch)
            if ptr is None:
                tn = first.TryGetValue(t)
            else:
                tn = ptr.TryGetValue(t)
                if tn is None:
                    # No continuation from the current state: restart
                    # from the root with the same character.
                    tn = first.TryGetValue(t)
            if tn is not None and tn.End:
                yield index, tn
            ptr = tn

    def _make_result(self, item, end):
        """Build the match dictionary for keyword ``item`` ending at ``end``."""
        keyword = self._keywords[item]
        return {
            "Keyword": keyword,
            "Success": True,
            "End": end,
            "Start": end + 1 - len(keyword),
            "Index": self._indexs[item],
        }

    def FindFirst(self, text):
        """Return the first match in ``text`` as a dict, or None.

        The dict has the keys ``Keyword``, ``Success``, ``Start``, ``End``
        (inclusive) and ``Index`` (position of the keyword in the list given
        to ``SetKeywords``).
        """
        hit = next(self._scan(text), None)
        if hit is None:
            return None
        index, node = hit
        return self._make_result(node.Results[0], index)

    def FindAll(self, text):
        """Return a list of match dicts for every keyword occurrence."""
        matches = []
        for index, node in self._scan(text):
            for item in node.Results:
                matches.append(self._make_result(item, index))
        return matches

    def ContainsAny(self, text):
        """Return True when ``text`` contains at least one keyword."""
        return next(self._scan(text), None) is not None

    def Replace(self, text, replaceChar='*'):
        """Return ``text`` with every matched keyword masked.

        Each character position covered by a match is replaced by
        ``replaceChar``.
        """
        result = list(text)
        for end, node in self._scan(text):
            # Results[0] is the node's own keyword when it has one; inherited
            # (shorter) keywords are appended after it during the build.
            length = len(self._keywords[node.Results[0]])
            start = end + 1 - length
            result[start:end + 1] = [replaceChar] * length
        return ''.join(result)
|