这两天在折腾下数据的分析及导出,爬虫抓取页面的时候,我们会坐做关键字的匹配,在数据库中标记这个url是否有我们需要的关键字。 这个时候你不能再用find()了,这太没有效率了,而且你会发现在同时处理几千个任务的时候,会出现cpu的瓶颈。 如果采用ahocorasick来实现,可以很有效的减轻cpu的消耗。
AC自动机是多模式匹配的一个经典数据结构,原理是和KMP一样的构造fail指针,不过AC自动机是在Trie树上构造的,但原理是一样的。我这里就不多说ac的原理了,有兴趣的朋友可以自己找自己瞅瞅。
python下已经有个现成的库了 ,大家直接 pip install ahocorasick .
import ahocorasick
tree = ahocorasick.KeywordTree()
tree.add("alpha")
tree.add("alpha beta")
tree.add("gamma")
tree.make()
tree.search("I went to alpha beta the other day to pick up some spam")
(10, 15)
tree.search_long("I went to alpha beta the other day to pick up some spam")
(10, 20)
tree.search("and also got some alphabet soup")
(18, 23)
tree.search("but no waffles")
tree.search_long("oh, gamma rays are not tasty")
(4, 9)
tree.findall("I went to alpha beta to pick up alphabet soup")
for match in tree.findall("I went to alpha beta to pick up alphabet soup"):
... print match
...
(10, 15)
(32, 37)
如果不想用现成的ac的库,可以自己照着ac的原理写一份,只是过程很痛苦,我还是喜欢拿来主意 。毕竟实现算法的过程是个十分xxx的事情。
#coding=utf-8
KIND = 16
#BASE = ord('a')
class Node():
static = 0
def __init__(self):
self.fail = None
self.next = [None]*KIND
self.end = False
self.word = None
Node.static += 1
class AcAutomation():
def __init__(self):
self.root = Node()
self.queue = []
def getIndex(self,char):
return ord(char)# - BASE
def insert(self,string):
p = self.root
for char in string:
index = self.getIndex(char)
if p.next[index] == None:
p.next[index] = Node()
p = p.next[index]
p.end = True
p.word = string
def build_automation(self):
self.root.fail = None
self.queue.append(self.root)
while len(self.queue)!=0:
parent = self.queue[0]
self.queue.pop(0)
for i,child in enumerate(parent.next):
if child == None:continue
if parent == self.root:
child.fail = self.root
else:
failp = parent.fail
while failp != None:
if failp.next[i] != None:
child.fail = failp.next[i]
break
failp = failp.fail
if failp==None: child.fail=self.root
self.queue.append(child)
def matchOne(self,string):
p = self.root
for char in string:
index = self.getIndex(char)
while p.next[index]==None and p!=self.root: p=p.fail
if p.next[index]==None:p=self.root
else: p=p.next[index]
if p.end:return True,p.word
return False,None
class UnicodeAcAutomation():
def __init__(self,encoding='utf-8'):
self.ac = AcAutomation()
self.encoding = encoding
def getAcString(self,string):
string = bytearray(string.encode(self.encoding))
ac_string = ''
for byte in string:
ac_string += chr(byte%16)
ac_string += chr(byte/16)
#print ac_string
return ac_string
def insert(self,string):
if type(string) != unicode:
raise Exception('UnicodeAcAutomation:: insert type not unicode')
ac_string = self.getAcString(string)
self.ac.insert(ac_string)
def build_automation(self):
self.ac.build_automation()
def matchOne(self,string):
if type(string) != unicode:
raise Exception('UnicodeAcAutomation:: insert type not unicode')
ac_string = self.getAcString(string)
retcode,ret = self.ac.matchOne(ac_string)
if ret!=None:
s = ''
for i in range(len(ret)/2):
s += chr(ord(ret[2*i])+ord(ret[2*i+1])*16)
ret = s.decode('utf-8')
return retcode,ret
def main2():
ac = UnicodeAcAutomation()
ac.insert(u'丁亚光')
ac.insert(u'好吃的')
ac.insert(u'好玩的')
ac.build_automation()
print ac.matchOne(u'hi,丁亚光在干啥')
print ac.matchOne(u'ab')
print ac.matchOne(u'不能吃饭啊')
print ac.matchOne(u'饭很好吃,有很多好好的吃的,')
print ac.matchOne(u'有很多好玩的')
if __name__ == '__main__':
main2()
有个网友已经把ac自动机和redis做了一些联合 。
#coding: utf-8
'''
使用redis的ac算法
'''
import redis
def smart_unicode(s, encoding='utf-8'):
ret = s
if type(s) is str:
ret = s.decode(encoding)
return ret
def smart_str(s, encoding='utf-8'):
ret = s
if type(s) is unicode:
ret = s.encode(encoding)
return ret
class RedisACKeywords(object):
'''
(1) Efficient String Matching: An Aid to Bibliographic Search
(2) Construction of Aho Corasick Automaton in Linear Time for Integer Alphabets
'''
# %s is name
KEYWORD_KEY=u'%s:keyword'
PREFIX_KEY=u'%s:prefix'
SUFFIX_KEY=u'%s:suffix'
# %s is keyword
OUTPIUT_KEY=u'%s:output'
NODE_KEY=u'%s:node'
def __init__(self, host='localhost', port=6379, db=12, name='RedisACKeywords', encoding='utf8'):
'''
db: 7+5 because 1975
'''
self.encoding = encoding
self.client = redis.Redis(host=host, port=port, db=db)
self.client.ping()
self.name = smart_unicode(name)
# Init trie root
self.client.zadd(self.PREFIX_KEY % self.name, u'', 1.0)
def add(self, keyword):
keyword = keyword.strip().lower()
assert keyword
keyword = smart_unicode(keyword)
# Add keyword in keyword set
self.client.sadd(self.KEYWORD_KEY % self.name, keyword)
self._build_trie(keyword)
num = self.client.scard(self.KEYWORD_KEY % self.name)
return num
def remove(self, keyword):
assert keyword
keyword = keyword.strip().lower()
keyword = smart_unicode(keyword)
self._remove(keyword)
self.client.srem(self.KEYWORD_KEY % self.name, keyword)
num = self.client.scard(self.KEYWORD_KEY % self.name)
return num
def find(self, text):
ret = []
i = 0
state = u''
utext = smart_unicode(text)
while i < len(utext):
c = utext[i]
next_state = self._go(state, c)
if next_state is None:
next_state = self._fail(state + c)
####################
## the above line likes take same effect as this block
#next_state = self._fail(state)
#_next_state = self._go(next_state, c)
#if _next_state is None:
# _next_state = self._fail(next_state + c)
#next_state = _next_state
######################
outputs = self._output(state)
ret.extend(outputs)
state = next_state
i += 1
# check last state
outputs = self._output(state)
ret.extend(outputs)
return ret
def flush(self):
keywords = self.client.smembers(self.KEYWORD_KEY % self.name)
for keyword in keywords:
self.client.delete(self.OUTPIUT_KEY % smart_unicode(keyword))
self.client.delete(self.NODE_KEY % smart_unicode(keyword))
self.client.delete(self.PREFIX_KEY % self.name)
self.client.delete(self.SUFFIX_KEY % self.name)
self.client.delete(self.KEYWORD_KEY % self.name)
def info(self):
return {
'keywords':self.client.scard(self.KEYWORD_KEY % self.name),
'nodes':self.client.zcard(self.PREFIX_KEY % self.name),
}
def suggest(self, input):
input = smart_unicode(input)
ret = []
rank = self.client.zrank(self.PREFIX_KEY % self.name, input)
a = self.client.zrange(self.PREFIX_KEY % self.name, rank, rank)
while a:
node = smart_unicode(a[0])
if node.startswith(input) and self.client.sismember(self.KEYWORD_KEY % self.name, node):
ret.append(node)
rank += 1
a = self.client.zrange(self.PREFIX_KEY % self.name, rank, rank)
return ret
def _go(self, state, c):
'''
转向函数
'''
assert type(state) is unicode
next_state = state + c
i = self.client.zscore(self.PREFIX_KEY % self.name, next_state)
if i is None:
return None
return next_state
def _build_trie(self, keyword):
assert type(keyword) is unicode
l = len(keyword)
for i in xrange(l): # trie depth increase
prefix = keyword[:i+1] # every prefix is a node
_suffix = u''.join(reversed(prefix))
if self.client.zscore(self.PREFIX_KEY % self.name, prefix) is None: # node does not exist
self.client.zadd(self.PREFIX_KEY % self.name, prefix, 1.0)
self.client.zadd(self.SUFFIX_KEY % self.name, _suffix, 1.0) # reversed suffix node
self._rebuild_output(_suffix)
else:
if (self.client.sismember(self.KEYWORD_KEY % self.name, prefix)): # node may change, rebuild affected nodes
self._rebuild_output(_suffix)
def _rebuild_output(self, _suffix):
assert type(_suffix) is unicode
rank = self.client.zrank(self.SUFFIX_KEY % self.name, _suffix)
a = self.client.zrange(self.SUFFIX_KEY % self.name, rank, rank)
while a:
suffix_ = smart_unicode(a[0])
if suffix_.startswith(_suffix):
state = u''.join(reversed(suffix_))
self._build_output(state)
else:
break
rank += 1 # TODO: Binary search?
a = self.client.zrange(self.SUFFIX_KEY % self.name, rank, rank)
def _build_output(self, state):
assert type(state) is unicode
outputs = []
if self.client.sismember(self.KEYWORD_KEY % self.name, state):
outputs.append(state)
fail_state = self._fail(state)
fail_output = self._output(fail_state)
if fail_output:
outputs.extend(fail_output)
if outputs:
self.client.sadd(self.OUTPIUT_KEY % state, *outputs)
for k in outputs:
self.client.sadd(self.NODE_KEY % k, state) # ref node for delete keywords in output
def _fail(self, state):
'''
失败函数
'''
assert type(state) is unicode
# max suffix node will be the failed node
next_state = u''
for i in xrange(1, len(state)+1): # depth increase
next_state = state[i:]
if self.client.zscore(self.PREFIX_KEY % self.name, next_state):
break
return next_state
def _output(self, state):
'''
输出函数
'''
assert type(state) is unicode
return [smart_unicode(k) for k in self.client.smembers(self.OUTPIUT_KEY % state)]
def debug_print(self):
keywords = self.client.smembers(self.KEYWORD_KEY % self.name)
if keywords:
print '-', self.KEYWORD_KEY % self.name, u' '.join(keywords)
prefix = self.client.zrange(self.PREFIX_KEY % self.name, 0, -1)
if prefix:
prefix[0] = u'.'
print '-', self.PREFIX_KEY % self.name, u' '.join(prefix)
suffix = self.client.zrange(self.SUFFIX_KEY % self.name, 0, -1)
if suffix:
print '-', self.SUFFIX_KEY % self.name, u' '.join(suffix)
outputs = []
for node in prefix:
output = self._output(smart_unicode(node))
outputs.append({node: output})
if outputs:
print '-', 'outputs', outputs
nodes = []
for keyword in keywords:
keyword_nodes = self.client.smembers(self.NODE_KEY % smart_unicode(keyword))
nodes.append({keyword: keyword_nodes})
if nodes:
print '-', 'nodes', nodes
def _remove(self, keyword):
assert type(keyword) is unicode
nodes = self.client.smembers(self.NODE_KEY % keyword)
for node in nodes:
self.client.srem(self.OUTPIUT_KEY % smart_unicode(node), keyword)
self.client.delete(self.NODE_KEY % keyword)
# remove nodes if need
l = len(keyword)
for i in xrange(l, 0, -1): # depth decrease
prefix = keyword[:i]
if self.client.sismember(self.KEYWORD_KEY % self.name, prefix) and i!=l:
break
_suffix = u''.join(reversed(prefix))
rank = self.client.zrank(self.PREFIX_KEY % self.name, prefix)
if rank is None:
break
a = self.client.zrange(self.PREFIX_KEY % self.name, rank+1, rank+1)
if a:
prefix_ = smart_unicode(a[0])
if not prefix_.startswith(prefix):
self.client.zrem(self.PREFIX_KEY % self.name, prefix)
self.client.zrem(self.SUFFIX_KEY % self.name, _suffix)
else:
break
else:
self.client.zrem(self.PREFIX_KEY % self.name, prefix)
self.client.zrem(self.SUFFIX_KEY % self.name, _suffix)
if __name__ == '__main__':
acs = RedisACKeywords(name='test3')
ks = ['aabbc', 'abbc', 'bbb']
for k in ks:
acs.add(k)
print 'find result in aaabbbccc is :', acs.find('aaabbbccc')
acs.flush()
keywords = RedisACKeywords(name='test2')
ks = ['her', 'he', 'his', 'here', 'there']
for k in ks:
keywords.add(k)
keywords.debug_print()
print '>>>>>>>>>>>>'
text = 'here'
print 'text: %s' % text
print 'keywords: %s added. ' % ' '.join(ks), keywords.find(text) # her, he
input = 'he'
print 'suggest %s: ' % input, keywords.suggest(input) # her, he
text = 'ushers'
print 'text: %s' % text
print 'keywords: %s added. ' % ' '.join(ks), keywords.find(text) # her, he
ks2 = ['she', 'hers']
for k in ks2:
keywords.add(k)
print 'keywords: %s added. ' % ' '.join(ks2), keywords.find(text) # her, he, she, hers
keywords.add('h')
print 'h added. ', keywords.find(text) # her, he, she, hers, h
keywords.remove('h')
print 'h removed. ', keywords.find(text) # her, he, she, hers
keywords.flush()
print 'flushed. ', keywords.find(text) # []

KMP 这些都开始研究啦,不错