字典树的应用

之前我们实现了一个简单的字典树,现在我们学习一下它的应用场合:

  • 前缀匹配:给定字典库,输入一段字符,返回以该字符串为前缀的所有单词。

  • 词频统计:给出一段文本,统计其中指定单词出现的频数。

前缀匹配

假设我们还是这几个单词:apps apple cook cookie cold,当我们想获得以co为前缀的单词时,只需要在字典树中依次找到c、o节点,然后搜索o节点的所有子树,取出其中的单词即可。

在上一篇博客中,我们已经实现了字典树的基本操作,这里只需要再加上一个前缀匹配方法即可。具体流程如下,将前缀字符串标记为当前前缀,将根节点标记为当前节点,执行操作1:

1.当前前缀为空,对当前节点执行操作2。否则,取出当前前缀的首字符,标记为X,遍历当前节点的子节点,如果X存在于子节点N中,将N标记为当前节点,将剩余字符串标记为当前前缀,重复操作1;如果X不存在于子节点中,返回None。

2.以当前节点为根节点,进行深度优先搜索,取得当前节点所有子树下的所有单词。

python实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# -*- coding: utf-8 -*-
class Trie_tree(object):
    """Character trie supporting insert, lookup, delete and prefix matching.

    Every node is a four-element list ``[father, child, keep_char, is_word]``:
    the parent node (``None`` for the root), the list of child nodes, the
    character stored on the node (``None`` for the root) and a flag telling
    whether the path from the root down to this node spells a stored word.
    """

    def __init__(self):
        # node = [father, child, keep_char, is_word]
        self._root = [None, [], None, False]
        # Scratch stack of characters used by print_tree while it walks.
        self.tmp = []

    def insert(self, word):
        """Store *word* in the trie; storing it again changes nothing."""
        self._insert_operation_1(word, self._root)

    def _insert_operation_1(self, current_word, current_node):
        """Follow the existing path for *current_word*, then create the rest.

        When the whole word is consumed the reached node is flagged as a word.
        """
        if not current_word:
            current_node[3] = True
            return
        first_char = current_word[0]
        for node in current_node[1]:
            if first_char == node[2]:
                self._insert_operation_1(current_word[1:], node)
                return
        # No child carries first_char: the remainder needs brand-new nodes.
        self._insert_operation_2(current_word, current_node)

    def _insert_operation_2(self, current_word, current_node):
        """Append a fresh chain of nodes spelling *current_word*.

        *current_word* must be non-empty; the last node created is flagged as
        a word.
        """
        new_node = [current_node, [], current_word[0], False]
        current_node[1].append(new_node)  # register as a child of the parent
        remainder = current_word[1:]
        if remainder:
            self._insert_operation_2(remainder, new_node)
        else:
            new_node[3] = True

    def find(self, word):
        """Return True if *word* was stored as a complete word, else False."""
        return self._find_operation(word, self._root)

    def _find_operation(self, current_word, current_node):
        """Walk down from *current_node* along *current_word*.

        Returns the ``is_word`` flag of the node reached, or False when the
        path does not exist.
        """
        if not current_word:
            return current_node[3]
        first_char = current_word[0]
        for node in current_node[1]:
            if first_char == node[2]:
                return self._find_operation(current_word[1:], node)
        return False

    def delete(self, word):
        """Remove *word* from the trie.

        Returns True when the word was stored and has been removed, False
        when it was not stored (including when it is only a prefix of other
        stored words).
        """
        return self._delete_operation(word, self._root)

    def _delete_operation(self, current_word, current_node):
        """Locate the node for *current_word* and remove the word it marks.

        After clearing the flag, nodes that no longer lead to any word are
        unlinked bottom-up so that no dead branch is left behind.
        """
        if current_word:
            first_char = current_word[0]
            for node in current_node[1]:
                if first_char == node[2]:
                    return self._delete_operation(current_word[1:], node)
            return False

        # The path exists, but it only counts if a word really ends here;
        # otherwise a bare prefix would be reported as deleted.
        if not current_node[3]:
            return False
        current_node[3] = False

        # Prune upwards while the node is useless: no children, not a word.
        # The root (father is None) is never removed.
        while (current_node[0] is not None
               and not current_node[1]
               and not current_node[3]):
            father_node = current_node[0]
            # Remove by identity: nodes are cyclic lists (child -> father ->
            # child), so value comparison between siblings is best avoided.
            father_node[1][:] = [
                child for child in father_node[1] if child is not current_node
            ]
            current_node = father_node
        return True

    def print_tree(self, current_node):
        """Print every word stored in the subtree rooted at *current_node*."""
        if current_node:
            self._dfs_print(current_node)

    def _dfs_print(self, current_node):
        """Depth-first walk that prints the accumulated characters at words."""
        pushed = bool(current_node[2])
        if pushed:
            self.tmp.append(current_node[2])
        # A set flag means the characters collected so far spell a word.
        if current_node[3]:
            print("".join(self.tmp))
        for child in current_node[1]:
            self._dfs_print(child)
        # Undo only what this call pushed (the root pushes nothing).
        if pushed:
            self.tmp.pop()

    # Prefix matching
    def pre_match(self, pre_str):
        """Return the completions of *pre_str*.

        The result is a list of suffixes: ``pre_str + suffix`` is a stored
        word for every entry, and the empty suffix is included when *pre_str*
        itself is a word. Returns None when no path spells *pre_str*.
        """
        return self._pre_match_op(pre_str, self._root)

    def _pre_match_op(self, current_word, current_node):
        """Descend along *current_word*, then collect the subtree's words."""
        if current_word:
            first_char = current_word[0]
            for node in current_node[1]:
                if first_char == node[2]:
                    return self._pre_match_op(current_word[1:], node)
            return None

        match_word = []
        # The prefix itself is already a word.
        if current_node[3]:
            match_word.append("")
        self._pre_match_dfs("", current_node, match_word)
        return match_word

    def _pre_match_dfs(self, keep_char, current_node, match_word):
        """Append to *match_word* every word suffix below *current_node*.

        *keep_char* is the suffix accumulated so far; the same list that was
        passed in is returned.
        """
        for child in current_node[1]:
            word = keep_char + child[2]
            if child[3]:
                match_word.append(word)
            self._pre_match_dfs(word, child, match_word)
        return match_word
if __name__ == '__main__':
    # Demo: load a whitespace-separated word list into the trie, then read
    # one prefix from standard input and print every stored completion.
    trie = Trie_tree()
    print("--------Insert dict---------")
    with open("words.txt", "r") as dict_file:
        for row in dict_file:
            for token in row.split():
                trie.insert(token)
    print("--------Enter pre_str-------")
    try:
        pre_str = input()
        print("--------Pre match-----------")
        words = trie.pre_match(pre_str)
        if words:
            # pre_match returns suffixes, so glue the prefix back on.
            for suffix in words:
                print(pre_str + suffix)
            print("Totaly", len(words), "words")
        else:
            # Covers both None (no such path) and an empty result list.
            print("No word starts with ", pre_str)
    except EOFError:
        # Standard input was closed before a prefix was entered.
        print("Exit......")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
--------Insert dict --------
--------Enter pre_str-------
co
--------Pre match -------
could
course
court
courage
count
...
...
cottage
coin
Totaly 90 words

词频统计

要实现词频统计的话很简单,只要将我们之前定义节点的数据结构稍作修改,就可以用于统计字频了。把原来数据结构中的标记位改为频数位,即保存该单词出现的次数。然后,再把原有字典树实现中的插入操作和查找操作稍微改动,就可以实现字频统计功能了。

即 node = [father, child, keep_char, is_word] -> node = [father, child, keep_char, word_count]

插入操作:将单词标记为当前单词,将根节点标记为当前节点,执行操作1:

1.当前单词为空,当前节点单词出现频数加1,终止操作;否则取出当前单词的首字符记为X,遍历当前节点的子节点:如果X存在于子节点N,将剩余字符标记为当前单词,将N标记为当前节点,重复操作1,如果X不存在于当前节点的子节点中,那么进入操作2。

2.取出当前单词的首字符记为X,新建一个节点M存储X,M的父节点为当前节点。剩余字符串记为当前单词,如果当前单词为空,M节点单词出现频数加1,终止操作;否则,将M标记为当前节点,重复操作2。

查询操作:将单词标记为当前单词,将根节点标记为当前节点,执行操作1:

1.当前单词为空,返回当前节点字频数,即为该单词出现的次数。否则,取出当前单词的首字符,标记为X,遍历当前节点的子节点,如果X存在于子节点N中,将N标记为当前节点,将剩余字符串标记为当前单词,重复操作1;如果X不存在于子节点中,返回0。

python 实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# -*- coding: utf-8 -*-
class Trie_tree(object):
    """Trie that records how many times each word has been inserted.

    A node is the list ``[father, child, keep_char, word_count]``: parent
    node, list of child nodes, the character held by the node and the number
    of insertions that ended on it.
    """

    def __init__(self):
        # Node layout: [father, child, keep_char, word_count]
        self.tmp = []
        self._root = [None, [], None, 0]

    def insert(self, word):
        """Record one more occurrence of *word*."""
        self._insert_operation_1(word, self._root)

    def _insert_operation_1(self, current_word, current_node):
        """Walk the already existing part of the path for *current_word*.

        As soon as a character has no matching child the rest is handed to
        ``_insert_operation_2``; if the whole word is already present the
        counter of the final node is bumped.
        """
        node = current_node
        remaining = current_word
        while remaining:
            head = remaining[0]
            for candidate in node[1]:
                if head == candidate[2]:
                    node = candidate
                    remaining = remaining[1:]
                    break
            else:
                # Nothing below `node` carries `head`: build new nodes.
                self._insert_operation_2(remaining, node)
                return
        # Entire word consumed on an existing path: word_count++
        node[3] += 1

    def _insert_operation_2(self, current_word, current_node):
        """Create one new node per character of the non-empty *current_word*.

        The nodes are chained below *current_node*; the last one gets its
        counter incremented.
        """
        parent = current_node
        head, tail = current_word[0], current_word[1:]
        while True:
            fresh = [parent, [], head, 0]
            parent[1].append(fresh)  # hook the new node into its parent
            if not tail:
                fresh[3] += 1
                return
            parent = fresh
            head, tail = tail[0], tail[1:]

    def find(self, word):
        """Return how many times *word* was inserted (0 if never)."""
        return self._find_operation(word, self._root)

    def _find_operation(self, current_word, current_node):
        """Follow *current_word* from *current_node* and return the counter.

        Returns 0 as soon as a character has no matching child.
        """
        node = current_node
        remaining = current_word
        while remaining:
            head = remaining[0]
            for candidate in node[1]:
                if head == candidate[2]:
                    node = candidate
                    remaining = remaining[1:]
                    break
            else:
                return 0
        return node[3]
if __name__ == '__main__':
    # Demo: count word frequencies of article.txt with the counting trie.
    import re

    demo_trie = Trie_tree()
    print ("---------Build the trie----------")
    with open("article.txt", "r") as content:
        for line in content:
            for word in re.split(r"[\s\W]+", line):
                # re.split() yields an empty string when the line starts or
                # ends with a separator. Skip just that token: stopping at
                # the first empty token would drop every word of a line that
                # begins with whitespace or punctuation.
                if word and not word.isdigit():
                    demo_trie.insert(word.lower())
    print ("----------Count word test---------")
    print ("all :", demo_trie.find("all"))
    print ("is :", demo_trie.find("is"))
    print ("good :", demo_trie.find("good"))
    print ("packages :", demo_trie.find("packages"))

1
2
3
4
5
6
---------Build the trie----------
----------Count word test---------
all : 20
is : 10
good : 1
packages : 19

C语言实现

from 维基百科

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define TREE_WIDTH 256  /* number of child slots per node; children are indexed by character value */
#define WORDLENMAX 128  /* size (minus the terminator) of the buffer do_travel rebuilds words in */

/*
 * One trie node. A word is spelled by the chain of child indices followed
 * from the root to the node.
 */
struct trie_node_st {
    int count;  /* incremented by insert() on the node where a word ends */
    int pass;   /* incremented by insert() on every node a word steps through on its way deeper */
    struct trie_node_st *next[TREE_WIDTH];  /* child per character value, NULL when absent */
};

/* Statically allocated root of the trie: both counters zero, no children. */
static struct trie_node_st root = {0, 0, {NULL}};

/* Delimiter characters main() passes to strsep() to cut a line into words. */
static char *spaces=" \t\n/.\"\'()";
/*
 * Release the heap-allocated subtree rooted at rt: every non-NULL child is
 * freed recursively first, then the node itself. rt must not be NULL.
 */
void myfree(struct trie_node_st * rt)
{
    int slot = 0;

    while (slot < TREE_WIDTH) {
        if (rt->next[slot] != NULL) {
            myfree(rt->next[slot]);
            rt->next[slot] = NULL;  /* do not keep a pointer to freed memory */
        }
        slot++;
    }
    free(rt);
}
/*
 * Insert one NUL-terminated word into the trie rooted at the global `root`.
 *
 * Every node the word steps through gets its `pass` counter incremented and
 * the node on which the word ends gets its `count` incremented; missing
 * nodes are allocated zero-filled on the way down.
 *
 * Returns 0 on success (an empty word is ignored and also yields 0) and -1
 * when a node cannot be allocated; in that case the counters are left as
 * they were before the call.
 */
static int
insert (const char *word)
{
    size_t i;
    struct trie_node_st *curr, *newnode;

    if (word[0] == '\0') {
        return 0;
    }
    curr = &root;
    for (i = 0; word[i] != '\0'; ++i) {
        /* Plain char may be signed: a byte >= 0x80 would otherwise become a
         * negative index and address memory in front of next[]. */
        unsigned char slot = (unsigned char)word[i];

        if (curr->next[slot] == NULL) {
            newnode = (struct trie_node_st*)calloc(1, sizeof(struct trie_node_st));
            if (newnode == NULL) {
                /* Out of memory: undo the pass increments already applied
                 * to the i nodes visited so far, then report failure. */
                struct trie_node_st *undo = &root;
                size_t j;

                for (j = 0; j < i; ++j) {
                    undo->pass--;
                    undo = undo->next[(unsigned char)word[j]];
                }
                return -1;
            }
            curr->next[slot] = newnode;
        }
        curr->pass++;  /* one more word continues below this node */
        curr = curr->next[slot];
    }
    curr->count++;
    return 0;
}
/* Write one result line to standard output: the word, a tab, then n. */
static void
printword (const char *str, int n)
{
    fprintf (stdout, "%s\t%d\n", str, n);
}
/*
 * Depth-first walk over the subtree at rootp. For every node whose `count`
 * is non-zero, the word spelled by the path to it is printed together with
 * count + pass.
 *
 * The path is rebuilt in a static buffer shared by all recursion levels, so
 * the function is not reentrant. Branches deeper than WORDLENMAX characters
 * are not followed, because the buffer cannot hold them; words longer than
 * that are therefore not printed.
 *
 * Always returns 0.
 */
static int
do_travel (struct trie_node_st *rootp)
{
    static char worddump[WORDLENMAX+1];
    static int pos=0;
    int i;

    if (rootp == NULL) {
        return 0;
    }
    if (rootp->count) {
        worddump[pos]='\0';
        printword (worddump, rootp->count+rootp->pass);
    }
    /* A child would write its terminator at index pos + 1; stop here when
     * that would fall outside worddump. */
    if (pos >= WORDLENMAX) {
        return 0;
    }
    for (i=0;i<TREE_WIDTH;++i) {
        if (rootp->next[i] == NULL) {
            continue;  /* nothing below this slot */
        }
        worddump[pos++]=(char)i;
        do_travel (rootp->next[i]);
        pos--;
    }
    return 0;
}
/*
 * Read standard input line by line, split each line on the characters in
 * `spaces`, insert every non-empty token into the trie, print the collected
 * words, then release the line buffer and the heap-allocated nodes.
 */
int
main (void)
{
    char *linebuf = NULL;
    size_t bufsize = 0;
    int ret;
    int slot;

    while ((ret = getline (&linebuf, &bufsize, stdin)) != -1) {
        char *cursor = linebuf;
        char *word;

        /* strsep() returns empty tokens between adjacent delimiters. */
        while ((word = strsep (&cursor, spaces)) != NULL) {
            if (word[0] != '\0') {
                insert (word);
            }
        }
    }

    do_travel (&root);
    free (linebuf);

    /* root itself is static; only its children were heap-allocated. */
    for (slot = 0; slot < TREE_WIDTH; slot++) {
        if (root.next[slot] != NULL) {
            myfree (root.next[slot]);
        }
    }
    exit (0);
}