字典树的应用

之前我们实现了一个简单的字典树，现在我们学习一下它的应用场合：

前缀匹配：给定字典库，输入一段字符，返回以该字符串为前缀的所有单词。
词频统计：给出一段文本，统计其中指定单词出现的频数。

前缀匹配

假设我们还是这几个单词：apps apple cook cookie cold，当我们想获得以co为前缀的单词时，只需要在字典树中依次找到c、o节点，然后搜索o节点的所有子树，取出其中的单词即可。

在上一篇博客中，我们已经实现了字典树的基本操作，这里只需要再加上一个前缀匹配方法即可。具体流程如下，将前缀字符串标记为当前前缀，将根节点标记为当前节点，执行操作1：

1.若当前前缀为空，对当前节点执行操作2；否则，取出当前前缀的首字符，标记为X，遍历当前节点的子节点：如果X存在于子节点N中，将N标记为当前节点，将剩余字符串标记为当前前缀，重复操作1；如果X不存在于任何子节点中，返回None。

2.以当前节点为根节点，进行深度优先搜索，取得当前节点所有子树下的所有单词。

python实现

# -*- coding: utf-8 -*-
class Trie_tree(object):
    """Character trie supporting insert, lookup, delete and prefix search.

    Every node is a four-element list ``[father, child, keep_char, is_word]``:
    the parent node (``None`` for the root), the list of child nodes, the
    character stored in the node (``None`` for the root) and a flag that is
    true when the path from the root to this node spells an inserted word.

    All traversals are iterative, so word length is not limited by the
    interpreter's recursion depth.
    """

    def __init__(self):
        # node = [father, child, keep_char, is_word]
        self._root = [None, [], None, False]
        # Stack of characters on the current path while print_tree is walking.
        self.tmp = []

    @staticmethod
    def _get_child(node, char):
        """Return the child of *node* that stores *char*, or None if absent."""
        for child in node[1]:
            if child[2] == char:
                return child
        return None

    def _locate(self, chars):
        """Follow *chars* from the root; return the node reached, or None."""
        node = self._root
        for char in chars:
            node = self._get_child(node, char)
            if node is None:
                return None
        return node

    def insert(self, word):
        """Add *word* to the trie, creating the missing nodes along its path.

        Inserting the empty string marks the root itself as a word.
        """
        node = self._root
        for char in word:
            child = self._get_child(node, char)
            if child is None:
                child = [node, [], char, False]
                node[1].append(child)
            node = child
        node[3] = True

    def find(self, word):
        """Return True if *word* was inserted as a whole word, else False."""
        node = self._locate(word)
        return False if node is None else node[3]

    def delete(self, word):
        """Remove *word* from the trie.

        Returns True when the word was present and has been removed, False
        when it was not stored (including strings that are only a prefix of
        stored words).
        """
        node = self._locate(word)
        if node is None or not node[3]:
            return False
        node[3] = False
        # Prune the branch upwards while it carries no word and no children,
        # so that no dead chain is left behind. The root is never removed.
        while node[0] is not None and not node[1] and not node[3]:
            parent = node[0]
            siblings = parent[1]
            # Remove by identity: nodes are self-referencing lists, so the
            # ``==`` comparison used by list.remove would walk into them.
            for index, sibling in enumerate(siblings):
                if sibling is node:
                    del siblings[index]
                    break
            node = parent
        return True

    def print_tree(self, current_node):
        """Print every word stored in the subtree rooted at *current_node*.

        Words are printed one per line, built from the characters of
        *current_node* and its descendants only.
        """
        if current_node:
            self._dfs_print(current_node)

    def _dfs_print(self, current_node):
        """Depth-first helper of print_tree; uses self.tmp as the path stack."""
        pushed = bool(current_node[2])
        if pushed:
            self.tmp.append(current_node[2])
        # if it is a word
        if current_node[3]:
            print("".join(self.tmp))
        for child in current_node[1]:
            self._dfs_print(child)
        # Pop only what this call pushed, so the stack stays balanced.
        if pushed:
            self.tmp.pop()

    def pre_match(self, pre_str):
        """Return the suffixes that complete *pre_str* into stored words.

        The result is None when no node matches the non-empty prefix path.
        Otherwise it is a list of suffixes in depth-first order (children in
        insertion order); the empty suffix ``""`` comes first when *pre_str*
        itself is a stored word. Callers rebuild full words with
        ``pre_str + suffix``.
        """
        start = self._locate(pre_str)
        if start is None:
            return None
        match_word = [""] if start[3] else []
        # Explicit stack; children are pushed reversed so they are popped in
        # insertion order, giving the same order as a recursive pre-order walk.
        stack = [(child, child[2]) for child in reversed(start[1])]
        while stack:
            node, suffix = stack.pop()
            if node[3]:
                match_word.append(suffix)
            stack.extend(
                (child, suffix + child[2]) for child in reversed(node[1])
            )
        return match_word
if __name__ == '__main__':
    # Demo: load a word list into the trie, read one prefix from stdin and
    # print every stored word that starts with it.
    demo_trie = Trie_tree()
    print("--------Insert dict---------")
    with open("words.txt", "r") as word_set:
        for line in word_set:
            for word in line.split():
                demo_trie.insert(word)
    print("--------Enter pre_str-------")
    try:
        pre_str = input()
    except EOFError:
        # stdin was closed before a prefix was entered
        print("Exit......")
    else:
        print("--------Pre match-----------")
        # pre_match returns None when nothing matches; normalise to a list so
        # the final len() call cannot fail.
        words = demo_trie.pre_match(pre_str) or []
        if not words:
            print("No word starts with ", pre_str)
        else:
            for word in words:
                # pre_match yields suffixes only; prepend the prefix
                print(pre_str + word)
        print("Totaly", len(words), "words")

--------Insert dict---------
--------Enter pre_str-------
co
--------Pre match-----------
could
course
court
courage
count
...
...
cottage
coin
Totaly 90 words

词频统计

要实现词频统计很简单，只要将我们之前定义的节点数据结构稍作修改，就可以用于统计词频了。把原来数据结构中的标记位改为频数位，即保存该单词出现的次数。然后，再把原有字典树实现中的插入操作和查找操作稍微改动，就可以实现词频统计功能了。

即 node = [father, child, keep_char, is_word] -> node = [father, child, keep_char, word_count]

插入操作：将单词标记为当前单词，将根节点标记为当前节点，执行操作1：

1.当前单词为空，当前节点单词出现频数加1，终止操作；否则取出当前单词的首字符记为X，遍历当前节点的子节点：如果X存在于子节点N，将剩余字符标记为当前单词，将N标记为当前节点，重复操作1，如果X不存在于当前节点的子节点中，那么进入操作2。

2.取出当前单词的首字符记为X，新建一个节点M存储X，M的父节点为当前节点。剩余字符串记为当前单词，如果当前单词为空，M节点单词出现频数加1，终止操作；否则，将M标记为当前节点，重复操作2。

查询操作：将单词标记为当前单词，将根节点标记为当前节点，执行操作1：

1.当前单词为空，返回当前节点的频数，即为该单词出现的次数。否则，取出当前单词的首字符，标记为X，遍历当前节点的子节点，如果X存在于子节点N中，将N标记为当前节点，将剩余字符串标记为当前单词，重复操作1；如果X不存在于子节点中，返回0。

python 实现

# -*- coding: utf-8 -*-
class Trie_tree(object):
    """Character trie that counts how many times each word was inserted.

    Every node is a four-element list
    ``[father, child, keep_char, word_count]``: the parent node (``None``
    for the root), the list of child nodes, the character stored in the node
    (``None`` for the root) and the number of insertions of the word spelled
    by the path from the root to this node.

    Traversals are iterative, so word length is not limited by the
    interpreter's recursion depth and no per-character string copies are made.
    """

    def __init__(self):
        # node = [father, child, keep_char, word_count]
        self._root = [None, [], None, 0]
        self.tmp = []

    @staticmethod
    def _get_child(node, char):
        """Return the child of *node* that stores *char*, or None if absent."""
        for child in node[1]:
            if child[2] == char:
                return child
        return None

    def insert(self, word):
        """Record one more occurrence of *word*, creating nodes as needed.

        Inserting the empty string increments the counter of the root.
        """
        node = self._root
        for char in word:
            child = self._get_child(node, char)
            if child is None:
                child = [node, [], char, 0]
                node[1].append(child)  # add in child
            node = child
        # word_count++
        node[3] += 1

    def find(self, word):
        """Return how many times *word* has been inserted (0 if never)."""
        node = self._root
        for char in word:
            node = self._get_child(node, char)
            if node is None:
                return 0
        return node[3]
if __name__ == '__main__':
    # Demo: count the words of article.txt in the trie, then look a few up.
    import re
    demo_trie = Trie_tree()
    print("---------Build the trie----------")
    with open("article.txt", "r") as content:
        for line in content:
            for word in re.split(r"[\s\W]+", line):
                # re.split yields "" when the line starts or ends with a
                # separator; skip that token instead of abandoning the rest
                # of the line. Purely numeric tokens are not counted either.
                if not word or word.isdigit():
                    continue
                demo_trie.insert(word.lower())
    print("----------Count word test---------")
    print("all      :", demo_trie.find("all"))
    print("is       :", demo_trie.find("is"))
    print("good     :", demo_trie.find("good"))
    print("packages :", demo_trie.find("packages"))

---------Build the trie----------
----------Count word test---------
all      : 20
is       : 10
good     : 1
packages : 19

C语言实现

来自维基百科

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Number of child slots per node: one for every possible byte value. */
#define TREE_WIDTH 256
/* Longest word (in bytes) that the traversal buffer can hold. */
#define WORDLENMAX 128
/* One trie node; the byte that leads to a node is its index in the parent's next[]. */
struct trie_node_st {
        int count;      /* number of inserted words that end exactly at this node */
        int pass;       /* number of insertions that walked through this node to a deeper one */
        struct trie_node_st *next[TREE_WIDTH];  /* children, indexed by byte value; NULL when absent */
};
/* Statically allocated root of the trie: zero counters, no children. */
static struct trie_node_st root = {0, 0, {NULL}};
/* Delimiter bytes used to split an input line into words. */
static const char *spaces = " \t\n/.\"\'()";
/*
 * Recursively release a trie node and every node below it.
 *
 * rt must be NULL or a node obtained from the heap allocator, because the
 * node itself is passed to free(). A NULL argument is a no-op, mirroring
 * free(NULL).
 */
void myfree(struct trie_node_st * rt)
{
	if (rt == NULL) {
		return;
	}
	for (int i = 0; i < TREE_WIDTH; i++) {
		if (rt->next[i] != NULL) {
			myfree(rt->next[i]);
		}
	}
	free(rt);
}
/*
 * Insert one occurrence of word into the trie rooted at `root`.
 *
 * Every node on the path that is followed to a deeper node gets its `pass`
 * counter incremented; the node where the word ends gets its `count`
 * incremented. NULL and empty words are ignored.
 *
 * Returns 0 on success (or when the word is ignored) and -1 when a node
 * could not be allocated; in that case the word is not counted, although
 * `pass` counters of the nodes already visited have been incremented.
 */
static int
insert (const char *word)
{
        struct trie_node_st *curr, *newnode;
        const unsigned char *p;

        if (word == NULL || word[0] == '\0') {
                return 0;
        }
        curr = &root;
        /*
         * Index the child table with unsigned char: plain char may be signed,
         * and bytes >= 0x80 would then produce a negative array index.
         */
        for (p = (const unsigned char *)word; *p != '\0'; ++p) {
                curr->pass++;//count
                if (curr->next[*p] == NULL) {
                        newnode = calloc (1, sizeof(*newnode));
                        if (newnode == NULL) {
                                return -1;
                        }
                        curr->next[*p] = newnode;
                }
                curr = curr->next[*p];
        }
        curr->count ++;
        return 0;
}
/* Write one result line to standard output: the word, a tab, then its number. */
static void
printword (const char *str, int n)
{
        fprintf (stdout, "%s\t%d\n", str, n);
}
/*
 * Depth-first walk of the trie below rootp, in ascending byte order.
 *
 * For every node whose `count` is non-zero, the word spelled by the path to
 * that node is printed together with count + pass. The path is kept in a
 * static buffer shared by all recursion levels, so the function is not
 * reentrant. Words longer than WORDLENMAX bytes do not fit in that buffer
 * and are not printed.
 *
 * Always returns 0.
 */
static int
do_travel (struct trie_node_st *rootp)
{
        static char worddump[WORDLENMAX+1];
        static int pos=0;
        int i;
        if (rootp == NULL) {
                return 0;
        }
        if (rootp->count) {
                worddump[pos]='\0';
                printword (worddump, rootp->count+rootp->pass);
        }
        if (pos >= WORDLENMAX) {
                /* Buffer is full: going deeper would write past its end. */
                return 0;
        }
        for (i=0;i<TREE_WIDTH;++i) {
                if (rootp->next[i] == NULL) {
                        continue;       /* nothing stored under this byte */
                }
                worddump[pos++]=(char)i;
                do_travel (rootp->next[i]);
                pos--;
        }
        return 0;
}
/*
 * Read text from standard input, split every line into words on the
 * delimiter set `spaces`, insert the words into the trie, print the
 * collected statistics and release all heap memory.
 */
int
main (void)
{
        char *linebuf = NULL;   /* buffer owned and grown by getline */
        size_t bufsize = 0;
        char *cursor, *word;
        int i;

        /*
         * Compare getline's ssize_t result directly instead of narrowing it
         * to int first.
         */
        while (getline (&linebuf, &bufsize, stdin) != -1) {
                cursor = linebuf;
                /* strsep returns empty tokens between adjacent delimiters */
                while ((word = strsep (&cursor, spaces)) != NULL) {
                        if (word[0] != '\0') {
                                insert (word);
                        }
                }
        }
        do_travel (&root);
        free (linebuf);
        /* root is static, so only its heap-allocated children are freed */
        for (i = 0; i < TREE_WIDTH; i++) {
                if (root.next[i] != NULL) {
                        myfree (root.next[i]);
                        root.next[i] = NULL;
                }
        }
        return 0;
}