萌新笔记——C++里创建 Trie字典树（中文词典）（三）（联想）

　　萌新做词典第三篇，做得不好，还请指正，谢谢大佬！

　　今天把词典的联想做好了，也是比较low的，还改了之前的查询、遍历等代码。 Orz

　　一样地先放上运行结果：

 test1
 ID : 2    char : 件    word : 编程软件
 ID : 3    char : 习    word : 编程学习
 ID : 4    char : 站    word : 编程学习网站
 ID : 1    char : 门    word : 编程入门

 test2
 ID : 5    char : 练    word : 编程训练
 ID : 1    char : 门    word : 编程入门
 ID : 3    char : 习    word : 编程学习
 ID : 4    char : 站    word : 编程学习网站
 ID : 2    char : 件    word : 编程软件
 find ID : 3    word : 编程学习

 associate "编程" :
 find!
 训练
 入门
 学习
 学习网站
 软件

　　测试用的test.cc

 // test.cc - small driver for ccx::Dictionary.
 //   test1: insert four words, list them, export the dictionary.
 //   test2: import the exported dictionary, add one more word, list everything,
 //          then exercise exact lookup (search) and prefix completion (associate).
 #include "Dictionary.h"
 #include <iostream>
 #include <string>
 #include <vector>
 using std::cout;
 using std::endl;
 using std::string;
 using std::vector;

 // Rewinds the dictionary's built-in cursor and prints the id, the last
 // character and the full text of every word the cursor visits.
 static void printAllWords(ccx::Dictionary & words)
 {
     words.resetIt();
     while(!words.isEnd())
     {
         cout << "ID : " << words.getCurWordId()
             << "\tchar : " << words.getCurChar()
             << "\tword : " << words.getCurWord() << endl;
         words.next();
     }
 }

 // Builds a dictionary from four words, prints it and exports it.
 // Always returns 0.
 int test1()
 {
     ccx::Dictionary words;
     string word1 = "编程入门";
     string word2 = "编程软件";
     string word3 = "编程学习";
     string word4 = "编程学习网站";

     words.push(word1);
     words.push(word2);
     words.push(word3);
     words.push(word4);

     printAllWords(words);

     words.leading_out();
     return 0; // the function is declared int, so a bare `return;` is ill-formed
 }

 // Imports the dictionary, inserts one extra word, prints everything, then
 // looks up one word by exact match and lists the completions of a prefix.
 // Always returns 0.
 int test2()
 {
     ccx::Dictionary words;
     words.leading_in();

     string word("编程训练");
     words.push(word);
     printAllWords(words);

     string tmp = "编程学习";
     int id = words.search(tmp);
     if(-1 == id) // the test treats -1 as the "not found" result of search()
     {
         cout << "no such word like \"" << tmp << "\"" << endl;
     }else{
         cout << "find ID : " << id
             << "\tword : " << tmp << endl;
     }

     cout << endl;
     cout << "associate \"编程\" : " << endl;

     vector<string> data;
     string temp = "编程";

     if(words.associate(temp, data))
     {
         cout << "find!" << endl;
         for(auto & elem : data)
         {
             cout << elem << endl;
         }
     }else{
         cout << "can't find" << endl;
     }

     return 0;
 }

 int main()
 {
     cout << "test1" << endl;
     test1();
     cout << endl;
     cout << "test2" << endl;
     test2();
     cout << endl;
     return 0;
 }

　　test1不变，test2 在导入后再插入一个词“编程训练”，发现ID是正常的。

　　然后在test2最后调用联想函数，传入“编程”，能够正常传出所有的字符串。

　　在做这个的时候，一开始想的很简单，就是拿传入的词去树中查找，找到最后一个字对应的节点，然后以那个节点为根进行遍历。然后就开开心心地去写了，结果写一部分就要对之前的代码进行更改，于是，这个接口越来越“肥”了：

Dictionary.h

 // Dictionary.h - trie ("dictionary tree") keyed by UTF-8 characters.
 // Guard renamed: identifiers containing a double underscore are reserved.
 #ifndef CCX_DICTIONARY_H_
 #define CCX_DICTIONARY_H_

 #include "DictionaryData.h"
 #include "DictionaryConf.h"

 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <list>

 namespace ccx{

 using std::shared_ptr;
 using std::string;
 using std::unordered_map;
 using std::vector;
 using std::list;

 // Stores words in a trie: each node maps one character (a UTF-8 byte
 // sequence held in a string) to a child node, and a node with a non-zero
 // _wordId marks the end of a stored word.
 // NOTE(review): pDictElem / DictElem come from DictionaryData.h, which is not
 // visible here; pDictElem is presumably shared_ptr<DictElem> - confirm.
 class Dictionary
 {
     typedef unordered_map<string, pDictElem>::iterator WordIt;
     public:
         Dictionary();
         // Inserts one word; its id is drawn from the counter kept in the root node.
         void push(const string & word);
         // Inserts every word of the vector, in order.
         void push(vector<string> & words);
         // Exact lookup of a word; returns its id (the test driver treats -1 as "not found").
         int search(const string & word);
         // Appends to `data` the continuations of every stored word that starts with `word`.
         bool associate(const string & word, vector<string> & data);
     private:
         void AddWord(const string & word, int wordId);
         void splitWord(const string & word, vector<string> & characters); // split a word into characters
         // Walks `characters` from the trie root; sets `root` to the last node
         // reached and returns how many characters were matched.
         int search(vector<string> & characters, pDictElem & root);
         pDictElem _dictionary; // trie root
         DictionaryConf _conf;

     // traversal
     public:
         string getCurChar();
         string getCurWord();
         int getCurWordId();
         bool isEnd();
         void resetIt();
         void next();
     private:
         void resetPoint(pDictElem pcur);
         // The three overloads below take the traversal state as arguments so
         // that a caller can walk a subtree with its own local stacks.
         void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
         void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
         string getCurWord(list<WordIt> & stackWord);

         pDictElem _pcur;
         WordIt _itcur;
         // lists are used as stacks; this makes the traversal convenient
         list<WordIt> _stackWord;
         list<pDictElem> _stackDict;

     // import / export
     public:
         void leading_in();
         void leading_out();
 };

 }

 #endif

　　对几个原有的函数进行了重载，主要是为了能够复用一些代码，但是又想不到合适的新的函数名（英语不太好Orz）。

　　首先，是要能够查找并返回新的根结点，于是对search进行修改：

 // Walks the trie from the dictionary root, following `characters` one entry
 // at a time.
 //   characters : the word already split into characters
 //   root       : overwritten; on return it is the last node reached
 //                (the dictionary root when nothing matched)
 // Returns the number of leading entries of `characters` found in the trie,
 // so a result equal to characters.size() means the whole sequence exists.
 int Dictionary::search(vector<string> & characters, pDictElem & root)
 {
     root = _dictionary;
     int matched = 0;
     for(vector<string>::iterator it_char = characters.begin();
         it_char != characters.end(); ++it_char, ++matched)
     {
         WordIt it_word = root->_words.find(*it_char);
         if(it_word == root->_words.end())
         {
             break; // this character is not a child of the current node
         }
         root = it_word->second;
     }
     return matched;
 }

　　形参第一项是分解后的字集，第二项是一个智能指针，指向某个节点。这里返回值改为了字集的第几项，有两个目的：

　　1、插入函数中可以方便地知道下一个要插入的是哪个字符

　　2、联想函数中可以判断字集中的字是否都存在于词典中

　　3、好吧，我没想到其它好办法，而且当时是想到上面两点就这么做了，后来发现，插入部分的代码根本就不用改

　　然后是重载search：

 int Dictionary::search(const string & word)
 {
     pDictElem root = _dictionary;
     vector<string> temp;
     splitWord(word, temp);     int ret = search(temp, root);
     int size = temp.size();
     if(ret != size)
     {
         return -;
     }
     return root->_wordId;
 }

　　在这里对字进行分解，并定义一个临时的根结点，这样做的目的是为了保护private中的根结点，并且可以在多线程环境中互不干扰。

　　能够找到“新的根”后，就要对它进行遍历了。如果只有单一线程或进程来使用它，这里可以直接把resetPoint（原来的）修改一下，设置指定结点就可以了：

 // Re-seats the member cursor on `pcur`, drops whatever path a previous walk
 // left on the two member stacks, then advances to the first word below it.
 void Dictionary::resetPoint(pDictElem pcur)
 {
     _pcur = pcur;
     // clear() on an empty list does nothing, so no size check is needed.
     _stackWord.clear();
     _stackDict.clear();
     next();
 }

　　如果是这样，那前面也完全不用修改。由于这个词典最后是要应用到miniSearchEngin中，于是我对遍历部分的函数进行了修改：

 // Advances the member cursor to the next stored word.
 void Dictionary::next()
 {
     next(_pcur, _stackWord, _stackDict);
 }

 // Advances `pcur` node by node until it rests on a node that ends a word
 // (non-zero _wordId) or the walk is finished (`pcur` becomes null).
 // The traversal state lives entirely in the arguments.
 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 {
     while(pcur)
     {
         nextWord(pcur, stackWord, stackDict);
         if(!pcur || pcur->_wordId)
         {
             break;
         }
     }
 }

 // Moves `pcur` one step in depth-first order.
 //   stackDict : the nodes on the path from the starting node down to pcur's parent
 //   stackWord : for each of those nodes, the iterator of the child currently taken
 // `pcur` is set to null once every node below the starting node was visited.
 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 {
     if(!pcur)
     {
         return;
     }

     if(pcur->_words.size())
     {
         // Descend into the first child.
         stackDict.push_back(pcur);
         stackWord.push_back(pcur->_words.begin());
         pcur = stackWord.back()->second;
         return;
     }

     // Leaf. With empty stacks there is no sibling to move to: the starting
     // node itself has no children. back() on an empty list is undefined.
     if(stackWord.empty())
     {
         pcur = nullptr;
         return;
     }

     ++(stackWord.back()); // next sibling
     while(stackWord.back() == stackDict.back()->_words.end())
     {
         // Siblings exhausted on this level: climb one level.
         stackDict.pop_back();
         stackWord.pop_back();
         if(stackDict.empty())
         {
             // Climbed past the starting node: finished. Return before the
             // (now empty) stacks are touched again.
             pcur = nullptr;
             return;
         }
         ++(stackWord.back());
     }
     pcur = stackWord.back()->second;
 }

　　next部分，改为传入参数，这样就可以在associate里定义临时的栈和智能指针等，遍历的时候与其它工作并没有任何关系。

　　同样地，getWord也要做相同的更改：

 // Text of the word the member cursor currently rests on.
 string Dictionary::getCurWord()
 {
     return getCurWord(_stackWord);
 }

 // Concatenates, from the bottom of the stack to the top, the character of
 // every iterator in `stackWord`, i.e. the path recorded by the traversal.
 string Dictionary::getCurWord(list<WordIt> & stackWord)
 {
     string joined;
     for(const WordIt & step : stackWord)
     {
         joined += step->first;
     }
     return joined;
 }

　　当然了，对外提供的接口都是不要传参的，其它的只能在内部使用，于是放入了private区。

　　终于可以开始写联想了0.0

 bool Dictionary::associate(const string & word, vector<string> & data)
 {
     pDictElem root = _dictionary;
     vector<string> temp;
     splitWord(word, temp);     int ret = search(temp, root);
     int size = temp.size();
     if(ret != size)
     {
         return false;
     }     list<WordIt> stackWord;
     list<pDictElem> stackDict;
     next(root, stackWord, stackDict);
     while(root)
     {
         string temp = getCurWord(stackWord);
         data.push_back(temp);
         next(root, stackWord, stackDict);
     }     if(!data.size())
     {
         return false;
     }
     return true;
 }

　　返回bool类型，可以方便地判断是否联想成功，即以传入的词做为前缀，能否找到剩余部分（词典里有存）。于是乎，一个渣渣型号的词典就做好啦~~~

Dictionary.cc

 // Dictionary.cc - implementation of ccx::Dictionary, a trie whose edges are
 // UTF-8 characters. A node with a non-zero _wordId ends a stored word; the
 // root node's _wordId is used as the counter from which push() draws ids.
 #include "Dictionary.h"
 #include <cstdlib>
 #include <iostream>
 #include <fstream>
 #include <string>
 #include <json/json.h>

 namespace ccx{

 using std::endl;
 using std::cout;
 using std::pair;
 using std::ofstream;
 using std::ifstream;

 // Creates the root node with its id counter at 0 and parks the cursor on it.
 Dictionary::Dictionary()
 : _dictionary(new DictElem)
 , _conf()
 {
     _dictionary->_wordId = 0;
     _pcur = _dictionary;
 }

 // Splits `word` into characters and appends them to `characters`.
 // The length of a multi-byte character is the number of leading 1 bits of
 // its first byte (UTF-8); a byte with the top bit clear is one character.
 // A stray continuation byte (exactly one leading 1 bit) is emitted as a
 // one-byte character instead of swallowing the bytes that follow it.
 void Dictionary::splitWord(const string & word, vector<string> & characters)
 {
     const string::size_type total = word.size();
     string::size_type pos = 0;
     while(pos < total)
     {
         // unsigned: shifting a negative signed char is not portable
         unsigned char lead = static_cast<unsigned char>(word[pos]);
         string::size_type len = 1;
         if(lead & 0x80)
         {
             len = 0;
             while(lead & 0x80)
             {
                 ++len;
                 lead = static_cast<unsigned char>(lead << 1);
             }
         }
         // substr clamps the count, so a truncated last character is safe.
         characters.push_back(word.substr(pos, len));
         pos += len;
     }
 }

 // Inserts `word` into the trie, creating the missing nodes, and stores
 // `wordId` on its last node unless that node already carries an id.
 void Dictionary::AddWord(const string & word, int wordId)
 {
     vector<string> characters;
     splitWord(word, characters);
     if(characters.empty())
     {
         // An empty word ends on the root node; writing an id there would
         // overwrite the id counter.
         return;
     }

     pDictElem root = _dictionary;
     for(vector<string>::iterator it_char = characters.begin();
         it_char != characters.end(); ++it_char)
     {
         WordIt it_word = root->_words.find(*it_char);
         if(it_word == root->_words.end())
         {
             pDictElem child(new DictElem);
             child->_word = *it_char;
             child->_wordId = 0; // 0 = no word ends here (yet)
             root->_words.insert(pair<string, pDictElem>(*it_char, child));
             root = child;
         }else{
             root = it_word->second;
         }
     }
     if(!root->_wordId)
     {
         root->_wordId = wordId;
     }
 }

 // Inserts one word under the next id of the counter.
 void Dictionary::push(const string & word)
 {
     ++(_dictionary->_wordId);
     AddWord(word, _dictionary->_wordId);
 }

 // Inserts every word of `words`, in order.
 void Dictionary::push(vector<string> & words)
 {
     const vector<string>::size_type size = words.size();
     for(vector<string>::size_type i = 0; i < size; ++i)
     {
         push(words[i]);
     }
 }

 // Exact-match lookup.
 // Returns the id stored for `word`, or -1 when `word` is not a stored word.
 int Dictionary::search(const string & word)
 {
     vector<string> characters;
     splitWord(word, characters);
     if(characters.empty())
     {
         // The walk would stop on the root node, whose _wordId is the id
         // counter, not a word id.
         return -1;
     }

     pDictElem root = _dictionary;
     const int matched = search(characters, root);
     if(matched != static_cast<int>(characters.size()))
     {
         return -1;
     }
     // A node with id 0 is only a prefix of longer words.
     if(!root->_wordId)
     {
         return -1;
     }
     return root->_wordId;
 }

 // Walks the trie from the dictionary root along `characters`.
 // `root` is overwritten with the last node reached; the return value is the
 // number of leading entries of `characters` that were found.
 int Dictionary::search(vector<string> & characters, pDictElem & root)
 {
     root = _dictionary;
     int matched = 0;
     for(vector<string>::iterator it_char = characters.begin();
         it_char != characters.end(); ++it_char, ++matched)
     {
         WordIt it_word = root->_words.find(*it_char);
         if(it_word == root->_words.end())
         {
             break;
         }
         root = it_word->second;
     }
     return matched;
 }

 // Prefix completion: appends to `data` the remainder (the part after
 // `word`) of every stored word that starts with `word`.
 // Returns true when at least one remainder was appended by this call.
 bool Dictionary::associate(const string & word, vector<string> & data)
 {
     vector<string> characters;
     splitWord(word, characters);

     pDictElem root = _dictionary;
     const int matched = search(characters, root);
     if(matched != static_cast<int>(characters.size()))
     {
         return false;
     }

     // Compare against the size on entry: `data` may already hold elements.
     const vector<string>::size_type before = data.size();

     // Local stacks keep this walk independent of the member cursor.
     list<WordIt> stackWord;
     list<pDictElem> stackDict;
     next(root, stackWord, stackDict);
     while(root)
     {
         data.push_back(getCurWord(stackWord));
         next(root, stackWord, stackDict);
     }

     return data.size() != before;
 }

 // ---- traversal ----

 // Re-seats the member cursor on `pcur`, clears the member stacks and
 // advances to the first word below that node.
 void Dictionary::resetPoint(pDictElem pcur)
 {
     _pcur = pcur;
     _stackDict.clear();
     _stackWord.clear();
     next();
 }

 // Rewinds the member cursor to the first word of the whole dictionary.
 void Dictionary::resetIt()
 {
     resetPoint(_dictionary);
 }

 // Advances the member cursor to the next stored word.
 void Dictionary::next()
 {
     next(_pcur, _stackWord, _stackDict);
 }

 // Advances `pcur` node by node until it rests on a node that ends a word
 // (non-zero _wordId) or the walk is finished (`pcur` becomes null).
 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 {
     while(pcur)
     {
         nextWord(pcur, stackWord, stackDict);
         if(!pcur || pcur->_wordId)
         {
             break;
         }
     }
 }

 // Moves `pcur` one step in depth-first order.
 //   stackDict : the nodes on the path from the starting node to pcur's parent
 //   stackWord : for each of those nodes, the iterator of the child taken
 // `pcur` is set to null once every node below the starting node was visited.
 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 {
     if(!pcur)
     {
         return;
     }

     if(pcur->_words.size())
     {
         // Descend into the first child.
         stackDict.push_back(pcur);
         stackWord.push_back(pcur->_words.begin());
         pcur = stackWord.back()->second;
         return;
     }

     // Leaf with empty stacks: the starting node has no children at all.
     // back() on an empty list is undefined, so stop here.
     if(stackWord.empty())
     {
         pcur = nullptr;
         return;
     }

     ++(stackWord.back()); // next sibling
     while(stackWord.back() == stackDict.back()->_words.end())
     {
         // Siblings exhausted on this level: climb one level.
         stackDict.pop_back();
         stackWord.pop_back();
         if(stackDict.empty())
         {
             // Climbed past the starting node: finished.
             pcur = nullptr;
             return;
         }
         ++(stackWord.back());
     }
     pcur = stackWord.back()->second;
 }

 // Character stored on the node under the member cursor; empty once the
 // traversal has ended.
 string Dictionary::getCurChar()
 {
     if(!_pcur)
     {
         return string();
     }
     return _pcur->_word;
 }

 // Id stored on the node under the member cursor; 0 once the traversal has
 // ended.
 int Dictionary::getCurWordId()
 {
     if(!_pcur)
     {
         return 0;
     }
     return _pcur->_wordId;
 }

 // Text of the word the member cursor currently rests on.
 string Dictionary::getCurWord()
 {
     return getCurWord(_stackWord);
 }

 // Concatenates, bottom to top, the character of every iterator on the stack.
 string Dictionary::getCurWord(list<WordIt> & stackWord)
 {
     string joined;
     for(list<WordIt>::iterator it_word = stackWord.begin();
         it_word != stackWord.end(); ++it_word)
     {
         joined += (*it_word)->first;
     }
     return joined;
 }

 // True once the member cursor has run past the last word.
 bool Dictionary::isEnd()
 {
     return !_pcur;
 }

 // ---- import / export ----

 // Import: reads the JSON file named by the configuration and inserts every
 // {"Word", "WordId"} entry. A failure is reported on stdout; there is no
 // need to terminate the program.
 void Dictionary::leading_in()
 {
     // Keep the string alive: c_str() of a temporary would dangle.
     // NOTE(review): whether getDictionaryPath() returns by value is not
     // visible here; holding a copy is correct either way.
     const string path = _conf.getDictionaryPath();
     ifstream ifs;
     ifs.open(path.c_str());
     if(!ifs.good())
     {
         cout << "open Dictionary.json error(leading_in)" << endl;
         return;
     }

     Json::Value root;
     Json::Reader reader;
     if(!reader.parse(ifs, root, false))
     {
         cout << "json read Dictionary.json error" << endl;
         return;
     }

     const int size = root.size();
     for(int i = 0; i < size; ++i)
     {
         const string word = root[i]["Word"].asString();
         const int wordId = root[i]["WordId"].asInt();
         AddWord(word, wordId);
         ++(_dictionary->_wordId);
         // Never leave the counter below an imported id, otherwise a later
         // push() could hand out an id that is already taken.
         if(_dictionary->_wordId < wordId)
         {
             _dictionary->_wordId = wordId;
         }
     }
 }

 // Export: writes every stored word with its id as a JSON array to the file
 // named by the configuration, falling back to "Dictionary.tmp"; terminates
 // the program when neither file can be opened.
 // Uses the member cursor, so any traversal in progress is reset.
 void Dictionary::leading_out()
 {
     Json::Value root;
     Json::FastWriter writer;

     resetIt();
     while(!isEnd())
     {
         Json::Value elem;
         elem["Word"] = getCurWord();
         elem["WordId"] = getCurWordId();
         root.append(elem);
         next();
     }

     const string words = writer.write(root);

     // Keep the string alive: c_str() of a temporary would dangle.
     const string path = _conf.getDictionaryPath();
     ofstream ofs;
     ofs.open(path.c_str());
     if(!ofs.good())
     {
         cout << "open Dictionary.json error(leading_out)" << endl;
         ofs.clear(); // drop the failure state before trying the fallback
         ofs.open("Dictionary.tmp");
         if(!ofs.good())
         {
             std::exit(EXIT_FAILURE);
         }
     }

     ofs << words;
     ofs.close();
 }

 }

个人收藏笔记记录

开通VIP