00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062 # ifndef HUFFMAN_H
00063 # define HUFFMAN_H
00064
00065 # include <memory>
00066 # include <autosprintf.h>
00067 # include <tpl_binNodeUtils.H>
00068 # include <tpl_treap.H>
00069 # include <tpl_binHeap.H>
00070 # include <tpl_dynMapTree.H>
00071 # include <bitArray.H>
00072
00073 namespace Aleph {
00074
00075 class Huffman_Node;
00076 typedef DynMapTree<Treap_Vtl, string, Huffman_Node *> Symbol_Map;
00077 typedef BinNode< Aleph::pair<string, size_t> > Freq_Node;
00078 struct Huffman_Node : public BinHeap<size_t>::Node
00079 {
00080 BinNode<string> * bin_node;
00081
00082 Freq_Node * freq_node;
00083
00084 Huffman_Node() : BinHeap<size_t>::Node(0), bin_node(NULL), freq_node(NULL)
00085 {
00086
00087 }
00088
00089 Huffman_Node(BinNode<string> * node)
00090 : BinHeap<size_t>::Node(0), bin_node(node), freq_node(NULL)
00091 {
00092
00093 }
00094
00095 ~Huffman_Node() { }
00096
00097 };
00098 typedef BinHeap<size_t> Huffman_Heap;
00099 static inline const size_t & get_freq(Huffman_Node * huffman_node)
00100
00101 {
00102 return huffman_node->get_key();
00103 }
00104
00105
00106 static inline void increase_freq(Huffman_Node * huffman_node)
00107
00108 {
00109 huffman_node->get_key()++;
00110 }
00111
00112
00113 static inline void set_freq(Huffman_Node * huffman_node, const size_t & freq)
00114
00115 {
00116 huffman_node->get_key() = freq;
00117 }
00118
00119 typedef DynMapTree<Treap_Vtl, string, BitArray> Code_Map;
00120 static inline bool is_leaf(BinNode<string> * p)
00121 {
00122 return LLINK(p) == NULL and RLINK(p) == NULL;
00123 }
00129 class Huffman_Encoder_Engine
00130 {
00131 BinNode<string> * root;
00132 Huffman_Heap heap;
00133 Symbol_Map symbol_map;
00134 Code_Map code_map;
00135
00136 Freq_Node * freq_root;
00137
00138 string end_symbol;
00139 size_t text_len;
00140 void build_prefix_encoding(BinNode<string> * p, BitArray & array)
00141 {
00142 if (is_leaf(p))
00143 {
00144 const string & str = p->get_key();
00145 code_map.insert(str, BitArray(array));
00146 return;
00147 }
00148 array.push(0); build_prefix_encoding(LLINK(p), array); array.pop();
00149 array.push(1); build_prefix_encoding(RLINK(p), array); array.pop();
00150 }
00151 void build_encoding_map()
00152 {
00153
00154 I(symbol_map.size() > 0);
00155
00156 if (root == NULL)
00157 throw domain_error("Huffman encoding tree has not been generated");
00158
00159 BitArray array(0);
00160 symbol_map.empty();
00161 build_prefix_encoding(root, array);
00162 }
00163 bool test_end(const string & str) const
00164 {
00165 if (end_symbol == "NO-END")
00166 return false;
00167 return end_symbol == str;
00168 }
00169 void update_freq(const string & str)
00170 {
00171
00172 if (root != NULL)
00173 throw domain_error("Huffman encoding tree has already been generated");
00174
00175 if (test_end(str))
00176 throw domain_error("End symbol has already been inserted");
00177
00178 Huffman_Node * huffman_node = NULL;
00179 Huffman_Node ** huffman_node_ptr = symbol_map.test(str);
00180 if (huffman_node_ptr == NULL)
00181 {
00182 unique_ptr<BinNode<string> > bin_node_auto(new BinNode<string>(str));
00183 huffman_node = static_cast<Huffman_Node*>
00184 (heap.insert(new Huffman_Node(bin_node_auto.get())));
00185 symbol_map.insert(str, huffman_node);
00186 bin_node_auto.release();
00187 }
00188 else
00189 huffman_node = *huffman_node_ptr;
00190
00191 increase_freq(huffman_node);
00192 heap.update(huffman_node);
00193 }
00194 static void append_code(BitArray & bit_stream, const BitArray & symbol_code)
00195 {
00196 const size_t symbol_code_size = symbol_code.size();
00197 for (int i = 0; i < symbol_code_size; i++)
00198 bit_stream.push(symbol_code[i]);
00199 }
00200 public:
00202 Huffman_Encoder_Engine()
00203 : root(NULL), freq_root(NULL), end_symbol("NO-END"), text_len(0)
00204 {
00205
00206 }
00207
00208 private:
00209
00210 struct Get_Key
00211 {
00212 string operator () (BinNode<string> * p) const
00213 {
00214 return is_leaf(p) ? p->get_key() : "";
00215 }
00216 };
00217
00218 struct Load_Key
00219 {
00220 void operator () (BinNode<string> * p, ifstream & input) const
00221 {
00222 if (is_leaf(p))
00223 input >> p->get_key();
00224 }
00225 };
00226
00227 public:
00228
00243 void save_tree(ofstream & output)
00244 {
00245 if (root == NULL)
00246 throw std::domain_error("Huffman tree has not been generated");
00247
00248 Aleph::save_tree<BinNode<string>, Get_Key> (root, output);
00249 }
00250
00291 void save_tree_in_array_of_chars(const string & array_name, ofstream & output)
00292 {
00293 if (root == NULL)
00294 throw std::domain_error("Huffman tree has not been generated");
00295
00296 Aleph::save_tree_in_array_of_chars<BinNode<string>, Get_Key>
00297 (root, array_name, output);
00298 }
00299
00303 void save_encoded_text(ofstream & output, BitArray & text)
00304 {
00305
00306 }
00307
00311 void save_encoded_text_in_array_of_chars(ofstream & output, BitArray & text)
00312 {
00313
00314 }
00316 BinNode<string> *& get_root()
00317 {
00318 if (root == NULL)
00319 throw std::domain_error("Huffman tree has not been generated");
00320
00321 return root;
00322 }
00339 BinNode<string> * generate_huffman_tree(
00340
00341 const bool & with_freqs = false
00342
00343 )
00344 {
00345 while (heap.size() > 1)
00346 {
00347 Huffman_Node * l_huffman_node = (Huffman_Node*) heap.getMin();
00348 Huffman_Node * r_huffman_node = (Huffman_Node*) heap.getMin();
00349 BinNode <string> * bin_node = new BinNode <string>;
00350 Huffman_Node * huffman_node = new Huffman_Node (bin_node);
00351 LLINK(bin_node) = l_huffman_node->bin_node;
00352 RLINK(bin_node) = r_huffman_node->bin_node;
00353 const size_t new_freq = get_freq(l_huffman_node) + get_freq(r_huffman_node);
00354 Aleph::set_freq(huffman_node, new_freq);
00355
00356 if (with_freqs)
00357 {
00358 Freq_Node *& l_freq_node = l_huffman_node->freq_node;
00359 if (l_freq_node == NULL)
00360 {
00361 l_freq_node = new Freq_Node;
00362 l_freq_node->get_key().first =
00363 l_huffman_node->bin_node->get_key();
00364 l_freq_node->get_key().second = l_huffman_node->get_key();
00365 }
00366
00367 Freq_Node *& r_freq_node = r_huffman_node->freq_node;
00368 if (r_freq_node == NULL)
00369 {
00370 r_freq_node = new Freq_Node;
00371 r_freq_node->get_key().first =
00372 r_huffman_node->bin_node->get_key();
00373 r_freq_node->get_key().second = r_huffman_node->get_key();
00374 }
00375
00376 const string str = gnu::autosprintf ("%d", new_freq);
00377 Freq_Node *& freq_node = huffman_node->freq_node;
00378 freq_node = new Freq_Node;
00379 freq_node->get_key().first = str;
00380 freq_node->get_key().second = huffman_node->get_key();
00381 LLINK(freq_node) = l_freq_node;
00382 RLINK(freq_node) = r_freq_node;
00383 }
00384
00385 delete l_huffman_node;
00386 delete r_huffman_node;
00387 heap.insert(huffman_node);
00388 }
00389
00390 Huffman_Node * huffman_root = (Huffman_Node *) heap.getMin();
00391 root = huffman_root->bin_node;
00392
00393 if (with_freqs)
00394 freq_root = huffman_root->freq_node;
00395
00396 delete huffman_root;
00397 build_encoding_map();
00398
00399 return root;
00400 }
00401
00402
00416 void load_tree(ifstream & input)
00417 {
00418 if (root != BinNode<string>::NullPtr)
00419 destroyRec(root);
00420
00421 root = Aleph::load_tree<BinNode<string>, Load_Key> (input);
00422 }
00423
00424
00426 Freq_Node *& get_freq_root()
00427 {
00428 if (freq_root == NULL)
00429 throw std::domain_error("Huffman tree has not been generated");
00430
00431 return freq_root;
00432 }
00442 void set_freq(const string & str, const size_t & freq)
00443
00444 {
00445 if (root != NULL)
00446 throw domain_error("Huffman encoding tree has already been generated");
00447 if (test_end(str))
00448 throw domain_error("End symbol has already been inserted");
00449
00450
00451 Huffman_Node ** huffman_node_ptr = symbol_map.test(str);
00452
00453 if (huffman_node_ptr != NULL)
00454 throw std::domain_error
00455 (gnu::autosprintf("Frequency for symbol %s has already set", str.c_str()));
00456
00457 unique_ptr<BinNode<string> > bin_node_auto(new BinNode<string>(str));
00458 Huffman_Node * huffman_node = new Huffman_Node(bin_node_auto.get());
00459 Aleph::set_freq(huffman_node, freq);
00460 heap.insert(huffman_node);
00461 symbol_map.insert(str, huffman_node);
00462 bin_node_auto.release();
00463 }
00464
00465 private:
00466
00467 static const size_t Max_Token_Size = 256;
00468
00469 public:
00470
00483 void read_input(char * input
00484
00485 , const bool & with_freqs = false
00486
00487 )
00488
00489 {
00490 if (root != NULL)
00491 throw domain_error("Huffman encoding tree has already been generated");
00492
00493 char * curr_stream = input;
00494 char curr_token[Max_Token_Size];
00495 curr_token[1] = '\0';
00496
00497 while (*curr_stream != '\0')
00498 {
00499 curr_token[0] = *curr_stream++;
00500 update_freq(curr_token);
00501 text_len++;
00502 }
00503
00504 set_end_of_stream("");
00505 generate_huffman_tree(with_freqs);
00506 }
00507
00520 void read_input(ifstream & input, const bool & with_freqs = false)
00521 {
00522 if (root != NULL)
00523 throw domain_error("Huffman encoding tree has already been generated");
00524
00525 char curr_token[2];
00526 curr_token[0] = curr_token[1] = '\0';
00527
00528 while (not input.eof())
00529 {
00530 input.read(curr_token, 1);
00531 update_freq(curr_token);
00532 text_len++;
00533 }
00534
00535 set_end_of_stream("");
00536 generate_huffman_tree(with_freqs);
00537 }
00538
00540 void set_end_of_stream(const string & str)
00541 {
00542 if (test_end(str))
00543 throw domain_error("End symbol has already been inserted");
00544
00545 if (root != NULL)
00546 throw domain_error("Huffman encoding tree has already been generated");
00547
00548 unique_ptr<BinNode<string> > bin_node_auto ( new BinNode<string> (str) );
00549
00550 Huffman_Node * huffman_node = static_cast<Huffman_Node*>
00551 (heap.insert(new Huffman_Node(bin_node_auto.get())));
00552 symbol_map.insert(str, huffman_node);
00553 bin_node_auto.release();
00554 end_symbol = str;
00555 }
00569 size_t encode(char * input, BitArray & bit_stream)
00570 {
00571
00572 if (root == NULL)
00573 throw std::domain_error("Huffman tree has not been generated");
00574
00575 char * curr_stream = input;
00576 char curr_token[Max_Token_Size];
00577 curr_token[1] = '\0';
00578 while (*curr_stream != '\0')
00579 {
00580 curr_token[0] = *curr_stream++;
00581 append_code(bit_stream, code_map[curr_token]);
00582 }
00583 append_code(bit_stream, code_map[""]);
00584
00585 return bit_stream.size();
00586 }
00587
00601 size_t encode(ifstream & input, BitArray & bit_stream)
00602 {
00603 if (root == NULL)
00604 throw std::domain_error("Huffman tree has not been generated");
00605 char curr_token[2];
00606 curr_token[0] = curr_token[1] = '\0';
00607 while (not input.eof())
00608 {
00609 input.read(curr_token, 1);
00610 append_code(bit_stream, code_map[curr_token]);
00611 }
00612 append_code(bit_stream, code_map[""]);
00613 return bit_stream.size();
00614 }
00615
00616 };
00622 class Huffman_Decoder_Engine
00623 {
00624 BinNode<string> * root;
00625 string end_symbol;
00626 public:
00627
00636 Huffman_Decoder_Engine(BinNode<string> * p, const string & end)
00637 : root(p), end_symbol(end)
00638 {
00639
00640 }
00642 BinNode<string> *& get_root()
00643 {
00644 if (root == NULL)
00645 throw std::domain_error("Huffman tree has not been generated");
00646
00647 return root;
00648 }
00660 void decode(BitArray & bit_stream, ostream & output)
00661 {
00662 const size_t & bit_stream_len = bit_stream.size();
00663 BinNode<string> * p = root;
00664 for (int i = 0; i < bit_stream_len; ++i)
00665 {
00666 if (bit_stream.read_bit(i) == 0)
00667 p = LLINK(p);
00668 else
00669 p = RLINK(p);
00670
00671
00672 if (p == NULL)
00673 throw std::domain_error("Invalid bits sequence");
00674
00675 if (is_leaf(p))
00676 {
00677 const string & symbol = p->get_key();
00678 if (symbol == end_symbol)
00679 break;
00680
00681 output << symbol;
00682 p = root;
00683 }
00684 }
00685 }
00686 };
00687
00688 }
00689
00690 # endif // HUFFMAN_H