Recursivamente regresando al inicio Aleph-w: Algoritmos y estructuras de datos

Huffman.H

00001 
00002 /*
00003   This file is part of Aleph-w system
00004 
00005   Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
00006   Leandro Rabindranath León
00007   All rights reserved.
00008 
00009   Redistribution and use in source and binary forms, with or without
00010   modification, are permitted provided that the following conditions are
00011   met: 
00012   1. Redistributions of source code must retain the above copyright 
00013      notice, this list of conditions and the following disclaimer.
00014   2. Redistributions in binary form must reproduce the above copyright 
00015      notice, this list of conditions and the following disclaimer in the 
00016      documentation and/or other materials provided with the distribution.
00017   3. All advertising materials mentioning features or use of this software
00018      must display the following acknowledgement:
00019 
00020      Copyright (c) 2002-2011 Leandro Rabindranath León. See details of 
00021      licence.     
00022 
00023      This product includes software developed by the Hewlett-Packard
00024      Company, Free Software Foundation and Silicon Graphics Computer
00025      Systems, Inc. 
00026   4. Neither the name of the <organization> nor the
00027      names of its contributors may be used to endorse or promote products
00028      derived from this software without specific prior written permission.
00029 
00030 THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ''AS IS'' AND ANY
00031 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00032 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00033 DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
00034 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00035 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00036 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00037 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00038 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00039 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00040 
00041   Aleph-w is distributed in the hope that it will be useful, but WITHOUT
00042   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
00043   or FITNESS FOR A PARTICULAR PURPOSE.
00044 
00045   I request users of this software to return to 
00046 
00047   Leandro Rabindranath Leon
00048   CEMISID 
00049   Ed La Hechicera 
00050   3er piso, ala sur
00051   Facultad de Ingenieria 
00052   Universidad de Los Andes 
00053   Merida - REPÚBLICA BOLIVARIANA DE VENEZUELA    or
00054 
00055   lrleon@ula.ve
00056   leandro.r.leon@gmail.com
00057 
00058   any improvements or extensions that they make and grant me the rights
00059   to redistribute these changes.  
00060 */
00061 
00062 # ifndef HUFFMAN_H
00063 # define HUFFMAN_H
00064 
00065 # include <memory>
00066 # include <autosprintf.h>
00067 # include <tpl_binNodeUtils.H>
00068 # include <tpl_treap.H>
00069 # include <tpl_binHeap.H>
00070 # include <tpl_dynMapTree.H>
00071 # include <bitArray.H>
00072 
00073 namespace Aleph {
00074 
00075 class Huffman_Node;
00076 typedef DynMapTree<Treap_Vtl, string, Huffman_Node *> Symbol_Map;
00077 typedef BinNode< Aleph::pair<string, size_t> > Freq_Node;
00078 struct Huffman_Node : public BinHeap<size_t>::Node
00079 {
00080   BinNode<string> * bin_node; 
00081 
00082   Freq_Node * freq_node;
00083 
00084   Huffman_Node() : BinHeap<size_t>::Node(0), bin_node(NULL), freq_node(NULL) 
00085   {
00086     /* empty */ 
00087   }
00088 
00089   Huffman_Node(BinNode<string> * node) 
00090     : BinHeap<size_t>::Node(0), bin_node(node), freq_node(NULL)
00091   {
00092     /* empty */
00093   }
00094 
00095   ~Huffman_Node() { /* No debe liberarse memoria de bin_node */ }
00096 
00097 };
00098 typedef BinHeap<size_t> Huffman_Heap;
00099 static inline const size_t & get_freq(Huffman_Node * huffman_node)
00100 
00101 {
00102   return huffman_node->get_key();
00103 }
00104 
00105 
00106 static inline void increase_freq(Huffman_Node * huffman_node)
00107 
00108 {
00109   huffman_node->get_key()++;
00110 }
00111 
00112 
00113 static inline void set_freq(Huffman_Node * huffman_node, const size_t & freq)
00114 
00115 {
00116   huffman_node->get_key() = freq;
00117 }
00118 
00119 typedef DynMapTree<Treap_Vtl, string, BitArray> Code_Map;
00120 static inline bool is_leaf(BinNode<string> * p)
00121 {
00122   return LLINK(p) == NULL and RLINK(p) == NULL;
00123 }
00129 class Huffman_Encoder_Engine
00130 {
00131   BinNode<string> * root; 
00132   Huffman_Heap heap;
00133   Symbol_Map   symbol_map; 
00134   Code_Map code_map;
00135 
00136   Freq_Node * freq_root;
00137 
00138   string end_symbol;
00139   size_t text_len;
00140   void build_prefix_encoding(BinNode<string> * p, BitArray & array)
00141   {
00142     if (is_leaf(p))
00143       {
00144         const string & str = p->get_key();
00145         code_map.insert(str, BitArray(array));
00146         return;
00147       }
00148     array.push(0); build_prefix_encoding(LLINK(p), array); array.pop();
00149     array.push(1); build_prefix_encoding(RLINK(p), array); array.pop();
00150   }
00151   void build_encoding_map()
00152   {
00153 
00154     I(symbol_map.size() > 0);
00155 
00156     if (root == NULL)
00157       throw domain_error("Huffman encoding tree has not been generated");
00158 
00159     BitArray array(0);
00160     symbol_map.empty();
00161     build_prefix_encoding(root, array);
00162   }
00163   bool test_end(const string & str) const
00164   {
00165     if (end_symbol == "NO-END") 
00166       return false;
00167     return end_symbol == str;
00168   }
00169   void update_freq(const string & str)
00170   {
00171 
00172     if (root != NULL)
00173       throw domain_error("Huffman encoding tree has already been generated");
00174 
00175     if (test_end(str))
00176       throw domain_error("End symbol has already been inserted");
00177 
00178     Huffman_Node * huffman_node      = NULL;
00179     Huffman_Node ** huffman_node_ptr = symbol_map.test(str);
00180     if (huffman_node_ptr == NULL) // ¿símbolo definido previamente?
00181       { // No ==> crear una entrada en symbol_map e insertarlo en heap
00182         unique_ptr<BinNode<string> > bin_node_auto(new BinNode<string>(str));
00183         huffman_node = static_cast<Huffman_Node*>
00184           (heap.insert(new Huffman_Node(bin_node_auto.get())));
00185         symbol_map.insert(str, huffman_node);
00186         bin_node_auto.release();
00187       }
00188     else
00189       huffman_node = *huffman_node_ptr; // ya definido, recuperarlo
00190 
00191     increase_freq(huffman_node); 
00192     heap.update(huffman_node);
00193   }
00194   static void append_code(BitArray & bit_stream, const BitArray & symbol_code)
00195   {
00196     const size_t symbol_code_size = symbol_code.size();
00197     for (int i = 0; i < symbol_code_size; i++) 
00198       bit_stream.push(symbol_code[i]); 
00199   }
00200   public:
00202   Huffman_Encoder_Engine() 
00203     : root(NULL), freq_root(NULL), end_symbol("NO-END"), text_len(0)
00204   {
00205     // empty
00206   }
00207 
00208   private:
00209 
00210   struct Get_Key
00211   {
00212     string operator () (BinNode<string> * p) const
00213     {
00214       return is_leaf(p) ? p->get_key() : "";
00215     }
00216   };
00217 
00218   struct Load_Key
00219   {
00220     void operator () (BinNode<string> * p, ifstream & input) const
00221     {
00222       if (is_leaf(p))
00223         input >> p->get_key();
00224     }
00225   };
00226 
00227   public:
00228 
00243   void save_tree(ofstream & output) 
00244   {
00245     if (root == NULL)
00246       throw std::domain_error("Huffman tree has not been generated");
00247 
00248     Aleph::save_tree<BinNode<string>, Get_Key> (root, output);
00249   }
00250 
00291   void save_tree_in_array_of_chars(const string & array_name, ofstream & output)
00292   {
00293     if (root == NULL)
00294       throw std::domain_error("Huffman tree has not been generated");
00295 
00296     Aleph::save_tree_in_array_of_chars<BinNode<string>, Get_Key> 
00297       (root, array_name, output);
00298   }
00299 
00303   void save_encoded_text(ofstream & output, BitArray & text)
00304   {
00305     
00306   }
00307 
00311   void save_encoded_text_in_array_of_chars(ofstream & output, BitArray & text)
00312   {
00313     
00314   }
00316   BinNode<string> *& get_root()  
00317   {
00318     if (root == NULL)
00319       throw std::domain_error("Huffman tree has not been generated");
00320 
00321     return root; 
00322   }
00339   BinNode<string> * generate_huffman_tree( 
00340 
00341   const bool & with_freqs = false
00342 
00343                                           ) 
00344   {
00345     while (heap.size() > 1) // hasta que quede solo un nodo
00346       {
00347         Huffman_Node * l_huffman_node = (Huffman_Node*) heap.getMin(); // izq
00348         Huffman_Node * r_huffman_node = (Huffman_Node*) heap.getMin(); // der
00349         BinNode <string> * bin_node = new BinNode <string>;
00350         Huffman_Node * huffman_node = new Huffman_Node (bin_node);
00351         LLINK(bin_node) = l_huffman_node->bin_node;
00352         RLINK(bin_node) = r_huffman_node->bin_node;
00353         const size_t new_freq = get_freq(l_huffman_node) + get_freq(r_huffman_node);
00354         Aleph::set_freq(huffman_node, new_freq);
00355 
00356         if (with_freqs)
00357           {
00358             Freq_Node *& l_freq_node = l_huffman_node->freq_node;
00359             if (l_freq_node == NULL)
00360               {
00361                 l_freq_node                    = new Freq_Node;
00362                 l_freq_node->get_key().first   = 
00363                   l_huffman_node->bin_node->get_key();
00364                 l_freq_node->get_key().second = l_huffman_node->get_key();
00365               }
00366 
00367             Freq_Node *& r_freq_node = r_huffman_node->freq_node;
00368             if (r_freq_node == NULL)
00369               {
00370                 r_freq_node                    = new Freq_Node;
00371                 r_freq_node->get_key().first   = 
00372                   r_huffman_node->bin_node->get_key();
00373                 r_freq_node->get_key().second = r_huffman_node->get_key();
00374               }
00375 
00376             const string str = gnu::autosprintf ("%d", new_freq);
00377             Freq_Node *& freq_node      = huffman_node->freq_node;
00378             freq_node                   = new Freq_Node;
00379             freq_node->get_key().first  = str;
00380             freq_node->get_key().second = huffman_node->get_key();
00381             LLINK(freq_node)            = l_freq_node;
00382             RLINK(freq_node)            = r_freq_node;
00383           }
00384 
00385         delete l_huffman_node;
00386         delete r_huffman_node;
00387         heap.insert(huffman_node);
00388       } // nodo que queda en heap es la raíz del árbol de prefijos  
00389 
00390     Huffman_Node * huffman_root = (Huffman_Node *) heap.getMin();
00391     root = huffman_root->bin_node;
00392 
00393     if (with_freqs)
00394       freq_root = huffman_root->freq_node;
00395 
00396     delete huffman_root;
00397     build_encoding_map(); // construir mapeo de códigos
00398 
00399     return root;
00400   }
00401 
00402 
00416   void load_tree(ifstream & input)
00417   {
00418     if (root != BinNode<string>::NullPtr)
00419       destroyRec(root);
00420     
00421     root = Aleph::load_tree<BinNode<string>, Load_Key> (input);
00422   }
00423 
00424 
00426   Freq_Node *& get_freq_root() 
00427   { 
00428     if (freq_root == NULL)
00429       throw std::domain_error("Huffman tree has not been generated");
00430 
00431     return freq_root; 
00432   }
00442   void set_freq(const string & str, const size_t & freq)
00443 
00444   {
00445     if (root != NULL)
00446       throw domain_error("Huffman encoding tree has already been generated");
00447     if (test_end(str))
00448       throw domain_error("End symbol has already been inserted");
00449        
00450         // Buscar símbolo str
00451     Huffman_Node ** huffman_node_ptr = symbol_map.test(str);
00452 
00453     if (huffman_node_ptr != NULL) // ¿ya fue definido?
00454       throw std::domain_error // Sí ==> esto es un error!
00455         (gnu::autosprintf("Frequency for symbol %s has already set", str.c_str()));
00456    
00457     unique_ptr<BinNode<string> > bin_node_auto(new BinNode<string>(str));
00458     Huffman_Node * huffman_node = new Huffman_Node(bin_node_auto.get());
00459     Aleph::set_freq(huffman_node, freq); 
00460     heap.insert(huffman_node);
00461     symbol_map.insert(str, huffman_node);
00462     bin_node_auto.release();
00463   }
00464 
00465   private:
00466 
00467     static const size_t Max_Token_Size = 256;
00468 
00469   public:
00470 
00483   void read_input(char * input
00484 
00485   , const bool & with_freqs = false
00486 
00487                   )
00488 
00489   {
00490     if (root != NULL)
00491       throw domain_error("Huffman encoding tree has already been generated");
00492 
00493     char * curr_stream = input;
00494     char curr_token[Max_Token_Size];
00495     curr_token[1] = '\0';
00496 
00497     while (*curr_stream != '\0')
00498       {
00499         curr_token[0] = *curr_stream++; 
00500         update_freq(curr_token);
00501         text_len++;
00502       }
00503 
00504     set_end_of_stream("");
00505     generate_huffman_tree(with_freqs);
00506   }
00507 
00520   void read_input(ifstream & input, const bool & with_freqs = false)
00521   {
00522     if (root != NULL)
00523       throw domain_error("Huffman encoding tree has already been generated");
00524 
00525     char curr_token[2];
00526     curr_token[0] = curr_token[1] = '\0';
00527 
00528     while (not input.eof())
00529       {
00530         input.read(curr_token, 1); 
00531         update_freq(curr_token);
00532         text_len++;
00533       }
00534 
00535     set_end_of_stream("");
00536     generate_huffman_tree(with_freqs);
00537   }
00538 
00540   void set_end_of_stream(const string & str)
00541   {
00542     if (test_end(str))
00543       throw domain_error("End symbol has already been inserted");
00544 
00545     if (root != NULL)
00546       throw domain_error("Huffman encoding tree has already been generated");
00547 
00548     unique_ptr<BinNode<string> > bin_node_auto ( new BinNode<string> (str) );
00549 
00550     Huffman_Node * huffman_node =  static_cast<Huffman_Node*>
00551       (heap.insert(new Huffman_Node(bin_node_auto.get())));
00552     symbol_map.insert(str, huffman_node); 
00553     bin_node_auto.release();
00554     end_symbol = str;
00555   }
00569   size_t encode(char * input, BitArray & bit_stream)
00570   {
00571 
00572     if (root == NULL)
00573       throw std::domain_error("Huffman tree has not been generated");
00574 
00575     char * curr_stream = input;
00576     char curr_token[Max_Token_Size];
00577     curr_token[1] = '\0';
00578     while (*curr_stream != '\0')
00579       {
00580         curr_token[0] = *curr_stream++; 
00581         append_code(bit_stream, code_map[curr_token]);
00582       }
00583     append_code(bit_stream, code_map[""]);
00584 
00585     return bit_stream.size();
00586   }
00587 
00601   size_t encode(ifstream & input, BitArray & bit_stream)
00602   {
00603     if (root == NULL)
00604       throw std::domain_error("Huffman tree has not been generated");
00605     char curr_token[2];
00606     curr_token[0] = curr_token[1] = '\0'; 
00607     while (not input.eof())
00608       {
00609         input.read(curr_token, 1); 
00610         append_code(bit_stream, code_map[curr_token]);
00611       }
00612     append_code(bit_stream, code_map[""]);
00613     return bit_stream.size();
00614   }
00615 
00616 };
00622 class Huffman_Decoder_Engine
00623 {
00624   BinNode<string> * root; 
00625   string end_symbol;
00626   public:
00627 
00636   Huffman_Decoder_Engine(BinNode<string> * p, const string & end)
00637     : root(p), end_symbol(end)
00638   {
00639     // empty
00640   }
00642   BinNode<string> *& get_root()  
00643   {
00644     if (root == NULL)
00645       throw std::domain_error("Huffman tree has not been generated");
00646 
00647     return root; 
00648   }
00660   void decode(BitArray & bit_stream, ostream & output)
00661   {
00662     const size_t & bit_stream_len = bit_stream.size();
00663     BinNode<string> * p = root;
00664     for (int i = 0; i < bit_stream_len; ++i)
00665       {
00666         if (bit_stream.read_bit(i) == 0)
00667           p = LLINK(p);
00668         else
00669           p = RLINK(p);
00670 
00671 
00672        if (p == NULL)
00673          throw std::domain_error("Invalid bits sequence");
00674 
00675        if (is_leaf(p)) // ¿es hoja?
00676          {     // sí ==> escribir símbolo y reiniciar a raíz
00677            const string & symbol = p->get_key();
00678            if (symbol == end_symbol) // ¿se alcanza final?
00679              break;
00680 
00681            output << symbol;
00682            p = root; // reiniciar a raíz, pues se leerá un nuevo código
00683          }
00684       }
00685   }
00686 };
00687 
00688 } // end namespace Aleph
00689 
00690 # endif // HUFFMAN_H

Leandro Rabindranath León