Shadowrun: Awakened 29 September 2011 - Build 871
RangeCoder.hpp
Go to the documentation of this file.
00001 /*
00002     Copyright (c) 2009-2010 Christopher A. Taylor.  All rights reserved.
00003 
00004     Redistribution and use in source and binary forms, with or without
00005     modification, are permitted provided that the following conditions are met:
00006 
00007     * Redistributions of source code must retain the above copyright notice,
00008       this list of conditions and the following disclaimer.
00009     * Redistributions in binary form must reproduce the above copyright notice,
00010       this list of conditions and the following disclaimer in the documentation
00011       and/or other materials provided with the distribution.
00012     * Neither the name of LibCat nor the names of its contributors may be used
00013       to endorse or promote products derived from this software without
00014       specific prior written permission.
00015 
00016     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00017     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00018     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00019     ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
00020     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00021     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00022     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00023     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00024     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00025     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00026     POSSIBILITY OF SUCH DAMAGE.
00027 */
00028 
00029 #ifndef CAT_RANGE_CODER_HPP
00030 #define CAT_RANGE_CODER_HPP
00031 
00032 #include <cat/Platform.hpp>
00033 #include <ostream>
00034 #include <map>
00035 #include <string>
00036 
00037 namespace cat {
00038 
00039 
00040 /*
00041     TextStatsCollector
00042 
00043     Collects order-1 statistics of the text given one character at a time.
00044     Order-1 statistics include the likelihood of a character given the previous one.
00045 
00046     This is intended to be used on a large sample of text (of unlimited length)
00047     to come up with statistics that most text should follow.  When the resulting
00048     table is used with a range coder, compression should be reliably achieved,
00049     though a bit should be allocated for the case where the result of the coder
00050     would be longer than encoding with uniform ranges, like if someone enters "ZZZZ".
00051 
00052     The RangeEncoder class has a text compressor based on the output of this class.
00053 
00054     I opted for a static table because this is to be run on a network server
00055     where many clients are connected.  The memory needed for this type of
00056     compression does not increase with the number of clients.
00057 */
00058 class TextStatsCollector
00059 {
00060     u32 last, total, frequencies[256][256];
00061     u8 seen[256];
00062 
00063 public:
00064     TextStatsCollector();
00065 
00066 public:
00067     // Record a character occurance
00068     // 0 = end of line, so next character counts towards initial character frequency
00069     void Tally(u8 x);
00070 
00071 #if defined(CAT_PRAGMA_PACK)
00072 #pragma pack(push)
00073 #pragma pack(1)
00074 #endif
00075 
00076     struct TableFormat
00077     {
00078         // MurmurHash2 of remainder, with seed = 0
00079         u32 hash;
00080 
00081         // Total symbols in the table <= 256
00082         u16 total;
00083 
00084         // Fraction of a byte represented by total, scaled from [0, 2^15]
00085         u16 log2total;
00086 
00087         // ASCII character code -> Table index map
00088         u8 char2index[256];
00089 
00090         // Table index -> ASCII character code map
00091         u8 index2char[256];
00092 
00093         /*
00094             Start of frequency table
00095 
00096             The first 32 entries are used for reverse lookup (freq->symbol):
00097             GET_SYMBOL_LUT() will get this address:
00098             frequencies[0..15] = array of 16 bytes creating a lookup table (LUT) given
00099                                  the high 4 bits of the frequency, for the low range
00100             frequencies[16..31] = array of 16 bytes creating a lookup table (LUT) given
00101                                   the high 4 bits of the frequency, for the high range
00102 
00103             GET_SYMBOL_BASE() will get this address:
00104             frequencies[32] = cumulative frequency for (last=0, this=1) out of 2^16 trials
00105             frequencies[33] = cumulative frequency for (last=0, this=2) out of 2^16 trials
00106             frequencies[34] = ...
00107 
00108             Note: (0, 0) is implicitly zero, and (0, TOTAL) is implicitly 2^16
00109             So these tables don't include those implicit entries.
00110         */
00111         u16 frequencies[1];
00112     } CAT_PACKED;
00113 
00114 #if defined(CAT_PRAGMA_PACK)
00115 #pragma pack(pop)
00116 #endif
00117 
00118     // Returns code that creates a table in the above format
00119     bool GenerateMinimalStaticTable(const char *TableName, std::ostream &osout);
00120 
00121     // Check for errors in the in-memory version of the table that was generated
00122     static bool VerifyTableIntegrity(const TableFormat *table);
00123 };
00124 
00125 
00126 /*
00127     Range Encoder
00128 
00129     Encodes a single message one field at a time using the minimum
00130     number of bits, rounded up to the next highest byte.
00131 
00132     To insure that the message does not grow in size, provide limited
00133     space for the output buffer and check .Fail() at the end.  If it
00134     failed, this means it ran out of space during encoding.
00135 
00136     If encoding succeeded, check .Used() to determine the number of
00137     bytes used by the output buffer.
00138 */
00139 class RangeEncoder
00140 {
00141     u8 *output;
00142     int limit, remaining;
00143 
00144     u64 low, range;
00145 
00146     // Write out bytes as needed
00147     CAT_INLINE void Normalize();
00148 
00149     CAT_INLINE void GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u8 ch, u16 &symbol_low, u16 &symbol_range);
00150 
00151 public:
00152     // Ctors
00153     RangeEncoder(void *output_i, int limit_i);
00154     RangeEncoder(RangeEncoder &cp);
00155 
00156     // Overwrite one context with another.  Using context references instead
00157     // of working on the contexts directly is probably more efficient.
00158     RangeEncoder &operator=(RangeEncoder &cp);
00159 
00160     // State accessors
00161     bool Fail() { return output == 0; }
00162     int Used() { return limit - remaining; }
00163 
00164 public:
00165     // Encode the given text with the given statistics
00166     // NOTE: May be up to one byte longer than the original message
00167     void Text(const char *msg, const TextStatsCollector::TableFormat *stats);
00168 
00169     // Encode a biased bit given the frequency it is 0
00170     // frequency = average times out of 2^32 trials the bit will be 0
00171     void BiasedBit(u32 b, u32 frequency);
00172 
00173     // Encode a field that takes on [0, total) values with equal likelihood
00174     void Range(u32 symbol, u32 total);
00175 
00176     // Emit the final byte(s) needed to encode the symbols
00177     void Finish();
00178 };
00179 
00180 
00181 /*
00182     Range Decoder
00183 
00184     Interprets buffers produced by RangeEncoder
00185 */
00186 class RangeDecoder
00187 {
00188     const u8 *input;
00189     int remaining;
00190     u64 code, low, range;
00191 
00192     // Read in bytes as needed
00193     CAT_INLINE void Normalize();
00194 
00195     // Grab symbol low frequency and range from the table
00196     CAT_INLINE u8 GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u16 freq, u16 &symbol_low, u16 &symbol_range);
00197 
00198 public:
00199     // Initializing constructor
00200     RangeDecoder(const void *message, int bytes);
00201 
00202     int Remaining() { return remaining; }
00203 
00204 public:
00205     // Decode the given text with the given statistics
00206     int Text(char *msg, int buffer_size, const TextStatsCollector::TableFormat *stats);
00207 
00208     // Decode a biased bit given the frequency it is 0
00209     // frequency = average times out of 2^32 trials the bit will be 0
00210     u32 BiasedBit(u32 frequency);
00211 
00212     // Decode a field that takes on [0, total) values with equal likelihood
00213     u32 Range(u32 total);
00214 };
00215 
00216 
00217 } // namespace cat
00218 
00219 #endif // CAT_RANGE_CODER_HPP

Copyright © 2007-2010 by The Shadowrun: Awakened Team. This work is licensed under the GNU Lesser General Public License 3.

GNU Lesser General Public License 3 Sourceforge.net