![]() |
Shadowrun: Awakened 29 September 2011 - Build 871
|
00001 /* 00002 Copyright (c) 2009-2010 Christopher A. Taylor. All rights reserved. 00003 00004 Redistribution and use in source and binary forms, with or without 00005 modification, are permitted provided that the following conditions are met: 00006 00007 * Redistributions of source code must retain the above copyright notice, 00008 this list of conditions and the following disclaimer. 00009 * Redistributions in binary form must reproduce the above copyright notice, 00010 this list of conditions and the following disclaimer in the documentation 00011 and/or other materials provided with the distribution. 00012 * Neither the name of LibCat nor the names of its contributors may be used 00013 to endorse or promote products derived from this software without 00014 specific prior written permission. 00015 00016 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00017 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00018 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00019 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 00020 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 00021 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 00022 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 00023 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 00024 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 00025 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00026 POSSIBILITY OF SUCH DAMAGE. 00027 */ 00028 00029 #ifndef CAT_RANGE_CODER_HPP 00030 #define CAT_RANGE_CODER_HPP 00031 00032 #include <cat/Platform.hpp> 00033 #include <ostream> 00034 #include <map> 00035 #include <string> 00036 00037 namespace cat { 00038 00039 00040 /* 00041 TextStatsCollector 00042 00043 Collects order-1 statistics of the text given one character at a time. 00044 Order-1 statistics include the likelihood of a character given the previous one. 00045 00046 This is intended to be used on a large sample of text (of unlimited length) 00047 to come up with statistics that most text should follow. When the resulting 00048 table is used with a range coder, compression should be reliably achieved, 00049 though a bit should be allocated for the case where the result of the coder 00050 would be longer than encoding with uniform ranges, like if someone enters "ZZZZ". 00051 00052 The RangeEncoder class has a text compressor based on the output of this class. 00053 00054 I opted for a static table because this is to be run on a network server 00055 where many clients are connected. The memory needed for this type of 00056 compression does not increase with the number of clients. 00057 */ 00058 class TextStatsCollector 00059 { 00060 u32 last, total, frequencies[256][256]; 00061 u8 seen[256]; 00062 00063 public: 00064 TextStatsCollector(); 00065 00066 public: 00067 // Record a character occurance 00068 // 0 = end of line, so next character counts towards initial character frequency 00069 void Tally(u8 x); 00070 00071 #if defined(CAT_PRAGMA_PACK) 00072 #pragma pack(push) 00073 #pragma pack(1) 00074 #endif 00075 00076 struct TableFormat 00077 { 00078 // MurmurHash2 of remainder, with seed = 0 00079 u32 hash; 00080 00081 // Total symbols in the table <= 256 00082 u16 total; 00083 00084 // Fraction of a byte represented by total, scaled from [0, 2^15] 00085 u16 log2total; 00086 00087 // ASCII character code -> Table index map 00088 u8 char2index[256]; 00089 00090 // Table index -> ASCII character code map 00091 u8 index2char[256]; 00092 00093 /* 00094 Start of frequency table 00095 00096 The first 32 entries are used for reverse lookup (freq->symbol): 00097 GET_SYMBOL_LUT() will get this address: 00098 frequencies[0..15] = array of 16 bytes creating a lookup table (LUT) given 00099 the high 4 bits of the frequency, for the low range 00100 frequencies[16..31] = array of 16 bytes creating a lookup table (LUT) given 00101 the high 4 bits of the frequency, for the high range 00102 00103 GET_SYMBOL_BASE() will get this address: 00104 frequencies[32] = cumulative frequency for (last=0, this=1) out of 2^16 trials 00105 frequencies[33] = cumulative frequency for (last=0, this=2) out of 2^16 trials 00106 frequencies[34] = ... 00107 00108 Note: (0, 0) is implicitly zero, and (0, TOTAL) is implicitly 2^16 00109 So these tables don't include those implicit entries. 00110 */ 00111 u16 frequencies[1]; 00112 } CAT_PACKED; 00113 00114 #if defined(CAT_PRAGMA_PACK) 00115 #pragma pack(pop) 00116 #endif 00117 00118 // Returns code that creates a table in the above format 00119 bool GenerateMinimalStaticTable(const char *TableName, std::ostream &osout); 00120 00121 // Check for errors in the in-memory version of the table that was generated 00122 static bool VerifyTableIntegrity(const TableFormat *table); 00123 }; 00124 00125 00126 /* 00127 Range Encoder 00128 00129 Encodes a single message one field at a time using the minimum 00130 number of bits, rounded up to the next highest byte. 00131 00132 To insure that the message does not grow in size, provide limited 00133 space for the output buffer and check .Fail() at the end. If it 00134 failed, this means it ran out of space during encoding. 00135 00136 If encoding succeeded, check .Used() to determine the number of 00137 bytes used by the output buffer. 00138 */ 00139 class RangeEncoder 00140 { 00141 u8 *output; 00142 int limit, remaining; 00143 00144 u64 low, range; 00145 00146 // Write out bytes as needed 00147 CAT_INLINE void Normalize(); 00148 00149 CAT_INLINE void GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u8 ch, u16 &symbol_low, u16 &symbol_range); 00150 00151 public: 00152 // Ctors 00153 RangeEncoder(void *output_i, int limit_i); 00154 RangeEncoder(RangeEncoder &cp); 00155 00156 // Overwrite one context with another. Using context references instead 00157 // of working on the contexts directly is probably more efficient. 00158 RangeEncoder &operator=(RangeEncoder &cp); 00159 00160 // State accessors 00161 bool Fail() { return output == 0; } 00162 int Used() { return limit - remaining; } 00163 00164 public: 00165 // Encode the given text with the given statistics 00166 // NOTE: May be up to one byte longer than the original message 00167 void Text(const char *msg, const TextStatsCollector::TableFormat *stats); 00168 00169 // Encode a biased bit given the frequency it is 0 00170 // frequency = average times out of 2^32 trials the bit will be 0 00171 void BiasedBit(u32 b, u32 frequency); 00172 00173 // Encode a field that takes on [0, total) values with equal likelihood 00174 void Range(u32 symbol, u32 total); 00175 00176 // Emit the final byte(s) needed to encode the symbols 00177 void Finish(); 00178 }; 00179 00180 00181 /* 00182 Range Decoder 00183 00184 Interprets buffers produced by RangeEncoder 00185 */ 00186 class RangeDecoder 00187 { 00188 const u8 *input; 00189 int remaining; 00190 u64 code, low, range; 00191 00192 // Read in bytes as needed 00193 CAT_INLINE void Normalize(); 00194 00195 // Grab symbol low frequency and range from the table 00196 CAT_INLINE u8 GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u16 freq, u16 &symbol_low, u16 &symbol_range); 00197 00198 public: 00199 // Initializing constructor 00200 RangeDecoder(const void *message, int bytes); 00201 00202 int Remaining() { return remaining; } 00203 00204 public: 00205 // Decode the given text with the given statistics 00206 int Text(char *msg, int buffer_size, const TextStatsCollector::TableFormat *stats); 00207 00208 // Decode a biased bit given the frequency it is 0 00209 // frequency = average times out of 2^32 trials the bit will be 0 00210 u32 BiasedBit(u32 frequency); 00211 00212 // Decode a field that takes on [0, total) values with equal likelihood 00213 u32 Range(u32 total); 00214 }; 00215 00216 00217 } // namespace cat 00218 00219 #endif // CAT_RANGE_CODER_HPP
Copyright © 2007-2010 by The Shadowrun: Awakened Team. This work is licensed under the GNU Lesser General Public License 3.