00001
00009
00010 #include "ngrams.h"
00011
00012 namespace ace {
00013
00016 const ngram_size_t _max_n = 8;
00017
00018 #ifdef _64_BIT
00019 const size_t _lemma_bitmasks[_max_n+1] = {
00020 0x00,
00021 0xFFFFFFFF,
00022 0x00FFFFFF,
00023 0x0000FFFF,
00024 0x000003FF,
00025 0x000000FF,
00026 0x000000FF,
00027 0x000000FF,
00028 0x000000FF,
00029 };
00030 #ifdef _DEPENDENCY_MODE
00031 const size_t _dependency_bitmasks[_max_n+1] = {
00032 0x00,
00033 0x7C,
00034 0x7C,
00035 0x3C,
00036 0x3C,
00037 0x1C,
00038 0x1C,
00039 0x0C,
00040 0x04
00041 };
00042 const size_t _parent_bitmasks[_max_n+1] = {
00043 0x00,
00044 0x03,
00045 0x03,
00046 0x03,
00047 0x03,
00048 0x03,
00049 0x00, 0x00, 0x00
00050 };
00051 const size_t _dp_shift[_max_n+1] = {
00052 0,
00053 32,
00054 24,
00055 16,
00056 10,
00057 8,
00058 6,
00059 6,
00060 0
00061 };
00062 #endif
00063
00064 const size_t _bits_per_member[_max_n+1] = { 0, 64, 32, 22, 16, 13, 11, 10, 8 };
00065
00066 #else // 32-bit
00067 const size_t _lemma_bitmasks[_max_n+1] = {
00068 0x00,
00069 0x00FFFFFF,
00070 0x00000FFF,
00071 0x000000FF,
00072 0x0000003F,
00073 0x0000003F,
00074 0x0000003F,
00075 0x0000001F,
00076 0x0000000F,
00077 };
00078
00079 #ifdef _DEPENDENCY_MODE
00080 const size_t _dependency_bitmasks[_max_n+1] = {
00081 0x00,
00082 0x7C,
00083 0x0C,
00084 0x1C,
00085 0x0C,
00086 0x04,
00087 0x00, 0x00, 0x00
00088 };
00089 const size_t _parent_bitmasks[_max_n+1] = {
00090 0x00,
00091 0x03,
00092 0x03,
00093 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
00094 };
00095 const size_t _dp_shift[_max_n+1] = {
00096 0,
00097 24,
00098 12,
00099 6,
00100 4,
00101 4,
00102 0, 0, 0
00103 };
00104 #endif
00105
00106 const size_t _bits_per_member[_max_n+1] = { 0, 32, 16, 11, 8, 7, 6, 5, 4 };
00107
00108 #endif // 64-bit vs. 32-bit
00109
00110
00111
00112 size_t NGramHashCounter::count_hash(const NGram *gram) {
00113
00114
00115 size_t hash = _seed, loop = 0;
00116
00117 ngram_size_t mc = gram->degree();
00118 #ifndef _SURFACE_MODE
00119 size_t db = _dependency_bitmasks[mc], pb = _parent_bitmasks[mc], lb = _lemma_bitmasks[mc], dp_s = _dp_shift[mc], bpm = _bits_per_member[mc];
00120 #else
00121 size_t lb = _lemma_bitmasks[mc], bpm = _bits_per_member[mc];
00122 #endif
00123
00124 for ( const NGram::Member * i_mem = gram->begin(); i_mem != gram->end(); ++i_mem, ++loop ) {
00125 hash ^=
00126 (
00127 #ifndef _SURFACE_MODE
00128 (
00129 (
00130
00131 (i_mem->dependency() & db) | (i_mem->parent() & pb)
00132 )
00133
00134 << dp_s
00135 )
00136 |
00137 #endif
00138
00139 (i_mem->lemma() & lb)
00140 )
00141
00142 << loop * bpm;
00143 }
00144 return hash;
00145
00146 }
00147
00148
00149
00150 bool NGramHashCounter::check(ngram_size_t n) {
00151 return _max_n >= n;
00152 }
00153
00154 #ifdef _USE_HASHSET
00155
00156 #if defined(__GNUC__)
00157
00158
00159
00160 bool NGramEqualComparator::operator()(const NGram *lhs, const NGram *rhs) const {
00161 return *lhs == *rhs;
00162 }
00163
00164 #elif defined(_MSC_VER)
00165
00166
00167
00168 bool NGramLessComparator::operator()(const NGram *lhs, const NGram *rhs) const {
00169 return *lhs < *rhs;
00170 }
00171
00172
00173
// Out-of-class definitions of NGramHashCompare's static members; only
// compiled on the _MSC_VER branch of the _USE_HASHSET build.
00174 NGramHasher NGramHashCompare::_hasher;
00175 NGramLessComparator NGramHashCompare::_comparator;
00176
00177 #endif
00178
00179 #endif // _USE_HASHSET
00180
00181
00183
00184
00185
00186 NGramStore::NGramStore(size_t memory_pool_block_size): _memory_pools(), _hash_tables() {
00187 if ( memory_pool_block_size ) {
00188 init(memory_pool_block_size);
00189 }
00190
00191 }
00192
00193
00194
00195 const NGram * NGramStore::get(const NGram *ngram) const {
00196 if ( ngram->is_zero() ) {
00197 return &zero_ngram;
00198 }
00199
00200 _hash_table_pair_cib_t pair_cib = _find(ngram);
00201
00202 if ( pair_cib.second ) {
00203
00204 return *(pair_cib.first);
00205 }
00206
00207 return NULL;
00208 }
00209
00210
00211
00212 void NGramStore::init(size_t memory_pool_block_size) {
00213 if ( memory_pool_block_size && _hash_tables.empty() && _memory_pools.empty() ) {
00214
00215 #ifdef _USE_HASHSET
00216
00217 _hash_tables.resize(NGram::types_count());
00218 #else
00219
00220 _hash_tables.push_back(hash_table_t(0, 0, 0));
00221 for ( ngram_type_t type = 1; type < NGram::types_count(); ++type ) {
00222
00223 _hash_tables.push_back(hash_table_t(8, settings::bucket_size, settings::crop_ratio));
00224 }
00225 #endif
00226
00227 _memory_pools.push_back(VoidPool(0));
00228 for ( ngram_size_t degree = 1; degree <= NGram::n(); ++degree ) {
00229
00230 _memory_pools.push_back(VoidPool(memory_pool_block_size*NGram::size_of(degree)));
00231 }
00232 }
00233 }
00234
00235
00236
00237 NGramStore::insert_status_t NGramStore::insert(const NGram *ngram) {
00238 if ( ngram->is_zero() ) {
00239 return insert_status_t(&zero_ngram, false);
00240 }
00241
00242 _hash_table_pair_cib_t pair_cib = _find(ngram);
00243
00244 if ( pair_cib.second ) {
00245
00246 return insert_status_t(*pair_cib.first, false);
00247 }
00248
00249
00250
00251
00252
00253 void *mem = _memory_pools[ngram->degree()].get_raw_memory(ngram->size_of());
00254
00255
00256
00257 return insert_status_t(*_hash_tables[ngram->type()].insert(new (mem) NGram(*ngram)).first, true);
00258 }
00259
00260
00261
00262 NGramStore::_hash_table_pair_cib_t NGramStore::_find(const NGram *ngram) const {
00263
00264 std::pair<hash_table_t::const_iterator, hash_table_t::const_iterator> ip = _hash_tables[ngram->type()].equal_range(const_cast<NGram *>(ngram));
00265
00266 return _hash_table_pair_cib_t(ip.first, ip.first != ip.second);
00267 }
00268
00269 #ifndef _USE_HASHSET
00270 void NGramStore::sort(ngram_type_t type) {
00271 if ( type == 0 ) {
00272
00273 for ( type = 1; type <= NGram::full_ngram_type(); ++type ) {
00274 _hash_tables[type].sort();
00275 }
00276 } else {
00277 _hash_tables[type].sort();
00278 }
00279 }
00280 #endif
00281
00282
00284
00285
00286
00287 bool NGramsProcessor::next(void) {
00288 if ( _next == _end ) {
00289 return false;
00290 }
00291
00292 _ngram_frequencies.ngram = *_next;
00293
00294
00295 _ngram_frequencies.frequencies[NGram::full_ngram_type()] = _ngram_frequencies.ngram->frequency();
00296
00297 for ( ngram_type_t type = 0; type < NGram::full_ngram_type(); ++type ) {
00298
00299 _ngram_frequencies.frequencies[type] = _ngram_store.get(NGramToken::get(*_next, type))->frequency();;
00300 }
00301
00302 ++_next;
00303
00304 return true;
00305 }
00306
00307 }