00001
00011
#include <algorithm>
#include <limits>
#include <memory>

#include "ngram.h"
00018
00019 namespace ace {
00020
00022
00023
00024
00025
00026 bool NGram::_context_tracing_on = false;
00027
00028
00029
00030 const freq_counter_t NGram::_max_frequency = std::numeric_limits<freq_counter_t>::max() >> (sizeof(ngram_type_t)*constants::bits_per_char);
00031
00032 const ngram_type_t NGram::_overflow_flag_mask = static_cast<ngram_type_t>(1 << (sizeof(ngram_type_t)*constants::bits_per_char - 1));
00033
00034 const size_t NGram::_last_bit_index = sizeof(freq_counter_t) / sizeof(ngram_type_t) - 1;
00035
00036 const ngram_type_t NGram::_type_mask = ff<ngram_type_t>() >> 1;
00037
00038
00039
00040 ngram_type_t NGram::_full_ngram_type = 0;
00041
00042 std::vector<ngram_size_t> NGram::_members_count;
00043
00044 std::vector<size_t> NGram::_ngrams_memory_sizes;
00045
00046 NGram::_overflows_map_t NGram::_overflows;
00047
00048 ngram_type_t NGram::_types_count = 0;
00049
00050 void NGram::_type(ngram_type_t ngram_type) {
00051 bool overflow = _overflow();
00052 _frequency_bits[_last_bit_index] = ngram_type;
00053 if ( overflow ) {
00054 _frequency_bits[_last_bit_index] |= _overflow_flag_mask;
00055 }
00056 }
00057
00058
00059
00060 void NGram::init(void) {
00061
00062 _context_tracing_on = settings::context_tracing_on();
00063
00064 _types_count = static_cast<ngram_type_t>(1 << n());
00065
00066 for ( ngram_type_t type = 0; type < _types_count; ++type ) {
00067 _members_count.push_back(static_cast<ngram_size_t>(bits_in(type)));
00068 }
00069
00070 _full_ngram_type = static_cast<ngram_type_t>(~(ff<ngram_type_t>() << n()));
00071
00072 for ( ngram_size_t degree = 0; degree <= n(); ++degree ) {
00073 _ngrams_memory_sizes.push_back(sizeof(NGram) + degree*sizeof(Member));
00074 }
00075 if ( _context_tracing_on ) {
00076
00077 for ( ngram_size_t degree = 1; degree <= n(); ++degree ) {
00078
00079 _ngrams_memory_sizes[degree] += sizeof(context_t);
00080 }
00081 }
00082 }
00083
00084
00085
00086 NGram::NGram(const raw_ngram_t& raw_ngram, ngram_type_t ngram_type): _frequency(1) {
00087
00088 _type(ngram_type);
00089
00090 unsigned s = 0, i = n();
00091 for ( raw_ngram_t::const_iterator i_mem = raw_ngram.begin(); i_mem != raw_ngram.end(); ++i_mem ) {
00092
00093 if ( ith_bit(ngram_type, --i) ) {
00094 #ifdef _DEPENDENCY_MODE
00095 new (get(s)) Member( i_mem->second->lemma(), i_mem->second->tag(), i_mem->first, (i_mem->first == 0) ? dependency::head : i_mem->second->dependency() );
00096 #else
00097 new (get(s)) Member((*i_mem)->lemma(), (*i_mem)->tag());
00098 #endif
00099 ++s;
00100 }
00101 }
00102 if ( _context_tracing_on ) {
00103 new (context()) context_t();
00104 }
00105 }
00106
00107
00108
00109 NGram::NGram(const NGram& source): _frequency(source._frequency) {
00110
00111
00112
00113 std::copy(source.begin(), source.end(), this->begin());
00114
00115 if ( _context_tracing_on ) {
00116 new (context()) context_t();
00117 }
00118 }
00119
00120
00121
00122 NGram::NGram(const NGram& source, ngram_type_t ngram_type): _frequency(source._frequency) {
00123
00124 _type(ngram_type);
00125 for ( ngram_size_t i = 0, j = n(), s = 0; i < n(); ++i ) {
00126
00127 if ( ith_bit(ngram_type, --j) ) {
00128 new (get(s)) Member(*source.get(i));
00129 ++s;
00130 }
00131 }
00132
00133 if ( _context_tracing_on ) {
00134 new (context()) context_t();
00135 }
00136 }
00137
00138
00139
00140 void NGram::inc(void) {
00141 if ( frequency() == _max_frequency ) {
00142
00143 ngram_type_t t = type();
00144
00145 _overflows.add(this, frequency());
00146
00147 _frequency = 0;
00148
00149 _type(t);
00150
00151 _frequency_bits[_last_bit_index] |= _overflow_flag_mask;
00152 }
00153
00154 ++_frequency;
00155 }
00156
00157
00158
00159 void NGram::add_to_context(const PartOfContext& part_of_context) {
00160 if ( !_context_tracing_on ) {
00161 return;
00162 }
00163
00164 context_t *ctx = context();
00165 context_t::iterator i_part_of_context = std::find(ctx->begin(), ctx->end(), part_of_context);
00166 if ( i_part_of_context != ctx->end() ) {
00167 i_part_of_context->inc();
00168 if ( i_part_of_context != ctx->begin() ) {
00169
00170 context_t::iterator prev = i_part_of_context - 1;
00171 while ( prev->frequency() < i_part_of_context->frequency() ) {
00172
00173 std::swap(*prev, *i_part_of_context);
00174 i_part_of_context = prev;
00175 if ( i_part_of_context == ctx->begin() ) {
00176 break;
00177 } else {
00178 prev = i_part_of_context - 1;
00179 }
00180 }
00181 }
00182 return;
00183 }
00184
00185 ctx->push_back(part_of_context);
00186 }
00187
00188
00189
00190 bool operator==(const NGram& lhs, const NGram& rhs) {
00191
00192
00193 if ( lhs.type() != rhs.type() ) {
00194 return false;
00195 }
00196
00197
00198 ngram_size_t mc = lhs.degree();
00199
00200 for ( ngram_size_t i = 0; i < mc; ++i ) {
00201
00202 if ( lhs.lemma(i) != rhs.lemma(i) ) {
00203 return false;
00204 }
00205 #ifdef _DEPENDENCY_MODE
00206 if ( lhs.parent(i) != rhs.parent(i) ) {
00207 return false;
00208 }
00209 if ( lhs.dependency(i) != rhs.dependency(i) ) {
00210 return false;
00211 }
00212 #endif
00213 if ( lhs.tag(i) != rhs.tag(i) ) {
00214 return false;
00215 }
00216 }
00217
00218 return true;
00219
00220 }
00221
00222
00223
00224 bool operator<(const NGram& lhs, const NGram& rhs) {
00225
00226
00227 if ( lhs.type() < rhs.type() ) {
00228 return true;
00229 }
00230 if ( rhs.type() < lhs.type() ) {
00231 return false;
00232 }
00233
00234 ngram_size_t mc = lhs.degree();
00235
00236 for ( ngram_size_t i = 0; i < mc; ++i ) {
00237
00238 if ( lhs.lemma(i) < rhs.lemma(i) ) {
00239
00240 return true;
00241 }
00242 if ( rhs.lemma(i) < lhs.lemma(i) ) {
00243
00244 return false;
00245 }
00246 #ifdef _DEPENDENCY_MODE
00247 if ( lhs.parent(i) < rhs.parent(i) ) {
00248
00249 return true;
00250 }
00251 if ( rhs.parent(i) < lhs.parent(i) ) {
00252
00253 return false;
00254 }
00255 if ( lhs.dependency(i) < rhs.dependency(i) ) {
00256
00257 return true;
00258 }
00259 if ( rhs.dependency(i) < lhs.dependency(i) ) {
00260
00261 return false;
00262 }
00263 #endif
00264 if ( lhs.tag(i) < rhs.tag(i) ) {
00265
00266 return true;
00267 }
00268 if ( rhs.tag(i) < lhs.tag(i) ) {
00269
00270 return false;
00271 }
00272 }
00273
00274 return false;
00275 }
00276
00278
00279 void *NGramToken::_ngram_space = 0;
00280
00281
00282
00283 void NGramToken::init(void) {
00284
00285 _ngram_space = operator new(NGram::size_of(NGram::n()));
00286 }
00287
00288
00289
00290 NGram * NGramToken::get(const raw_ngram_t& raw_ngram, ngram_type_t ngram_type) {
00291 return new (_ngram_space) NGram(raw_ngram, ngram_type);
00292 }
00293
00294
00295
00296 NGram * NGramToken::get(const NGram *source, ngram_type_t ngram_type) {
00297 return new (_ngram_space) NGram(*source, ngram_type);
00298 }
00299
00300 }