00001
00009 #ifndef _NGRAM_H
00010 #define _NGRAM_H
00011
00012
00013 #include <utility>
00014 #include <vector>
00015
00016 #include "config.h"
00017
00018 #include "context.h"
00019 #ifdef _DEPENDENCY_MODE
00020
00021 #include "dependencies.h"
00022 #endif
00023
00024 #include "overflow.h"
00025
00026 #include "utils.h"
00027
00028 #include "word.h"
00029
00030 namespace ace {
00031
/// Element type of a raw n-gram, selected at compile time.
/// With _SURFACE_MODE defined it is a bare const iterator into a words_store_t;
/// otherwise it is a pair of an ngram_size_t and such an iterator.
/// NOTE(review): the role of the ngram_size_t half of the pair is not visible
/// in this header - presumably the index of the member's parent; confirm.
00032 #ifdef _SURFACE_MODE
00033
00036 typedef words_store_t::const_iterator raw_ngram_member_t;
00037 #else
00038
00042 typedef std::pair<ngram_size_t, words_store_t::const_iterator> raw_ngram_member_t;
00043 #endif
00044
/// A raw n-gram: a vector of raw n-gram members. This is the input type of
/// the NGram(raw_ngram, ngram_type) constructor and of NGramToken::get().
00046 typedef std::vector<raw_ngram_member_t> raw_ngram_t;
00047
00048
/// Forward declaration; NGram names NGramToken as a friend and the class
/// itself is declared further down in this header.
00049 class NGramToken;
00050
/// Type of the counter stored inside each NGram object (NGram::_frequency).
00055 typedef unsigned int freq_counter_t;
00056
/// Type of a frequency as reported to callers: the return type of
/// NGram::frequency() and the value type of NGram's overflow map.
00062 typedef size_t frequency_t;
00063
/// For MSVC and GCC-compatible compilers: save the current packing alignment
/// and switch to 1-byte packing for the declarations that follow (NGram and
/// its nested Member). The matching pack(pop) comes after the NGram
/// comparison operators below.
00064 #if defined(_MSC_VER) || defined(__GNUC__)
00065
00066 #pragma pack(push)
00067
00068 #pragma pack(1)
00069 #endif
00070
/// Header of a variable-length n-gram record.
///
/// The object itself stores only one freq_counter_t-sized word (the anonymous
/// union below), which carries the frequency counter together with the n-gram
/// type and an overflow flag. The members and the context are not data members:
/// begin() treats the memory immediately after this object (this + 1) as an
/// array of degree() Member entries, and context() treats the memory right
/// after those entries as a context_t.
/// NOTE(review): this requires the record to be placed in storage that
/// reserves the trailing space - presumably done by the friend class
/// NGramToken; confirm in the implementation file.
00079 class NGram {
00080 public:
/// One element of an n-gram: a TaggedLemma (lemma index + tag index) and,
/// when _DEPENDENCY_MODE is defined, a parent value and a dependency index.
00084 class Member {
/// Lemma/tag pair of this member (private: class default access).
00087 TaggedLemma _tagged_lemma;
00088 #ifdef _DEPENDENCY_MODE
00089
/// Value returned by parent().
/// NOTE(review): presumably the index of the parent member within the
/// same n-gram - confirm.
00091 ngram_size_t _parent;
/// Value returned by dependency().
00094 dependency_index_t _dependency;
00095 public:
/// Builds a member from a lemma index, a tag index, a parent value and a
/// dependency index (dependency-mode signature).
00103 Member(string_index_t lemma, tag_index_t tag, ngram_size_t parent, dependency_index_t dependency): _tagged_lemma(lemma, tag), _parent(parent), _dependency(dependency) {}
/// Returns the stored parent value.
00106 inline ngram_size_t parent(void) const { return _parent; }
/// Returns the stored dependency index.
00109 inline dependency_index_t dependency(void) const { return _dependency; }
00110 #else
00111 public:
/// Builds a member from a lemma index and a tag index (signature used when
/// _DEPENDENCY_MODE is not defined).
00117 Member(string_index_t lemma, tag_index_t tag): _tagged_lemma(lemma, tag) {}
00118 #endif
00119
/// Returns the lemma index held by the underlying TaggedLemma.
00121 inline string_index_t lemma(void) const { return _tagged_lemma.lemma(); }
/// Returns the tag index held by the underlying TaggedLemma.
00124 inline tag_index_t tag(void) const { return _tagged_lemma.tag(); }
00125 };
00126 private:
/// Overflow container keyed by NGram address with frequency_t values;
/// frequency() consults it when the overflow flag is set.
00129 typedef Overflow<const NGram *, frequency_t> _overflows_map_t;
00130
00132
/// Mask applied to _frequency in frequency() to extract the in-object count.
/// Declared here; the value is defined outside this header.
00135 static const freq_counter_t _max_frequency;
/// Mask of the overflow flag inside _frequency_bits[_last_bit_index].
00138 static const ngram_type_t _overflow_flag_mask;
/// Index of the _frequency_bits element that holds the type bits and the
/// overflow flag.
00142 static const size_t _last_bit_index;
/// Mask of the n-gram type bits inside _frequency_bits[_last_bit_index].
00145 static const ngram_type_t _type_mask;
00146
00148
/// NOTE(review): not used anywhere in this header - presumably switches
/// context collection on or off; confirm in the implementation file.
00151 static bool _context_tracing_on;
/// Value returned by full_ngram_type().
00154 static ngram_type_t _full_ngram_type;
/// Lookup table indexed by n-gram type giving the number of members
/// (see degree()).
00157 static std::vector<ngram_size_t> _members_count;
/// Lookup table indexed by degree giving a record size (see size_of()).
00160 static std::vector<size_t> _ngrams_memory_sizes;
/// Shared overflow storage read by frequency() via _overflows.get(this).
00163 static _overflows_map_t _overflows;
/// Value returned by types_count().
00166 static ngram_type_t _types_count;
00167
00169
/// The only per-object storage. Both views overlay the same bytes:
/// _frequency is the whole word, _frequency_bits exposes it as an array of
/// ngram_type_t so that one element (index _last_bit_index) can be masked
/// for the type and the overflow flag. Code below reads whichever view it
/// needs regardless of which one was written last.
00178 union {
/// Whole-word view; masked with _max_frequency to obtain the count.
00181 freq_counter_t _frequency;
/// Element-wise view of the same word.
00186 ngram_type_t _frequency_bits[sizeof(freq_counter_t)/sizeof(ngram_type_t)];
00187 };
00189
00190
00191
/// Copy assignment is declared private, so code outside NGram (other than
/// friends) cannot assign one NGram to another.
/// NOTE(review): no definition is visible in this header - presumably left
/// undefined on purpose; confirm.
00192 NGram& operator=(const NGram& source);
00193
00195
/// Returns true when the overflow flag bit is set in
/// _frequency_bits[_last_bit_index].
00198 inline bool _overflow(void) const { return (_frequency_bits[_last_bit_index] & _overflow_flag_mask) != 0; }
/// Declared here, defined elsewhere.
/// NOTE(review): presumably stores ngram_type into the type bits read by
/// type() - confirm.
00203 void _type(ngram_type_t ngram_type);
00204 public:
/// Static initialisation hook; defined outside this header.
/// NOTE(review): presumably fills the static lookup tables above - confirm.
00207 static void init(void);
00208
/// Default constructor: zeroes the whole packed word, so the type bits and
/// the overflow flag read back as 0 as well.
00213 NGram(void): _frequency(0) {}
/// Constructs from a raw n-gram and an n-gram type; defined elsewhere.
00218 NGram(const raw_ngram_t& raw_ngram, ngram_type_t ngram_type);
/// Copy constructor; defined elsewhere.
00222 NGram(const NGram& source);
/// Constructs from another NGram and an n-gram type; defined elsewhere.
/// NOTE(review): presumably derives a sub-n-gram of the given type from
/// source - confirm.
00227 NGram(const NGram& source, ngram_type_t ngram_type);
00228
/// Number of members of this n-gram: _members_count looked up by type().
00231 inline ngram_size_t degree() const { return _members_count[type()]; }
/// Returns the static _full_ngram_type value.
00234 static ngram_type_t full_ngram_type(void) { return _full_ngram_type; }
/// True when the type bits are 0 (as they are after default construction).
00237 inline bool is_zero(void) const { return type() == 0; }
/// Returns settings::n.
00240 static ngram_size_t n(void) { return settings::n; }
/// Returns the _ngrams_memory_sizes entry for the given degree (unchecked
/// operator[] access).
00243 static size_t size_of(ngram_size_t degree) { return _ngrams_memory_sizes[degree]; }
/// Returns the _ngrams_memory_sizes entry for this n-gram's own degree().
00246 inline size_t size_of(void) const { return _ngrams_memory_sizes[degree()]; }
/// Extracts the n-gram type: _frequency_bits[_last_bit_index] masked with
/// _type_mask.
00249 inline ngram_type_t type(void) const { return _frequency_bits[_last_bit_index] & _type_mask; }
/// Returns the static _types_count value.
00252 static ngram_type_t types_count(void) { return _types_count; }
00253
/// Pointer to the first Member: the address one NGram past this object,
/// reinterpreted through void*. Nothing in the class itself owns that memory.
00256 inline Member * begin(void) { return static_cast<Member *>(static_cast<void *>(this + 1)); }
/// Const overload of begin().
00259 inline const Member * begin(void) const { return static_cast<const Member *>(static_cast<const void *>(this + 1)); }
/// One past the last Member: begin() advanced by degree() entries.
00262 inline Member * end(void) { return this->begin() + degree(); }
/// Const overload of end().
00265 inline const Member * end(void) const { return this->begin() + degree(); }
/// Pointer to the member at the given index; index is not range-checked.
00268 inline Member * get(ngram_size_t index) { return this->begin() + index; }
/// Const overload of get().
00271 inline const Member * get(ngram_size_t index) const { return this->begin() + index; }
/// Full frequency of this n-gram. The in-object count (_frequency masked with
/// _max_frequency) is combined, via _overflows_map_t::add_to_maximum, with the
/// value stored for this object in _overflows when the overflow flag is set,
/// or with 0 when it is not.
/// NOTE(review): add_to_maximum is not visible here - presumably it adds the
/// two parts; confirm its exact semantics in the Overflow template.
00274 frequency_t frequency(void) const {
00275
00276
00277
00278 return _overflows_map_t::add_to_maximum((_overflow() ? _overflows.get(this) : 0), (_frequency & _max_frequency));
00279 }
/// Declared here, defined elsewhere.
/// NOTE(review): presumably increments the frequency and spills into
/// _overflows when the in-object counter is full - confirm.
00282 void inc(void);
/// Lemma index of the i-th member (i is not range-checked).
00286 inline string_index_t lemma(ngram_size_t i) const { return this->get(i)->lemma(); }
/// Tag index of the i-th member (i is not range-checked).
00290 inline tag_index_t tag(ngram_size_t i) const { return this->get(i)->tag(); }
00291 #ifdef _DEPENDENCY_MODE
00292
/// Parent value of the i-th member (dependency mode only).
00295 inline ngram_size_t parent(ngram_size_t i) const { return this->get(i)->parent(); }
/// Dependency index of the i-th member (dependency mode only).
00299 inline dependency_index_t dependency(ngram_size_t i) const { return this->get(i)->dependency(); }
00300 #endif
00301
00302
/// Declared here, defined elsewhere; takes a PartOfContext by const reference.
/// NOTE(review): presumably merges it into the context_t returned by
/// context() - confirm.
00306 void add_to_context(const PartOfContext& part_of_context);
/// Pointer to the context_t located directly after the last Member, i.e. at
/// end() reinterpreted through void*.
00309 inline context_t * context(void) {
00310 return static_cast<context_t *>(static_cast<void *>(this->end()));
00311 }
/// Const overload of context().
00314 inline const context_t * context(void) const {
00315 return static_cast<const context_t *>(static_cast<const void *>(this->end()));
00316 }
00317
/// NGramToken gets access to the private members of NGram.
00320 friend class NGramToken;
00321 };
00322
/// Equality comparison of two n-grams; declared here, defined elsewhere.
/// NOTE(review): which fields take part in the comparison (type, members,
/// frequency) is not visible in this header - confirm before relying on it.
00328 bool operator==(const NGram& lhs, const NGram& rhs);
/// Less-than comparison of two n-grams; declared here, defined elsewhere.
/// NOTE(review): the ordering criterion is not visible in this header.
00334 bool operator<(const NGram& lhs, const NGram& rhs);
00335
00336
/// Restore the packing alignment saved by the pack(push) placed before the
/// NGram class, under the same compiler condition.
00337 #if defined(_MSC_VER) || defined(__GNUC__)
00338
00339 #pragma pack(pop)
00340 #endif
00341
/// Static-only helper class that hands out NGram pointers. It is a friend of
/// NGram and is declared after the pack(pop), so it is not affected by the
/// 1-byte packing applied to NGram.
00346 class NGramToken {
/// Untyped static pointer, private by class default access.
/// NOTE(review): presumably the buffer in which the NGram returned by get()
/// is built - confirm in the implementation file, including who owns it.
00350 static void *_ngram_space;
00351 public:
/// Static initialisation hook; defined outside this header.
/// NOTE(review): presumably sets up _ngram_space - confirm.
00355 static void init(void);
/// Returns a pointer to an NGram for the given raw n-gram and n-gram type.
/// Defined elsewhere.
/// NOTE(review): lifetime and ownership of the returned object are not
/// visible here; if it lives in the shared _ngram_space, a later get() call
/// may overwrite it - confirm.
00360 static NGram * get(const raw_ngram_t& raw_ngram, ngram_type_t ngram_type);
/// Overload taking an existing NGram (by pointer) and an n-gram type.
/// Defined elsewhere; the same lifetime caveat applies.
00365 static NGram * get(const NGram *source, ngram_type_t ngram_type);
00366 };
00367
00368 }
00369
00370 #endif