00001
00022 #ifndef _NGRAMS_H
00023 #define _NGRAMS_H
00024
00025
00026 #include "config.h"
00027
00028 #ifdef _USE_HASHSET
00029
00030 #if defined(_MSC_VER)
00031 #include <hash_set>
00032 #elif defined(__GNUC__)
00033
00034 #include <ext/hash_set>
00035 #endif
00036 #else
00037
00038 #include "hash_vector.h"
00039 #endif
00040
00041
00042 #include <vector>
00043
00044 #include <utility>
00045
00046
00047 #include "ngram.h"
00048
00049 #include "pool.h"
00050
00051 namespace ace {
00052
00055 class NGramHashCounter {
00056
00057 #ifdef _64_BIT
00058
00061 static const size_t _seed = 0xdeadbeeffeeddeaf;
00062 #else
00063
00065 static const size_t _seed = 0xdeadbeef;
00066 #endif
00067
00068 public:
00073 static size_t count_hash(const NGram *gram);
00076 static bool check(ngram_size_t n);
00077 };
00078
00081 struct NGramHasher: public std::unary_function<size_t, const NGram *> {
00086 inline size_t operator()(const NGram *key) const {
00087 return NGramHashCounter::count_hash(key);
00088 }
00089 };
00090
00091 #ifdef _USE_HASHSET
00092
00093 #if defined(_MSC_VER)
00094
00097 struct NGramLessComparator: public std::binary_function<bool, const NGram *, const NGram *> {
00101 bool operator()(const NGram *lhs, const NGram *rhs) const;
00102 };
00103
00107 class NGramHashCompare: public stdext::hash_compare<const NGram *> {
00110 static NGramHasher _hasher;
00113 static NGramLessComparator _comparator;
00114 public:
00115
00116
00117 enum {
00118 bucket_size = 8,
00119 min_buckets = 16
00120 };
00125 inline size_t operator()(const NGram *key) const {
00126
00127 return _hasher(key);
00128 }
00134 inline bool operator()(const NGram *lhs, const NGram *rhs) const {
00135
00136 return _comparator(lhs, rhs);
00137 }
00138 };
00139
00140 #elif defined(__GNUC__)
00141
00143 struct NGramEqualComparator: public std::binary_function<bool, const NGram *, const NGram *> {
00149 bool operator()(const NGram *lhs, const NGram *rhs) const;
00150 };
00151
00152 #endif
00153
00154 #endif // _USE_HASHSET
00155
00161 class NGramStore {
00162 public:
00163 #ifdef _USE_HASHSET
00164
00166 #if defined(_MSC_VER)
00167 typedef stdext::hash_set<NGram *, NGramHashCompare> hash_table_t;
00168 #elif defined(__GNUC__)
00169 typedef __gnu_cxx::hash_set<NGram *, NGramHasher, NGramEqualComparator> hash_table_t;
00170 #endif
00171 #else
00172 typedef HashVector<NGram *, NGramHasher> hash_table_t;
00173 #endif
00174
00176 NGram zero_ngram;
00177
00178 private:
00184 typedef std::pair<hash_table_t::const_iterator, bool> _hash_table_pair_cib_t;
00188 std::vector<VoidPool> _memory_pools;
00193 std::vector<hash_table_t> _hash_tables;
00194
00198 _hash_table_pair_cib_t _find(const NGram *ngram) const;
00199
00202 NGramStore(const NGramStore&);
00205 NGramStore& operator=(const NGramStore&);
00206 public:
00211 typedef std::pair<NGram *, bool> insert_status_t;
00212
00217 NGramStore(size_t memory_pool_block_size = 0);
00218
00219
00220
00224 inline hash_table_t::const_iterator begin(ngram_size_t type) const { return _hash_tables[type].begin(); }
00225
00226
00227 inline hash_table_t::const_iterator begin(void) const { return _hash_tables[NGram::full_ngram_type()].begin(); }
00231 inline hash_table_t::const_iterator end(ngram_size_t type) const { return _hash_tables[type].end(); }
00232
00233
00234 inline hash_table_t::const_iterator end(void) const { return _hash_tables[NGram::full_ngram_type()].end(); }
00238 const NGram * get(const NGram *ngram) const;
00243 void init(size_t memory_pool_block_size);
00246 insert_status_t insert(const NGram *ngram);
00250 inline size_t size(ngram_type_t type) const { return (type == 0) ? 1 : _hash_tables[type].size(); }
00253 inline size_t size(void) const { return _hash_tables[NGram::full_ngram_type()].size(); }
00254 #ifndef _USE_HASHSET
00255
00257 void sort(ngram_type_t type = 0);
00258 #endif
00259 };
00260
00263 typedef std::vector<frequency_t> freq_table_t;
00264
00267 struct NGramFrequencies {
00270 const NGram *ngram;
00278 freq_table_t frequencies;
00282 NGramFrequencies(void): ngram(NULL), frequencies(NGram::types_count()) {}
00283 };
00284
00288 class NGramsProcessor {
00291 const NGramStore& _ngram_store;
00294 NGramStore::hash_table_t::const_iterator _next;
00297 NGramStore::hash_table_t::const_iterator _end;
00300 NGramFrequencies _ngram_frequencies;
00301 public:
00305 NGramsProcessor(NGramStore& ngrams): _ngram_store(ngrams), _next(ngrams.begin()), _end(ngrams.end()), _ngram_frequencies() {}
00306
00310 bool next(void);
00311
00316 inline const NGramFrequencies& ngram_frequencies(void) const { return _ngram_frequencies; }
00317 };
00318
00319 }
00320
00321 #endif