00001
00020
00021
00022 #ifndef _CONFIG_H
00023 #define _CONFIG_H
00024
00025 #include <cstddef>
00026
00027
// Detect the CPU word size and publish it as one of _64_BIT / _32_BIT.
//
// The x86-64 / Itanium and x86 tests come first and are unchanged, so every
// target that was recognised before still selects the same macro (including
// x32, where __x86_64__ wins over __ILP32__).  The additional tests only cover
// targets that used to stop at the #error below, e.g. AArch64, by looking at
// the architecture and data-model macros that compilers predefine.
#if defined(_M_X64) || defined(_M_IA64) || defined(__x86_64__) || defined(__IA64__) \
    || defined(_M_ARM64) || defined(__aarch64__) \
    || defined(_WIN64) || defined(__LP64__) || defined(_LP64)

// Set when the target was detected as having a 64-bit word.
#define _64_BIT

#elif defined(_M_IX86) || defined(__i386__) \
    || defined(_M_ARM) || defined(__arm__) \
    || defined(__ILP32__) || defined(_ILP32)

// Set when the target was detected as having a 32-bit word.
#define _32_BIT

#else
// Refuse to build rather than guess: nothing above matched this target.
#error "CPU word size not detected!"
#endif
00044
00045
// Pick the build mode: _DEPENDENCY_MODE or _SURFACE_MODE.
//   * neither requested -> _DEPENDENCY_MODE is switched on as the default;
//   * both requested    -> _DEPENDENCY_MODE wins and _SURFACE_MODE is dropped;
//   * exactly one       -> left as requested.
#if defined(_DEPENDENCY_MODE)
// Only one mode allowed.  #undef is a no-op when _SURFACE_MODE is not defined.
#undef _SURFACE_MODE
#elif !defined(_SURFACE_MODE)
// Default mode.
#define _DEPENDENCY_MODE
#endif
00051
00053
00054 namespace ace {
00055
/// Project-wide constants.
///
/// Three of them are defined right here; the rest are only declared and get
/// their values in another translation unit.
namespace constants {

    // ---- Defined in this header ------------------------------------------

    /// Limit named for string length; fixed at 65535.
    const size_t max_string_length = 65535;

    /// The NUL character.
    const char null_char = '\0';

    /// The line-feed character, used as the end-of-line marker.
    const char eol = '\n';

    // ---- Declared here, defined elsewhere --------------------------------
    // NOTE(review): the meanings below are read off the identifiers only;
    // confirm against the definitions.

    /// Size-type limit; presumably the largest supported n-gram order.
    extern const size_t max_grammity;

    /// Size-type constant; presumably the number of bits in a char.
    extern const size_t bits_per_char;

    /// Size-type limit; presumably caps settings::collocation_window_size.
    extern const size_t max_collocation_window_size;

    extern const size_t ngram_within_block;

    /// Size-type constant; presumably the default for settings::precision.
    extern const size_t default_precision;

    /// Size-type limit; presumably the longest sentence accepted.
    extern const size_t max_sentence_length;

    /// Character marking a significant position (presumably in a tag mask).
    extern const char significant_position_mark;

    /// Character marking an insignificant position (presumably in a tag mask).
    extern const char insignificant_position_mark;

    /// Upper bound on ContextWindow::radius in SENTENCE_MODE.
    extern const size_t sentences_radius_max_size;

    /// Upper bound on ContextWindow::radius in WORD_MODE.
    extern const size_t words_radius_max_size;

}
00109
00110
00112
// Compact integer aliases used throughout the project.
// NOTE(review): the intended meaning of each alias is read off its name only.

// -- unsigned int -----------------------------------------------------------
/// Index type for strings; also the storage type packed by TaggedLemma.
typedef unsigned int string_index_t;

// -- unsigned short ---------------------------------------------------------
/// Presumably the position of a word in its sequence.
typedef unsigned short word_order_t;

// -- unsigned char ----------------------------------------------------------
/// Type of settings::n (presumably the n-gram order).
typedef unsigned char ngram_size_t;
/// Presumably an index identifying a dependency.
typedef unsigned char dependency_index_t;
/// Index type for tags; element type of TaggedLemma's byte view.
typedef unsigned char tag_index_t;
/// Presumably a code identifying an n-gram type.
typedef unsigned char ngram_type_t;
00136
00142 class TaggedLemma {
00147 union {
00148 string_index_t _lemma;
00149 tag_index_t _bits[sizeof(string_index_t) / sizeof(tag_index_t)];
00150 };
00153 static const size_t _last_bit_index;
00156 static const string_index_t _lemma_bits_mask;
00157 public:
00161 TaggedLemma(string_index_t lemma, tag_index_t tag): _lemma(lemma) {
00162 _bits[_last_bit_index] = tag;
00163 }
00166 inline string_index_t lemma(void) const { return _lemma & _lemma_bits_mask; }
00169 inline tag_index_t tag(void) const { return _bits[_last_bit_index]; }
00170 };
00171
00174 struct ContextWindow {
00177 enum Mode {
00178 SENTENCE_MODE,
00179 WORD_MODE
00180 } mode;
00183 size_t radius;
00188 ContextWindow(Mode m = SENTENCE_MODE, size_t r = 0): mode(m), radius(r) {}
00191 inline bool check(void) {
00192 return ((mode == SENTENCE_MODE) && (radius <= constants::sentences_radius_max_size)) || ((mode == WORD_MODE) && (1 <= radius) && (radius <= constants::words_radius_max_size));
00193 }
00194 };
00195
/// Pair of frequency values, both starting at zero.
/// NOTE(review): presumably minimum values a candidate must reach to pass the
/// sieve - confirm against the code that reads settings::sieves.
struct Sieves {
    /// Expected-frequency value.
    double expected_frequency;

    /// Frequency value.
    size_t frequency;

    /// Zero-initialises both values.
    Sieves()
        : expected_frequency(0.0)
        , frequency(0)
    {}
};
00210
/// One threshold per association measure, all starting at 0.0.
/// NOTE(review): how each threshold is compared against a score is not visible
/// here - confirm against the code that reads settings::thresholds.
struct Thresholds {
    /// Threshold for the chi-square test.
    double chi_square_test;

    /// Threshold for the log-likelihood ratio.
    double log_likelihood_ratio;

    /// Threshold for mutual information.
    double mutual_information;

    /// Threshold for Pearson's coefficient.
    double pearsons_coefficient;

    /// Threshold for the t-test.
    double t_test;

    /// Threshold for the z-score.
    double z_score;

    /// Sets every threshold to 0.0.
    Thresholds()
        : chi_square_test(0.0)
        , log_likelihood_ratio(0.0)
        , mutual_information(0.0)
        , pearsons_coefficient(0.0)
        , t_test(0.0)
        , z_score(0.0)
    {}
};
00236
00237
00239
00242 namespace settings {
00245 extern size_t bucket_size;
00246 #ifdef _SURFACE_MODE
00247
00249 extern size_t collocation_window_size;
00250 #endif
00251
00253 extern ContextWindow context_window;
00256 extern const char *context_filter_file;
00259 extern const char *context_output_file;
00264
00265
00268 extern size_t crop_ratio;
00271 extern const char *dir;
00274 extern const char *morphologic_filter_file;
00277 extern const char *morphologic_filter_stats_file;
00280 extern const char *input_file;
00283 extern ngram_size_t n;
00286 extern const char *output_file;
00289 extern size_t precision;
00292 extern bool sort;
00295 extern const char *stats_file;
00298 extern Sieves sieves;
00301 extern Thresholds thresholds;
00305 extern bool all_thresholds_together;
00308 extern const char *morphologic_tag_mask;
00311 extern bool verbose;
00312
00315 bool check_mask(const char *mask);
00318 bool context_tracing_on(void);
00321 bool correct(void);
00324 bool ready(void);
00325
00326 }
00327
00328 }
00329
00330 #endif