00001 00011 #include <limits> 00012 00013 #include "config.h" 00014 00015 namespace ace { 00016 00017 /* Constants. 00018 */ 00019 namespace constants { 00020 00021 // Reserve most significant bit for counter overflow flag (see ngram.h). 00022 const size_t max_grammity = std::numeric_limits<ngram_type_t>::digits - 1; 00023 00024 const size_t bits_per_char = std::numeric_limits<unsigned char>::digits; 00025 00026 const size_t ngram_within_block = 16*1024; 00027 00028 const size_t default_precision = 10; 00029 00030 const size_t max_sentence_length = std::numeric_limits<word_order_t>::max(); 00031 00032 const size_t max_collocation_window_size = std::numeric_limits<size_t>::digits; 00033 00034 const char significant_position_mark = '*'; 00035 00036 const char insignificant_position_mark = '-'; 00037 00038 const size_t sentences_radius_max_size = 2; 00039 00040 const size_t words_radius_max_size = 50; 00041 00042 } 00043 00044 00045 /* TaggedLemma static members definition. 00046 */ 00047 const size_t TaggedLemma::_last_bit_index = sizeof(string_index_t) / sizeof(tag_index_t) - 1; 00048 00049 const string_index_t TaggedLemma::_lemma_bits_mask = std::numeric_limits<string_index_t>::max() >> (sizeof(tag_index_t) * constants::bits_per_char); 00050 00051 00052 /* Settings (default values). 
00053 */ 00054 namespace settings { 00055 00056 bool all_thresholds_together = false; 00057 00058 size_t bucket_size = 4; // (real value is 2 ^ bucket_size) 00059 00060 #ifdef _SURFACE_MODE 00061 size_t collocation_window_size = 0; 00062 #endif 00063 00064 ContextWindow context_window; 00065 00066 const char *context_filter_file = 0; 00067 00068 const char *context_output_file = 0; 00069 00070 //const char *context_tag_mask = 0; 00071 00072 size_t crop_ratio = 2; 00073 00074 const char *dir = 0; 00075 00076 const char *morphologic_filter_file = 0; 00077 00078 const char *morphologic_filter_stats_file = 0; 00079 00080 const char *input_file = 0; 00081 00082 ngram_size_t n = 0; 00083 00084 const char *output_file = 0; 00085 00086 size_t precision = constants::default_precision; 00087 00088 bool sort = false; 00089 00090 const char *stats_file = 0; 00091 00092 Sieves sieves; 00093 00094 Thresholds thresholds; 00095 00096 const char *morphologic_tag_mask = 0; 00097 00098 bool verbose = false; 00099 00100 00106 bool _mask_less_than(const char *lhs, const char *rhs) { 00107 if ( lhs == 0 ) { 00108 return true; 00109 } 00110 if ( rhs == 0 ) { 00111 // Right hand arg. can be zero, but not when the left one isn't. 00112 return false; 00113 } 00114 while ( (*lhs) && (*rhs) ) { 00115 if ( (*lhs == constants::significant_position_mark) && (*rhs == constants::insignificant_position_mark) ) { 00116 return false; 00117 } 00118 ++lhs, ++rhs; 00119 } 00120 // Left hand argument must be at his end, otherwise it's not lesser. 00121 return (*lhs == constants::null_char); 00122 } 00123 00124 /* Interface item implementation. 00125 */ 00126 bool check_mask(const char *mask) { 00127 if ( mask == 0 ) { 00128 // Empty mask is allowed. 00129 return true; 00130 } 00131 const char *pc = mask; 00132 while ( *pc ) { 00133 if ( (*pc != constants::significant_position_mark) && (*pc != constants::insignificant_position_mark) ) { 00134 // Only '*' and '-' are allowed... 
00135 return false; 00136 } 00137 ++pc; 00138 } 00139 return true; 00140 } 00141 00142 /* Interface item implementation. 00143 */ 00144 bool context_tracing_on(void) { return context_output_file != 0; } 00145 00146 /* Interface item implementation. 00147 */ 00148 bool correct(void) { 00149 if ( (input_file == 0) || (output_file == 0) || (n <= 1) || (constants::max_grammity < n) ) { 00150 return false; 00151 } 00152 if ( !check_mask(morphologic_tag_mask) ) { 00153 return false; 00154 } 00155 if ( precision == 0 ) { 00156 return false; 00157 } 00158 if ( context_tracing_on() && !context_window.check() ) { 00159 //|| !check_mask(context_tag_mask) || !_mask_less_than(context_tag_mask, morphologic_tag_mask)) ) { 00160 return false; 00161 } 00162 #ifdef _SURFACE_MODE 00163 if ( (collocation_window_size + 1) < n ) { 00164 // To small. 00165 // TODO - move to init? Report? 00166 collocation_window_size = n - 1; 00167 } else if ( collocation_window_size > constants::max_collocation_window_size ) { 00168 return false; 00169 } 00170 00171 #endif 00172 return true; 00173 } 00174 00175 /* Interface item implementation. 00176 */ 00177 bool ready(void) { 00178 // Must be non-defaults. 00179 return (input_file != 0) && (n != 0) && (output_file != 0); 00180 } 00181 00182 } // namespace settings 00183 00184 } // namespace ace
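// 1.5.6 -- NOTE(review): stray trailer, presumably the version stamp of the tool that generated this listing; not part of the program.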