00001 00011 #include <fstream> 00012 00013 // Settings 00014 #include "config.h" 00015 // Static members initialization 00016 #include "evaluation.h" 00017 // Self header file 00018 #include "init.h" 00019 // Static members initialization 00020 #include "ngram.h" 00021 // Hash function tables check. 00022 #include "ngrams.h" 00023 // Errors, warnings 00024 #include "notifier.h" 00025 // Global vars "set up" 00026 #include "persistent.h" 00027 00028 namespace ace { 00029 00030 /* Interface item implementation - for info see description in header file. 00031 */ 00032 bool read_and_check_input_datafiles(filenames_t& files_to_process, size_t& files_total_size) { 00033 00034 std::ifstream file_list(settings::input_file); 00035 00036 if ( !file_list ) { 00037 notifier::error.fatal_error("Cannot open the file: " + std::string(settings::input_file) + "!"); 00038 return false; 00039 } 00040 00041 bool status = true; 00042 00043 std::ifstream tmp; 00044 00045 std::string dir = (settings::dir == 0) ? std::string() : std::string(settings::dir); 00046 00047 while ( file_list ) { 00048 std::string str; 00049 std::getline(file_list, str); 00050 // Ignore empty lines. 00051 if ( !str.empty() ) { 00052 // Check, if file can be opened for reading. 00053 tmp.open((dir + str).c_str()); 00054 if ( !tmp ) { 00055 // What a pity... 00056 notifier::error.fatal_error("Input datafile : " + dir + str + " cannot be opened for reading!"); 00057 status = false; 00058 } else { 00059 // Count file length. 00060 tmp.seekg(0, std::ios::end); 00061 std::streampos pos = tmp.tellg(); 00062 tmp.seekg(0, std::ios::beg); 00063 if ( pos > 0 ) { // tellg() returns -1 in case of failure. 00064 files_total_size += pos; 00065 } 00066 // Close and store. 00067 tmp.close(); 00068 files_to_process.push_back(dir + str); 00069 } 00070 } 00071 } 00072 00073 file_list.close(); 00074 00075 if ( files_to_process.empty() ) { 00076 notifier::error.fatal_error("Input data files list is empty!"); 00077 return false; 00078 } 00079 00080 // Otherwise. 00081 return status; 00082 } 00083 00084 /* Interface item implementation - for info see description in header file. 00085 */ 00086 bool startup_init(void) { 00087 00088 NGram::init(); 00089 00090 NGramToken::init(); 00091 00092 if ( !NGramHashCounter::check(settings::n) ) { 00093 // TODO: This should be compile time error. 00094 notifier::error.fatal_error("Predefined # counter tables are to small!"); 00095 } 00096 00097 EvaluationTables::init(); 00098 00099 persistent::ngrams.init(constants::ngram_within_block); 00100 00101 // TODO: Literals -m! 00102 00103 // Read morphological filter rules? 00104 if ( settings::morphologic_filter_file != 0 ) { 00105 // Morphological filter is set on. 00106 if ( settings::morphologic_tag_mask == 0 ) { 00107 // No tag info to use with filter! 00108 notifier::error.warning("Extraction of morphologic information is not set on (parameter -m not specified). Morphologic filter will not be applied..."); 00109 } 00110 // Otherwise read the rules. 00111 else if ( !persistent::morphologic_filter.read_rules(settings::morphologic_filter_file, settings::n) ) { 00112 // Error, rules cannot be read. 00113 notifier::error.fatal_error("Cannot read rules from file: " + std::string(settings::morphologic_filter_file) + "!"); 00114 return false; 00115 } 00116 } 00117 00118 if ( settings::morphologic_filter_stats_file != 0 ) { 00119 // 00120 if ( settings::morphologic_filter_file == 0 ) { 00121 // Sorry man, but stats just cannot be sucked out from the toe...:) 00122 notifier::error.warning("Morphologic filter is not turn on (parameter -f not specified). Morphologic filter stats cannot be retrieved..."); 00123 } 00124 } 00125 00126 // Read context filter rules? 00127 if ( settings::context_filter_file != 0 ) { 00128 00129 if ( settings::context_output_file == 0 ) { 00130 // No context output file specified! 00131 notifier::error.warning("Context output file is not specified (parameter -c). Morphologic filter for N-gram context will not be applied..."); 00132 } else if ( settings::morphologic_tag_mask == 0 ) { 00133 // No tag info to use with filter! 00134 notifier::error.warning("Extraction of morphologic information is not set on (parameter -m is not specified). Morphologic filter for N-gram context will not be applied..."); 00135 } 00136 // Otherwise read the rules. 00137 else if ( !persistent::context_filter.read_rules(settings::context_filter_file) ) { 00138 // Error, rules cannot be read. 00139 notifier::error.fatal_error("Cannot read rules from file: " + std::string(settings::context_filter_file) + "!"); 00140 return false; 00141 } 00142 } 00143 00144 // Some statistics can be obtained only for bigrams... 00145 if ( settings::n != 2 ) { 00146 if ( settings::thresholds.mutual_information > 0.0 ) { 00147 notifier::error.warning("Mutual information evaluation only available for N equal 2. Threshold ignored..."); 00148 } 00149 if ( settings::thresholds.pearsons_coefficient > 0.0 ) { 00150 notifier::error.warning("Pearson's coefficient evaluation only available for N equal 2. Threshold ignored..."); 00151 } 00152 if ( settings::thresholds.t_test > 0.0 ) { 00153 notifier::error.warning("t test evaluation only available for N equal 2. Threshold ignored..."); 00154 } 00155 if ( settings::thresholds.z_score > 0.0 ) { 00156 notifier::error.warning("z score evaluation only available for N equal 2. Threshold ignored..."); 00157 } 00158 } 00159 00160 return true; 00161 } 00162 00163 } // namespace ace
1.5.6