00001 00011 #include <fstream> 00012 #include <string> 00013 00014 // Extractor interface 00015 #include "extractor.h" 00016 // Notification 00017 #include "notifier.h" 00018 // Overall stats storage 00019 #include "persistent.h" 00020 // Slef header file 00021 #include "processor.h" 00022 // Stats for files 00023 #include "stats.h" 00024 00025 namespace ace { 00026 00027 unsigned processing_progress_precision = 4; 00028 00029 /* Interface item implementation - for info see description in header file. 00030 */ 00031 void process(filenames_t& files_to_process, size_t files_total_size) { 00032 00033 // 00034 std::ifstream input_file; 00035 00036 // 00037 size_t processed_so_far = 0, file_size = 0; 00038 00039 // 00040 notifier::progress.info("Processing input datafiles."); 00041 00042 for ( filenames_t::const_iterator i_file = files_to_process.begin(); i_file != files_to_process.end(); ++i_file ) { 00043 00045 00046 input_file.open(i_file->c_str()); 00047 00048 if ( !input_file ) { 00049 notifier::error.warning("Input datafile: " + *i_file + " cannot be opened for reading! Skipping..."); 00050 continue; 00051 } 00052 00054 00055 // Stats for current file. 00056 NamedDataFileStats file_stats(*i_file); 00057 00058 // Show time elapsed. 00059 notifier::progress.info("Processing datafile: " + *i_file); 00060 00061 // Error message. If empty -> no error. 00062 std::string error_message; 00063 00064 try { 00065 // Extract. 00066 file_size = extract(input_file, file_stats); 00067 } catch ( std::bad_alloc& ) { 00068 error_message = "Program has met with out-of-memory condition while processing datafile: " + *i_file + "!"; 00069 } catch ( index_overflow& ) { 00070 if ( persistent::lemmas.is_full() ) { 00071 error_message = std::string("Number of unique strings in lemmas store has overflowen numeric index type range!"); 00072 } else if ( persistent::tags.is_full() ) { 00073 error_message = std::string("Number of unique strings in morphologic tags store has overflowen numeric index type range!"); 00074 } else { 00075 error_message = std::string("Undetermined index overflow exception!"); 00076 } 00077 } 00078 00079 // Close the file! 00080 input_file.close(); 00081 00082 if ( !error_message.empty() ) { 00083 // What a pity. 00084 throw fatal_error(error_message); 00085 } 00086 00087 // Store the stats. 00088 persistent::overall_stats.push_back(file_stats); 00089 00090 if ( persistent::morphologic_filter.is_on() ) { 00091 // Store the morphologic filter stats... 00092 persistent::morphologic_filter_file_stats.insert(morphologic_filter_file_stats_t::value_type(*i_file, persistent::morphologic_filter.recent_stats())); 00093 // ...and reset (make clear for next file). 00094 persistent::morphologic_filter.reset_recent_stats(); 00095 } 00096 00097 // Show time elapsed. 00098 notifier::progress.elapsed("Finished processing datafile: " + *i_file + " (" + unsigned2str(file_size) + " bytes)"); 00099 00100 if ( files_total_size ) { 00101 // Show file processing progress. 00102 processed_so_far += file_size; 00103 notifier::progress.info("Overall processing progress: " + double2str(static_cast<double>(processed_so_far)/static_cast<double>(files_total_size) * 100, processing_progress_precision) + "%"); 00104 } 00105 00106 } 00107 00108 // Elapsed: 00109 notifier::progress.elapsed("Finished processing input datafiles."); 00110 00111 #ifndef _USE_HASHSET 00112 if ( settings::sort ) { 00113 notifier::progress.info("Sorting N-grams by their frequency."); 00114 // Sort all hash tables except the one for full N-gram type (each full N-gram is accessed only one). 00115 for ( ngram_type_t type = 1; type < NGram::full_ngram_type(); ++type ) { 00116 ace::persistent::ngrams.sort(type); 00117 } 00118 notifier::progress.elapsed("Finished sorting N-grams by their frequency."); 00119 } 00120 #endif 00121 00122 } 00123 00124 } // namespace ace
1.5.6