00001
00011 #include <algorithm>
00012 #include <iomanip>
00013
00014
00015 #include "config.h"
00016 #ifdef _DEPENDENCY_MODE
00017
00018 #include "dependencies.h"
00019 #endif
00020
00021 #include "evaluation.h"
00022
00023 #include "filter.h"
00024
00025 #include "ngrams.h"
00026
00027 #include "notifier.h"
00028
00029 #include "persistent.h"
00030
00031 #include "utils.h"
00032
00033 #include "writer.h"
00034
00035 namespace ace {
00036
00039 typedef std::vector<double> scores_t;
00040
00043 const char fields_separator = '\t';
00044
00045
00046 using constants::eol;
00047
00053 void _print_member(std::ostream& output, ngram_size_t index, const NGram::Member * member) {
00054 output
00055 << static_cast<unsigned>(index)
00056 << fields_separator
00057 << persistent::lemmas.get(member->lemma())
00058 << fields_separator
00059 << persistent::tags.get(member->tag())
00060 #ifdef _DEPENDENCY_MODE
00061 << fields_separator
00062 << static_cast<unsigned>(member->parent())
00063 << fields_separator
00064 << dependency2cstring(member->dependency())
00065 #endif
00066 << eol;
00067 }
00068
00073 void _print_members(std::ostream& output, const NGram *ngram) {
00074 ngram_size_t index = 1;
00075 for ( const NGram::Member * member = ngram->begin(); member != ngram->end(); ++member, ++index ) {
00076 _print_member(output, index, member);
00077 }
00078 }
00079
00086 void _print_n_members(std::ostream& output, const NGram *ngram) {
00087 ngram_size_t index = 1;
00088
00089 const NGram::Member *member = ngram->begin();
00090
00091 for ( ngram_type_t i = NGram::n(); i > 0; ++index ) {
00092 if ( ith_bit(ngram->type(), --i) ) {
00093 _print_member(output, index, member);
00094 ++member;
00095 } else {
00096
00097 output << static_cast<unsigned>(index) << fields_separator << "*" << eol;
00098 }
00099 }
00100 }
00101
00107 void _print_tables(std::ostream& output, const EvaluationTables& tables) {
00108 #ifdef _DEBUG
00109
00110 static const size_t width = 18;
00111 output << " " << std::right
00112 << std::setw(width) << "Frequency:"
00113 << std::setw(width) << "Contingent freq.:"
00114 << std::setw(width) << "Inverted freq.:"
00115 << std::setw(width) << "Expected c. fr.:"
00116 << eol;
00117
00118 for ( size_t i = 0; i < tables.size(); ++i ) {
00119 output << std::setw(3) << i << ":"
00120 << std::setw(width) << tables.frequency()[i]
00121 << std::setw(width) << tables.contigency()[i]
00122 << std::setw(width) << tables.inverted_frequency()[i]
00123 << std::setw(width) << tables.expected_frequency()[i]
00124 << eol;
00125 }
00126 #else
00127
00128 for ( size_t i = 0; i < tables.contigency().size(); ++i ) {
00129 if ( i != 0 ) {
00130 output << fields_separator;
00131 }
00132 output << tables.contigency()[i];
00133 }
00134 output << eol;
00135
00136 #endif
00137 }
00138
00145 void _print_stats(std::ostream& output, const scores_t& scores) {
00146 #ifdef _DEBUG
00147
00148 static const size_t width = 38;
00149 output << std::right
00150 << std::setw(width) << "Pearson's chi square test:"
00151 << std::setw(width) << "Log likelihood ratio:"
00152 << eol
00153 << std::setw(width) << scores[0]
00154 << std::setw(width) << scores[1]
00155 << eol;
00156
00157 if ( scores.size() > 2 ) {
00158 output
00159 << std::setw(width) << "Pointwise mutual information:"
00160 << std::setw(width) << "Pearson's coefficient:"
00161 << eol
00162 << std::setw(width) << scores[2]
00163 << std::setw(width) << scores[3]
00164 << eol
00165 << std::setw(width) << "Student's t-test:"
00166 << std::setw(width) << "Z score:"
00167 << eol
00168 << std::setw(width) << scores[4]
00169 << std::setw(width) << scores[5]
00170 << eol;
00171 }
00172 #else
00173 output << scores[0] << fields_separator << scores[1];
00174
00175 if ( scores.size() > 2 ) {
00176 output
00177 << fields_separator << scores[2] << fields_separator << scores[3]
00178 << fields_separator << scores[4] << fields_separator << scores[5]
00179 ;
00180 }
00181
00182 output << eol;
00183 #endif
00184 }
00185
00187
00188
00189
00190 void write(std::ofstream& output_file) {
00191
00192
00193 scores_t scores(2);
00194 scores_t thresholds;
00195 std::vector<double (*)(const EvaluationTables&)> evaluators;
00196
00197 thresholds.push_back(settings::thresholds.chi_square_test);
00198 thresholds.push_back(settings::thresholds.log_likelihood_ratio);
00199
00200 evaluators.push_back(&eval::chi_square_test);
00201 evaluators.push_back(&eval::log_likelihood_ratio);
00202
00203
00204 if ( settings::n == 2 ) {
00205
00206 scores.resize(6);
00207
00208 thresholds.push_back(settings::thresholds.mutual_information);
00209 thresholds.push_back(settings::thresholds.pearsons_coefficient);
00210 thresholds.push_back(settings::thresholds.t_test);
00211 thresholds.push_back(settings::thresholds.z_score);
00212
00213 evaluators.push_back(&eval::mutual_information);
00214 evaluators.push_back(&eval::pearsons_coefficient);
00215 evaluators.push_back(&eval::t_test);
00216 evaluators.push_back(&eval::z_score);
00217 }
00218
00219 #ifdef _BOOST_ON
00220
00221 progress_display_t display(static_cast<unsigned long>(persistent::ngrams.size()), notifier::progress.output_stream(), "");
00222 #endif
00223
00224
00225 output_file << std::setprecision(static_cast<std::streamsize>(settings::precision));
00226
00227
00228 NGramsProcessor ngrams_processor(persistent::ngrams);
00229
00230 while ( ngrams_processor.next() ) {
00231
00232 #ifdef _BOOST_ON
00233
00234 ++display;
00235 #endif
00236 const NGramFrequencies& ngram_freq = ngrams_processor.ngram_frequencies();
00237
00238
00239 if ( settings::sieves.frequency > ngram_freq.frequencies[NGram::full_ngram_type()]) {
00240
00241 continue;
00242 }
00243
00244
00245 EvaluationTables eval_tables(ngram_freq.frequencies);
00246
00247
00248 if ( settings::sieves.expected_frequency > eval_tables.expected_frequency()[NGram::full_ngram_type()] ) {
00249
00250 continue;
00251 }
00252
00253
00254 std::fill(scores.begin(), scores.end(), 0.0);
00255
00256
00257
00258
00259
00260
00261
00262
00263 bool passed = true;
00264 for ( size_t i = 0; i < evaluators.size(); ++i ) {
00265
00266 if ( thresholds[i] > 0.0 ) {
00267
00268 scores[i] = evaluators[i](eval_tables);
00269 if ( thresholds[i] > scores[i] ) {
00270 passed = false;
00271 if ( settings::all_thresholds_together ) {
00272
00273 break;
00274 }
00275 } else {
00276 passed = true;
00277 if ( !settings::all_thresholds_together ) {
00278
00279 break;
00280 }
00281 }
00282 }
00283 }
00284
00285 if ( !passed ) {
00286 continue;
00287 }
00288
00289
00290 for ( size_t i = 0; i < scores.size(); ++i ) {
00291 if ( !(scores[i] > 0) ) {
00292 scores[i] = evaluators[i](eval_tables);
00293 }
00294 }
00295
00296
00297 _print_members(output_file, ngram_freq.ngram);
00298
00299 _print_tables(output_file, eval_tables);
00300
00301 _print_stats(output_file, scores);
00302
00303 output_file << eol;
00304 }
00305
00306
00307 output_file << std::endl;
00308 }
00309
00310 void _print_context(std::ofstream& output_file, const context_t * ctx, bool narrow = false) {
00311 for ( context_t::const_iterator i_ctx = ctx->begin(); i_ctx != ctx->end(); ++i_ctx ) {
00312
00313 if ( (narrow && i_ctx->is_part_of_narrow()) || (!narrow && i_ctx->is_part_of_wide()) ) {
00314
00315 output_file
00316 << persistent::lemmas.get(i_ctx->lemma()) << fields_separator
00317 << persistent::tags.get(i_ctx->tag()) << fields_separator
00318 << i_ctx->frequency() << eol;
00319 }
00320 }
00321 output_file << eol;
00322 }
00323
00324
00325
00326 void write_contexts(std::ofstream& output_file) {
00327
00328 size_t ngrams_to_process = 0;
00329 for ( ngram_type_t type = NGram::full_ngram_type(); type > 0; --type ) {
00330 ngrams_to_process += persistent::ngrams.size(type);
00331 }
00332
00333 #ifdef _BOOST_ON
00334
00335 progress_display_t display(static_cast<unsigned long>(ngrams_to_process), notifier::progress.output_stream(), "");
00336 #endif
00337
00338
00339
00340 for ( ngram_type_t type = NGram::full_ngram_type(); type > 0; --type ) {
00341
00342 for ( NGramStore::hash_table_t::const_iterator i_ngram = persistent::ngrams.begin(type); i_ngram != persistent::ngrams.end(type); ++i_ngram ) {
00343
00344 _print_n_members(output_file, *i_ngram);
00345
00346 output_file << eol;
00347 if ( (*i_ngram)->type() == NGram::full_ngram_type() ) {
00348
00349 _print_context(output_file, (*i_ngram)->context(), true);
00350 }
00351
00352 _print_context(output_file, (*i_ngram)->context(), false);
00353
00354 output_file << eol;
00355
00356 #ifdef _BOOST_ON
00357
00358 ++display;
00359 #endif
00360 }
00361 }
00362
00363 output_file << std::endl;
00364 }
00365
00366
00367
00368
00369 void write_stats(std::ofstream& output_file) {
00370
00371 DataFileStats overall;
00372
00373 for ( stats_t::const_iterator i_stats = persistent::overall_stats.begin(); i_stats != persistent::overall_stats.end(); ++i_stats) {
00374
00375 output_file
00376 << "Statistics for datafile " << i_stats->filename << ":" << eol
00377 << "# of words: " << i_stats->words << eol
00378 << "# of sentences: " << i_stats->sentences << eol
00379 << "# of " << static_cast<unsigned>(settings::n) << "-grams extracted: " << i_stats->ngrams.extracted << eol
00380 ;
00381 if ( persistent::morphologic_filter.is_on() ) {
00382 output_file
00383 << "# of " << static_cast<unsigned>(settings::n) << "-grams passed: " << i_stats->ngrams.passed << eol
00384 << "# of " << static_cast<unsigned>(settings::n) << "-grams filtered: " << i_stats->ngrams.filtered << eol
00385 ;
00386 }
00387 output_file << eol;
00388
00389
00390
00391 overall += *i_stats;
00392 }
00393
00394 output_file
00395 << "Overall datafiles statistics:" << eol
00396 << "# of words: " << overall.words << eol
00397 << "# of sentences: " << overall.sentences << eol
00398 << "# of " << static_cast<unsigned>(settings::n) << "-grams extracted: " << overall.ngrams.extracted << eol
00399 ;
00400 if ( persistent::morphologic_filter.is_on() ) {
00401 output_file
00402 << "# of " << static_cast<unsigned>(settings::n) << "-grams passed: " << overall.ngrams.passed << eol
00403 << "# of " << static_cast<unsigned>(settings::n) << "-grams filtered: " << overall.ngrams.filtered << eol
00404 ;
00405 }
00406 output_file << eol;
00407
00408 output_file
00409 << "Strings storage statistics:" << eol
00410 << "# of unique lemmas: " << persistent::lemmas.size() << eol
00411 << "# of unique tags: " << persistent::tags.size() << eol
00412 << eol
00413 << static_cast<unsigned>(settings::n) << "-grams storage statistics:" << eol;
00414 for ( ngram_type_t type = 0; type <= NGram::full_ngram_type(); ++type ) {
00415 output_file << "# of unique " << static_cast<unsigned>(settings::n) << "-grams of type " << static_cast<unsigned>(type) << ": " << persistent::ngrams.size(type) << eol;
00416 }
00417
00418
00419 output_file << std::endl;
00420
00421 }
00422
00423
00424
00425 void write_morphologic_filter_stats(std::ofstream& output_file) {
00426
00427 const MultiRulesFilter::rules_t& rules = persistent::morphologic_filter.get_rules();
00428
00429 for ( morphologic_filter_file_stats_t::const_iterator i_stats = persistent::morphologic_filter_file_stats.begin(); i_stats != persistent::morphologic_filter_file_stats.end(); ++i_stats) {
00430
00431 output_file << "Passed " << static_cast<unsigned>(settings::n) << "-grams statistics for datafile " << i_stats->first << ":" << eol;
00432
00433 const Filter::Stats& stats = i_stats->second;
00434
00435 for ( size_t i = 0; i < rules.size(); ++i ) {
00436
00437 for ( size_t j = 0; j < rules[i].size(); ++j ) {
00438 output_file << ' ' << rules[i][j];
00439 }
00440
00441 output_file << fields_separator << stats.matched[i] << eol;
00442 }
00443
00444 output_file << eol;
00445
00446 }
00447
00448 output_file << "Overall passed " << static_cast<unsigned>(settings::n) << "-grams statistics:" << eol;
00449 for ( size_t i = 0; i < rules.size(); ++i ) {
00450
00451 for ( size_t j = 0; j < rules[i].size(); ++j ) {
00452 output_file << ' ' << rules[i][j];
00453 }
00454
00455 output_file << fields_separator << persistent::morphologic_filter.stats_so_far().matched[i] << eol;
00456 }
00457
00458
00459 output_file << std::endl;
00460
00461 }
00462
00463 }