00001
00011 #include <algorithm>
00012 #include <utility>
00013 #include <vector>
00014
00015
00016 #include "buffer.h"
00017
00018 #include "config.h"
00019
00020 #include "extractor.h"
00021
00022 #include "ngram.h"
00023
00024 #include "notifier.h"
00025 #ifdef _DEPENDENCY_MODE
00026
00027 #include "ntree.h"
00028 #endif
00029
00030 #include "parser.h"
00031
00032 #include "persistent.h"
00033
00034 #include "utils.h"
00035
00036 #include "word.h"
00037
00038 namespace ace {
00039
00040 #ifdef _DEPENDENCY_MODE
00041
00047 void _build_tree(const words_range_t& sentence, ntree_t& nodes) {
00048
00049 words_range_t::first_type word = sentence.first;
00050
00051
00052 while ( word != sentence.second ) {
00053 nodes.push_back(NTreeNode(word->order(), word->parent()));
00054 ++word;
00055 }
00056
00057
00058 for ( word_order_t i = 1; i < nodes.size(); ++i ) {
00059 nodes[nodes[i].parent()].add_child(i);
00060 }
00061 }
00062
00063
00070 words_store_t::const_iterator _extract_ngram(words_store_t::const_iterator sentence_start, const subtree_t& subtree, raw_ngram_t& raw_ngram) {
00071
00072 words_store_t::const_iterator word, head;
00073
00074
00075 for ( subtree_t::const_iterator i_subtree = subtree.begin(); i_subtree != subtree.end(); ++i_subtree ) {
00076
00077 word = sentence_start + *i_subtree - 1;
00078
00079 std::pair<subtree_t::const_iterator, subtree_t::const_iterator> i_parent = std::equal_range(subtree.begin(), subtree.end(), word->parent());
00080 if ( i_parent.first == i_parent.second ) {
00081
00082
00083 raw_ngram.push_back(raw_ngram_t::value_type(0, word));
00084
00085 head = word;
00086 } else {
00087
00088 raw_ngram.push_back(raw_ngram_t::value_type(static_cast<ngram_size_t>(i_parent.second - subtree.begin()), word));
00089 }
00090 }
00091
00092 return head;
00093 }
00094 #endif // _DEPENDENCY_MODE
00095
00103 NGram * _store_ngram(const raw_ngram_t& raw_ngram, ngram_type_t type) {
00104
00105
00106
00107 if ( persistent::morphologic_filter.is_on() && (type != 0) ) {
00108
00109 static multirule_t tags(NGram::n());
00110 static std::string empty;
00111
00112 ngram_type_t i = NGram::n() - 1;
00113
00114
00115 for ( raw_ngram_t::const_iterator i_mem = raw_ngram.begin(); i_mem != raw_ngram.end(); ++i_mem, --i) {
00116 #ifdef _DEPENDENCY_MODE
00117 tags[i] = ith_bit(type, i) ? persistent::tags.get(i_mem->second->tag()) : empty;
00118 #else
00119 tags[i] = ith_bit(type, i) ? persistent::tags.get((*i_mem)->tag()) : empty;
00120 #endif
00121 }
00122
00123 if ( !persistent::morphologic_filter.match(tags, type == NGram::full_ngram_type()) ) {
00124 return NULL;
00125 }
00126 }
00127
00128
00129 NGramStore::insert_status_t pb = persistent::ngrams.insert(NGramToken::get(raw_ngram, type));
00130
00131 if ( !pb.second ) {
00132
00133 pb.first->inc();
00134 }
00135
00136 return pb.first;
00137 }
00138
00144 void _store_wide_context(NGram *ngram, const raw_ngram_t& raw_ngram, words_range_t context_range) {
00145
00146 size_t found_so_far = 0;
00147
00148 for ( words_store_t::const_iterator w = context_range.first; w != context_range.second; ++w ) {
00149 next_loop:
00150 if ( found_so_far < raw_ngram.size() ) {
00151
00152 ngram_type_t i = NGram::n();
00153 for ( raw_ngram_t::const_iterator i_mem = raw_ngram.begin(); i_mem != raw_ngram.end(); ++i_mem ) {
00154
00155 if ( ith_bit(ngram->type(), --i) ) {
00156
00157 #ifdef _DEPENDENCY_MODE
00158
00159 if ( w == i_mem->second ) {
00160 #else
00161
00162 if ( w == *i_mem ) {
00163 #endif
00164 ++found_so_far;
00165 ++w;
00166 if ( w != context_range.second ) {
00167 goto next_loop;
00168 } else {
00169 goto end;
00170 }
00171 }
00172 }
00173
00174 }
00175 }
00176
00177 if ( persistent::context_filter.is_on()
00178 && !persistent::context_filter.match(persistent::tags.get(w->tag()), false) ) {
00179
00180 continue;
00181 }
00182
00183 ngram->add_to_context(PartOfContext(w->lemma(), w->tag()));
00184 }
00185 end:
00186 ;
00187 }
00188
00195 #ifdef _DEPENDENCY_MODE
00196 void _store_raw_ngram(const raw_ngram_t& raw_ngram, const Buffer& buffer, NamedDataFileStats& stats, words_store_t::const_iterator head)
00197 #else
00198 void _store_raw_ngram(const raw_ngram_t& raw_ngram, const Buffer& buffer, NamedDataFileStats& stats)
00199 #endif
00200 {
00201 NGram * ngram = NULL;
00202
00203 for ( ngram_type_t type = 0; type <= NGram::full_ngram_type(); ++type ) {
00204
00205
00206 ngram = _store_ngram(raw_ngram, type);
00207
00208 if ( type == NGram::full_ngram_type() ) {
00209
00210 ++stats.ngrams.extracted;
00211 if ( ngram == NULL ) {
00212 ++stats.ngrams.filtered;
00213 } else {
00214 ++stats.ngrams.passed;
00215 }
00216 }
00217
00218 if ( (ngram != NULL) && settings::context_tracing_on() ) {
00219
00220 if ( (type != 0) ) {
00221
00222 if ( settings::context_window.mode == ContextWindow::SENTENCE_MODE ) {
00223 _store_wide_context(ngram, raw_ngram, buffer.context_range());
00224 } else {
00225 #ifdef _DEPENDENCY_MODE
00226 _store_wide_context(ngram, raw_ngram, buffer.context_range(head));
00227 #else
00228 _store_wide_context(ngram, raw_ngram, buffer.context_range(raw_ngram.front(), raw_ngram.back()));
00229 #endif
00230 }
00231 }
00232
00233 if ( type == NGram::full_ngram_type() ) {
00234
00235
00236 #ifdef _DEPENDENCY_MODE
00237 words_store_t::const_iterator front = raw_ngram.front().second, back = raw_ngram.back().second;
00238 #else
00239 words_store_t::const_iterator front = raw_ngram.front(), back = raw_ngram.back();
00240 #endif
00241 if ( front != buffer.current().first ) {
00242 ngram->add_to_context(PartOfContext((front - 1)->lemma(), (front - 1)->tag(), true));
00243 }
00244 if ( (back + 1) != buffer.current().second ) {
00245 ngram->add_to_context(PartOfContext((back + 1)->lemma(), (back + 1)->tag(), true));
00246 }
00247 }
00248 }
00249 }
00250 }
00251
00252
00253
00254 size_t extract(std::ifstream& input_file, NamedDataFileStats& stats) {
00255
00256
00257 input_file.seekg(0, std::ios::end);
00258 std::streampos pos = input_file.tellg();
00259 input_file.seekg(0, std::ios::beg);
00260
00261 size_t file_size = (pos < 0) ? static_cast<std::streampos>(0) : pos;
00262
00263 #ifdef _BOOST_ON
00265
00266 bool progress_bar_on = true;
00267
00268
00269 progress_display_t display(static_cast<unsigned long>(file_size), notifier::progress.output_stream(), "");
00270
00271 if ( pos < 0 ) {
00272 notifier::error.warning("Progress bar initialization failed! Going on without progress bar display...");
00273 progress_bar_on = false;
00274 }
00275
00276 size_t processed_so_far = 0;
00277 #endif
00278
00280
00281
00282 raw_ngram_t raw_ngram;
00283
00284 #ifdef _DEPENDENCY_MODE
00285
00286 ntree_t nodes;
00287 NTreeNode abstract_root_node = NTreeNode();
00288 nodes.push_back(abstract_root_node);
00289
00290
00291 mapped_subtrees_t subtrees;
00292 #else
00293 static bools_table_t decomps = bools_table(settings::n - 1, settings::collocation_window_size);
00294
00295 words_range_t sentence;
00296 words_store_t::const_iterator word, current_word;
00297
00298 raw_ngram.resize(settings::n);
00299 #endif
00300
00302
00303
00304 Parser parser(input_file);
00305 Buffer buffer(parser, settings::context_window);
00306
00308
00309 if ( buffer.empty() ) {
00310
00311 return 0;
00312 }
00313
00314 do {
00315 #ifdef _DEPENDENCY_MODE
00316
00317 _build_tree(buffer.current(), nodes);
00318
00319
00320 extract_subtrees(nodes, NGram::n(), subtrees);
00321
00322
00323 for ( mapped_subtrees_t::const_iterator i_subtree = subtrees.begin(); i_subtree != subtrees.end(); ++i_subtree ) {
00324
00325 words_store_t::const_iterator head = _extract_ngram(buffer.current().first, i_subtree->second, raw_ngram);
00326
00327 _store_raw_ngram(raw_ngram, buffer, stats, head);
00328
00329 raw_ngram.clear();
00330 }
00331 #else
00332
00333 sentence = buffer.current();
00334 if ( (sentence.second - sentence.first) < settings::n ) {
00335
00336 continue;
00337 }
00338 current_word = sentence.first;
00339 do {
00340 size_t rn = 0;
00341 raw_ngram[rn] = current_word;
00342
00343
00344 for ( size_t i = 0; i < decomps.size(); ++i ) {
00345
00346 rn = 0;
00347
00348 size_t stop_at = std::min(decomps[i].size(), static_cast<size_t>((sentence.second - current_word) - 1));
00349
00350 for ( size_t j = 0; j < stop_at; ++j ) {
00351
00352
00353 if ( decomps[i][j] ) {
00354
00355 raw_ngram[++rn] = current_word + j + 1;
00356 }
00357 }
00358
00359 if ( ++rn == settings::n ) {
00360
00361 _store_raw_ngram(raw_ngram, buffer, stats);
00362 }
00363 }
00364
00365 ++current_word;
00366
00367
00368 } while ( current_word <= (sentence.second - settings::n) );
00369
00370 #endif
00371 #ifdef _BOOST_ON
00372
00373 if ( progress_bar_on ) {
00374 if ( parser.eof() ) {
00375
00376 display += display.expected_count() - display.count();
00377
00378 break;
00379 }
00380 pos = input_file.tellg();
00381 if ( (pos < 0) ) {
00382
00383 notifier::progress.info("\n");
00384 notifier::error.warning("Progress bar update failed! Stopping progress bar display...");
00385 progress_bar_on = false;
00386 } else {
00387
00388 display += static_cast<unsigned long>(static_cast<size_t>(pos) - processed_so_far);
00389 processed_so_far = static_cast<size_t>(pos);
00390 }
00391 }
00392 #endif
00393 #ifdef _DEPENDENCY_MODE
00394
00395
00396 nodes.resize(1);
00397 nodes[0] = abstract_root_node;
00398
00399 subtrees.clear();
00400 #endif
00401
00402 } while ( buffer.next() );
00403
00404
00405 stats += buffer.stats();
00406
00407 return file_size;
00408
00409 }
00410
00411 }