00001
00011
00012 #include "config.h"
00013 #ifdef _DEPENDENCY_MODE
00014
00015 #include "dependencies.h"
00016 #endif
00017
00018 #include "notifier.h"
00019
00020 #include "parser.h"
00021
00022 #include "persistent.h"
00023
00024 #include "utils.h"
00025
00026 namespace ace {
00027
00028
00029
00030 bool Parser::next(size_t count, ContextWindow::Mode mode, words_store_t& store) {
00031
00032
00033 static int c;
00034
00035 static std::string lemma;
00036 static std::string tag;
00037 static std::string context_tag;
00038 #ifdef _DEPENDENCY_MODE
00039 static std::string occurrence;
00040 static std::string dependency;
00041
00042 static unsigned int order;
00043 static unsigned int parent;
00044 #endif
00045
00046 size_t words_recently_parsed = 0, words_totally_parsed = 0;
00047 bool incorrect_input_format_detected = false;
00048
00049
00050 _parsed_counter.clear();
00051
00052 while ( _input_file ) {
00053
00054
00055 c = _input_file.get();
00056
00057 if ( c < 0 ) {
00058
00059 break;
00060 }
00061
00062
00063 ++_line_no;
00064
00065 if ( c != constants::eol ) {
00066
00067 _input_file.unget();
00068 #ifdef _DEPENDENCY_MODE
00069
00070 _input_file >> order >> occurrence >> lemma >> tag >> parent >> dependency;
00071 #else
00072
00073 _input_file >> lemma >> tag;
00074 #endif
00075 if ( _input_file.fail() && !_input_file.eof() ) {
00076
00077 incorrect_input_format_detected = true;
00078 _input_file.clear();
00079 }
00080
00081
00082
00083
00084 _input_file.ignore(1, constants::eol);
00085 #ifdef _DEPENDENCY_MODE
00086 if ( incorrect_input_format_detected || (order != (words_recently_parsed + 1)) || (words_recently_parsed == constants::max_sentence_length) ) {
00087 #else
00088 if ( incorrect_input_format_detected || (words_recently_parsed == constants::max_sentence_length) ) {
00089 #endif
00090
00091 if ( incorrect_input_format_detected ) {
00092 notifier::error.warning("Incorrect input format detected at line " + unsigned2str(_line_no) + ". Skipping current sentence...");
00093
00094 incorrect_input_format_detected = false;
00095 } else if ( words_recently_parsed == constants::max_sentence_length ) {
00096
00097 notifier::error.warning("Word order overflow detected at line " + unsigned2str(_line_no) + ". Got word ordered as " + unsigned2str(words_recently_parsed + 1) + ", while maximum allowed value for word order is " + unsigned2str(constants::max_sentence_length) + ". Skipping current sentence...");
00098 #ifdef _DEPENDENCY_MODE
00099 } else {
00100
00101 notifier::error.warning("Word order mismatch detected at line " + unsigned2str(_line_no) + ". Got word ordered as " + unsigned2str(order) + ", while expecting " + unsigned2str(words_recently_parsed + 1) + ". Skipping current sentence...");
00102 #endif
00103 }
00104
00105
00106 for ( ; words_recently_parsed > 0; --words_recently_parsed ) {
00107 store.pop_back();
00108 }
00109
00110
00111 while ( _input_file &&(_input_file.get() != constants::eol) ) {
00112
00113 _input_file.ignore(100000, constants::eol);
00114 ++_line_no;
00115 }
00116
00117 ++_line_no;
00118 if ( _input_file ) {
00119 continue;
00120 } else {
00121
00122 break;
00123 }
00124 }
00125
00126
00127 for ( size_t i = 0; i < lemma.length(); ++i ) {
00128
00129 switch ( lemma[i] ) {
00130 case ':':
00131 case '^':
00132 case ';':
00133 case '\'':
00134 case '_':
00135 lemma.erase(i);
00136 goto jump;
00137 }
00138 }
00139 jump:
00140
00141 if ( settings::morphologic_tag_mask != 0 ) {
00142
00143 unsigned i = 0;
00144
00145 for ( ; (settings::morphologic_tag_mask[i] != 0) && (i < tag.size()); ++i ) {
00146 if ( settings::morphologic_tag_mask[i] != constants::significant_position_mark ) {
00147
00148 tag[i] = constants::insignificant_position_mark;
00149 }
00150 }
00151
00152 tag.resize(strlen(settings::morphologic_tag_mask), constants::insignificant_position_mark);
00153
00154 } else {
00155
00156 tag = std::string();
00157 }
00158 #ifdef _DEPENDENCY_MODE
00159
00160
00161 store.push_back(Word(static_cast<word_order_t>(order), static_cast<word_order_t>(parent), persistent::lemmas.store(lemma.c_str()), persistent::tags.store(tag.c_str()), string2dependency(dependency)));
00162 #else
00163
00164 store.push_back(Word(persistent::lemmas.store(lemma.c_str()), persistent::tags.store(tag.c_str())));
00165 #endif
00166
00167 ++words_recently_parsed;
00168 } else {
00169
00170 if ( words_recently_parsed ) {
00171
00172 _parsed_counter.push_back(words_recently_parsed);
00173 words_totally_parsed += words_recently_parsed;
00174
00175 words_recently_parsed = 0;
00176 }
00177 if ( (mode == ContextWindow::WORD_MODE) && (words_totally_parsed >= count) || (mode == ContextWindow::SENTENCE_MODE) && (_parsed_counter.size() >= count) ) {
00178
00179 break;
00180 }
00181 }
00182 }
00183
00184 if ( words_recently_parsed ) {
00185
00186 _parsed_counter.push_back(words_recently_parsed);
00187 }
00188
00189 return !_parsed_counter.empty();
00190
00191 }
00192
00193 }