00001 00011 // Evaluation requires some advanced math... 00012 #include <cmath> 00013 // ...and floating point operations. 00014 #include <cfloat> 00015 00016 #include "evaluation.h" 00017 // ith_bit() needed 00018 #include "utils.h" 00019 00020 namespace ace { 00021 00026 double _log10(double x) { 00027 return (std::abs(x - 0.0) < DBL_EPSILON) ? DBL_MIN : std::log10(x); 00028 } 00029 00030 /* Manning. 00031 double _L(double k, double n, double x) { 00032 return k*_log10(x) + (n - k)*_log10(1 - x); 00033 } 00034 */ 00035 00038 std::vector<std::vector<size_t> > EvaluationTables::_bits_for_cont_table; 00041 std::vector<std::vector<size_t> > EvaluationTables::_bits_for_inv_table; 00042 00043 /* Interface item implementation - for info see description in header file. 00044 */ 00045 void EvaluationTables::init() { 00046 // Requirement: NGram::init() already called. 00047 // 00048 _bits_for_cont_table.resize(NGram::types_count()); 00049 _bits_for_inv_table.resize(NGram::types_count()); 00050 // 00051 for ( ngram_type_t outer = 0; outer < NGram::types_count(); ++outer ) { 00052 // Do it in the most naive way, it won't have any significant impact 00053 // on program speed anyway... 00054 for ( ngram_type_t inner = 0; inner < NGram::types_count(); ++inner ) { 00055 // Superset exclusive: Inner non-zero bits must be exclusive superset of outer non-zero bits. 00056 if ( (inner ^ outer) && (bits_in<ngram_type_t>(inner) == bits_in<ngram_type_t>(inner | outer)) ) { 00057 // if ( ^ ) -> ther're different and if also () == () -> inner bits are superset of outer bits 00058 _bits_for_cont_table[outer].push_back(inner); 00059 } 00060 // Outer non-zero bits must meet inner zero bits (but 0 is excluded) 00061 if ( inner && !(outer & inner) ) { 00062 _bits_for_inv_table[outer].push_back(inner); 00063 } 00064 } 00065 } 00066 } 00067 00068 /* Interface item implementation - for info see description in header file. 00069 */ 00070 // Init contigency table with frequency values! 00071 EvaluationTables::EvaluationTables(const freq_table_t frequencies): _freq(frequencies), _cont(_freq), _inv(), _exp() { 00072 00073 // Now adjust values in contigency table. 00074 ngram_type_t type = NGram::full_ngram_type(); 00075 do { 00076 --type; // Values of contigency and *-frequency tables for full N-gram type are always equal. 00077 // 00078 for ( size_t i = 0; i < _bits_for_cont_table[type].size(); ++i ) { 00079 _cont[type] -= _cont[_bits_for_cont_table[type][i]]; 00080 } 00081 } while (type != 0); 00082 00083 // Init inverted frequencies table with contigency table 0-index value. 00084 _inv.resize(_cont.size(), _cont[0]); 00085 // Now adjust values in inverted frequencies table. 00086 for ( ngram_type_t type = 0; type < _freq.size(); ++type ) { 00087 for ( ngram_type_t i = 0; i < _bits_for_inv_table[type].size(); ++i ) { 00088 _inv[type] += _cont[_bits_for_inv_table[type][i]]; 00089 } 00090 } 00091 // Count expected values (for likelihood ratio and chi square test). 00092 _exp.resize(_freq.size(), 1.0); 00093 for ( ngram_type_t type = 0; type < _freq.size(); ++type ) { 00094 _exp[type] *= _freq[0]; // #N 00095 for ( ngram_type_t i = 0; i < NGram::n(); i++ ) { 00096 if ( ith_bit(type, i) ) { 00097 _exp[type] *= static_cast<double>(_freq[1 << i]) / _freq[0]; // p(x) = #x/#N 00098 } else { 00099 _exp[type] *= static_cast<double>(_inv[1 << i]) / _freq[0]; 00100 } 00101 } 00102 } 00103 } 00104 00105 namespace eval { 00106 00107 /* Interface item implementation - for info see description in header file. 00108 */ 00109 double chi_square_test(const EvaluationTables& tables) { 00110 00111 double chi_square = 0.0; 00112 00113 for ( ngram_type_t type = 0; type < tables._freq.size(); ++type ) { 00114 chi_square += std::pow(tables._cont[type] - tables._exp[type], 2) / tables._exp[type]; 00115 } 00116 00117 return chi_square; 00118 } 00119 00120 /* Interface item implementation - for info see description in header file. 00121 */ 00122 double log_likelihood_ratio(const EvaluationTables& tables) { 00123 00124 double likelihood = 0.0; 00125 00126 for ( ngram_type_t type = 0; type < tables._freq.size(); ++type ) { 00127 likelihood += tables._cont[type] * _log10(tables._cont[type]/ tables._exp[type]); 00128 } 00129 00130 return 2 * likelihood; // 00131 00132 // Based on Manning. 00133 /* 00134 double p = static_cast<double>(_tables._freq[1]) / static_cast<double>(_tables._freq[0]), 00135 p1 = static_cast<double>(_tables._freq[3]) / static_cast<double>(_tables._freq[2]), 00136 p2 = static_cast<double>(_tables._freq[1] - _tables._freq[3]) / static_cast<double>(_tables._freq[0] - _tables._freq[2]); 00137 00138 double 00139 L11 = // L( f(X,Y), f(X,*), p ) 00140 _L(_tables._freq[3], _tables._freq[2], p), 00141 L12 = // L( f(*,Y)-f(X,Y), f(*,*)-f(X,*), p ) 00142 _L(_tables._freq[1] - _tables._freq[3], _tables._freq[0] - _tables._freq[2], p), 00143 L21 = // L( f(X,Y), f(X,*), p1 ) 00144 _L(_tables._freq[3], _tables._freq[2], p1), 00145 L22 = // L( f(*,Y)-f(X,Y), f(*,*)-f(X,*), p2 ) 00146 _L(_tables._freq[1] - _tables._freq[3], _tables._freq[0] - _tables._freq[2], p2); 00147 00148 //return -2 * ((std::log10(L11) - std::log10(L21)) + (std::log10(L12) - std::log10(L22))); 00149 return -2 * (L11 - L21 + L12 - L22); 00150 */ 00151 } 00152 00153 /* Interface item implementation - for info see description in header file. 00154 */ 00155 double mutual_information(const EvaluationTables& tables) { 00156 // For given X and Y, the function returns log2( p(XY)/ p(Y)*p(Y) ), where: 00157 // p(XY) = f(X,Y) / f(*,*) 00158 // p(X) = f(X,*) / f(*,*) 00159 // p(Y) = f(*,Y) / f(*,*) 00160 00161 double pxy = static_cast<double>(tables._freq[3]) / static_cast<double>(tables._freq[0]), 00162 px = static_cast<double>(tables._freq[2]) / static_cast<double>(tables._freq[0]), 00163 py = static_cast<double>(tables._freq[1]) / static_cast<double>(tables._freq[0]); 00164 00165 // We want log2, but there is no such function in cmath library. 00166 return _log10(pxy/(px*py)) / std::log10(2.0); 00167 } 00168 00169 /* Interface item implementation - for info see description in header file. 00170 */ 00171 double pearsons_coefficient(const EvaluationTables& tables) { 00172 return (tables._cont[3]*tables._cont[0] - tables._cont[2]*tables._cont[1]) 00173 / std::sqrt(static_cast<double>( 00174 (tables._cont[3] + tables._cont[2]) 00175 * (tables._cont[3] + tables._cont[1]) 00176 * (tables._cont[0] + tables._cont[2]) 00177 * (tables._cont[0] + tables._cont[1]) 00178 )); 00179 } 00180 00181 /* Interface item implementation - for info see description in header file. 00182 */ 00183 double t_test(const EvaluationTables& tables) { 00184 // (f(X,Y) - e(X,Y)) / (f(X,Y)*(1-f(X,Y)/f(*,*))) 00185 return (tables._freq[3] - tables._exp[3]) / std::sqrt(tables._freq[3]*(1 - static_cast<double>(tables._freq[3])/static_cast<double>(tables._freq[0]))); 00186 } 00187 00188 /* Interface item implementation - for info see description in header file. 00189 */ 00190 double z_score(const EvaluationTables& tables) { 00191 // (f(X,Y) - e(X,Y)) / (e(X,Y)*(1-e(X,Y)/f(*,*))) 00192 return (tables._freq[3] - tables._exp[3]) / std::sqrt(tables._exp[3]*(1 - static_cast<double>(tables._exp[3])/static_cast<double>(tables._freq[0]))); 00193 } 00194 00195 } // namespace eval 00196 00197 } // namespace ace
1.5.6