Cleaned up the intelligent lookup algorithm a bit
This commit is contained in:
parent
4858725377
commit
94c30dcf7f
1 changed files with 69 additions and 84 deletions
153
src/lookup.cc
153
src/lookup.cc
|
|
@ -48,9 +48,8 @@ namespace {
|
|||
}
|
||||
};
|
||||
|
||||
typedef std::map<account_t *, int> account_use_map;
|
||||
typedef std::pair<account_t *, int> account_use_pair;
|
||||
typedef std::vector<account_use_pair> account_use_vector;
|
||||
typedef std::map<account_t *, int> account_use_map;
|
||||
typedef std::pair<account_t *, int> account_use_pair;
|
||||
|
||||
struct usage_sorter {
|
||||
bool operator()(const account_use_pair& left,
|
||||
|
|
@ -90,7 +89,7 @@ lookup_probable_account(const string& ident,
|
|||
if ((CURRENT_DATE() - xact->date()).days() > 700)
|
||||
continue;
|
||||
|
||||
// An exact match is worth a score of 100
|
||||
// An exact match is worth a score of 100 and terminates the search
|
||||
if (ident == xact->payee) {
|
||||
DEBUG("lookup", " we have an exact match, score = 100");
|
||||
scores.push_back(score_entry_t(xact, 100));
|
||||
|
|
@ -117,51 +116,57 @@ lookup_probable_account(const string& ident,
|
|||
|
||||
// Walk each letter in the source identifier
|
||||
foreach (const uint32_t& ch, lowered_ident.utf32chars) {
|
||||
int addend = 0;
|
||||
int addend = 0;
|
||||
bool added_bonus = false;
|
||||
std::size_t value_len = value_key.length();
|
||||
|
||||
pos = value_key.find(ch);
|
||||
|
||||
// If it occurs in the same order as the source identifier -- that is,
|
||||
// without intervening letters to break the pattern -- it's worth 10
|
||||
// points. Plus, an extra point is added for every letter in chains of
|
||||
// 3 or more.
|
||||
// Ensure that a letter which has been matched is not matched twice, so
|
||||
// that the two x's of Exxon don't both match to the single x in Oxford.
|
||||
// This part of the loop is very expensive, but avoids a lot of bogus
|
||||
// matches.
|
||||
|
||||
bool added_bonus = false;
|
||||
if (pos == last_match_pos + 1) {
|
||||
DEBUG("lookup",
|
||||
" char " << index << " in-sequence match with bonus " << bonus);
|
||||
addend += 10;
|
||||
if (bonus > 2)
|
||||
addend += bonus - 2;
|
||||
bonus++;
|
||||
added_bonus = true;
|
||||
char_positions_map::iterator pi = positions.find(ch);
|
||||
while (pi != positions.end() &&
|
||||
pos != unistring::npos && pos <= (*pi).second &&
|
||||
(*pi).second + 1 < value_len)
|
||||
pos = value_key.find(ch, (*pi).second + 1);
|
||||
|
||||
last_match_pos = pos;
|
||||
}
|
||||
if (pos != unistring::npos) {
|
||||
if (pi != positions.end())
|
||||
(*pi).second = pos;
|
||||
else
|
||||
positions.insert(char_positions_map::value_type(ch, pos));
|
||||
|
||||
// If it occurs in the same general sequence as the source identifier,
|
||||
// it's worth 5 points, plus an extra point if it's within the next 3
|
||||
// characters, and an extra point if it's preceded by a non-alphabetic
|
||||
// character.
|
||||
//
|
||||
// If the letter occurs at all in the target identifier, it's worth 1
|
||||
// point, plus an extra point if it's within 3 characters, and an extra
|
||||
// point if it's preceded by a non-alphabetic character.
|
||||
// If it occurs in the same order as the source identifier -- that is,
|
||||
// without intervening letters to break the pattern -- it's worth 10
|
||||
// points. Plus, an extra point is added for every letter in chains
|
||||
// of 3 or more.
|
||||
|
||||
else if (pos != unistring::npos) {
|
||||
// Ensure that a letter which has been matched is not matched twice,
|
||||
// so that the two x's of Exxon don't both match to the single x in
|
||||
// Oxford.
|
||||
char_positions_map::iterator pi = positions.find(ch);
|
||||
if (pi != positions.end()) {
|
||||
while (pi != positions.end() &&
|
||||
pos != unistring::npos && pos <= (*pi).second &&
|
||||
(*pi).second + 1 < value_key.length()) {
|
||||
pos = value_key.find(ch, (*pi).second + 1);
|
||||
}
|
||||
if (last_match_pos == unistring::npos ?
|
||||
index == 0 && pos == 0 : pos == last_match_pos + 1) {
|
||||
DEBUG("lookup",
|
||||
" char " << index << " in-sequence match with bonus " << bonus);
|
||||
addend += 10;
|
||||
if (bonus > 2)
|
||||
addend += bonus - 2;
|
||||
bonus++;
|
||||
added_bonus = true;
|
||||
|
||||
last_match_pos = pos;
|
||||
}
|
||||
|
||||
if (pos != unistring::npos) {
|
||||
// If it occurs in the same general sequence as the source identifier,
|
||||
// it's worth 5 points, plus an extra point if it's within the next 3
|
||||
// characters, and an extra point if it's preceded by a non-alphabetic
|
||||
// character.
|
||||
//
|
||||
// If the letter occurs at all in the target identifier, it's worth 1
|
||||
// point, plus an extra point if it's within 3 characters, and an
|
||||
// extra point if it's preceded by a non-alphabetic character.
|
||||
|
||||
else {
|
||||
bool in_order_match = (last_match_pos != unistring::npos &&
|
||||
pos > last_match_pos);
|
||||
DEBUG("lookup", " char " << index << " " <<
|
||||
|
|
@ -169,11 +174,6 @@ lookup_probable_account(const string& ident,
|
|||
<< " match" << (in_order_match && pos - index < 3 ?
|
||||
" with proximity bonus of 1" : ""));
|
||||
|
||||
if (pi != positions.end())
|
||||
(*pi).second = pos;
|
||||
else
|
||||
positions.insert(char_positions_map::value_type(ch, pos));
|
||||
|
||||
if (pos < index)
|
||||
addend += 1;
|
||||
else
|
||||
|
|
@ -190,17 +190,11 @@ lookup_probable_account(const string& ident,
|
|||
#endif
|
||||
|
||||
last_match_pos = pos;
|
||||
} else {
|
||||
last_match_pos = unistring::npos;
|
||||
|
||||
DEBUG("lookup", " char " << index << " does not match here");
|
||||
addend--;
|
||||
}
|
||||
}
|
||||
|
||||
// If the letter does not appear at all, decrease the score by 1
|
||||
|
||||
else {
|
||||
} else {
|
||||
last_match_pos = unistring::npos;
|
||||
|
||||
DEBUG("lookup", " char " << index << " does not match");
|
||||
|
|
@ -230,64 +224,55 @@ lookup_probable_account(const string& ident,
|
|||
index++;
|
||||
}
|
||||
|
||||
scores.push_back(score_entry_t(xact, score));
|
||||
// Only consider payees with a score of 30 or greater
|
||||
if (score >= 30)
|
||||
scores.push_back(score_entry_t(xact, score));
|
||||
}
|
||||
|
||||
// Sort the results by descending score, then look at every account ever
|
||||
// used among the top ten. Rank these by number of times used. Lastly,
|
||||
// used among the top five. Rank these by number of times used. Lastly,
|
||||
// "decay" any latter accounts, so that we give recently used accounts a
|
||||
// slightly higher rating in case of a tie.
|
||||
|
||||
std::stable_sort(scores.begin(), scores.end(), score_sorter());
|
||||
|
||||
account_use_map account_usage;
|
||||
|
||||
scorecard_t::iterator si = scores.begin();
|
||||
|
||||
int decay = 0;
|
||||
xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
|
||||
scorecard_t::iterator si = scores.begin();
|
||||
int decay = 0;
|
||||
xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
|
||||
account_use_map account_usage;
|
||||
|
||||
for (int i = 0; i < 5 && si != scores.end(); i++, si++) {
|
||||
DEBUG("lookup.account",
|
||||
"Payee: " << std::setw(5) << std::right << (*si).second <<
|
||||
" - " << (*si).first->payee);
|
||||
|
||||
// Only consider payees with a score of 20 or greater
|
||||
if ((*si).second >= 50) {
|
||||
foreach (post_t * post, (*si).first->posts) {
|
||||
if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
|
||||
post->account != ref_account &&
|
||||
! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
|
||||
account_use_map::iterator x = account_usage.find(post->account);
|
||||
if (x == account_usage.end())
|
||||
account_usage.insert(account_use_pair(post->account,
|
||||
((*si).second - decay)));
|
||||
else
|
||||
(*x).second += ((*si).second - decay);
|
||||
}
|
||||
decay++;
|
||||
foreach (post_t * post, (*si).first->posts) {
|
||||
if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
|
||||
post->account != ref_account &&
|
||||
! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
|
||||
account_use_map::iterator x = account_usage.find(post->account);
|
||||
if (x == account_usage.end())
|
||||
account_usage.insert(account_use_pair(post->account,
|
||||
((*si).second - decay)));
|
||||
else
|
||||
(*x).second += ((*si).second - decay);
|
||||
}
|
||||
decay++;
|
||||
}
|
||||
}
|
||||
|
||||
if (account_usage.size() > 0) {
|
||||
account_use_vector flat_account_usage(account_usage.begin(),
|
||||
account_usage.end());
|
||||
|
||||
std::stable_sort(flat_account_usage.begin(), flat_account_usage.end(),
|
||||
usage_sorter());
|
||||
|
||||
#if defined(DEBUG_ON)
|
||||
if (SHOW_DEBUG("lookup.account")) {
|
||||
foreach (account_use_pair& value, flat_account_usage) {
|
||||
foreach (const account_use_pair& value, account_usage) {
|
||||
DEBUG("lookup.account",
|
||||
"Account: " << value.second << " - " << value.first->fullname());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return std::pair<xact_t *, account_t *>(best_xact,
|
||||
flat_account_usage.front().first);
|
||||
return std::pair<xact_t *, account_t *>
|
||||
(best_xact, (*std::max_element(account_usage.begin(), account_usage.end(),
|
||||
usage_sorter())).first);
|
||||
} else {
|
||||
return std::pair<xact_t *, account_t *>(best_xact, NULL);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue