ledger/src/lookup.cc
2012-02-29 22:32:23 -06:00

285 lines
9.8 KiB
C++

/*
* Copyright (c) 2003-2012, John Wiegley. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of New Artisans LLC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <system.hh>
#include "lookup.h"
#include "unistring.h"
namespace ledger {
namespace {
typedef std::pair<xact_t *, int> score_entry_t;
typedef std::deque<score_entry_t> scorecard_t;
typedef std::map<uint32_t, std::size_t> char_positions_map;
struct score_sorter {
bool operator()(const score_entry_t& left,
const score_entry_t& right) const {
return left.second > right.second;
}
};
typedef std::map<account_t *, int> account_use_map;
typedef std::pair<account_t *, int> account_use_pair;
struct usage_sorter {
bool operator()(const account_use_pair& left,
const account_use_pair& right) const {
return left.second > right.second;
}
};
}
std::pair<xact_t *, account_t *>
lookup_probable_account(const string& ident,
xacts_list::reverse_iterator iter,
xacts_list::reverse_iterator end,
account_t * ref_account)
{
scorecard_t scores;
#if !defined(HAVE_BOOST_REGEX_UNICODE)
string lident = ident;
to_lower(lident);
unistring lowered_ident(lident);
#else
// jww (2010-03-07): Not yet implemented
unistring lowered_ident(ident);
#endif
DEBUG("lookup.account",
"Looking up identifier '" << lowered_ident.extract() << "'");
#if defined(DEBUG_ON)
if (ref_account != NULL)
DEBUG("lookup.account",
" with reference account: " << ref_account->fullname());
#endif
xact_t * xact;
while (iter != end && (xact = *iter++) != NULL) {
#if 0
// Only consider transactions from the last two years (jww (2010-03-07):
// make this an option)
if ((CURRENT_DATE() - xact->date()).days() > 700)
continue;
#endif
// An exact match is worth a score of 100 and terminates the search
if (ident == xact->payee) {
DEBUG("lookup", " we have an exact match, score = 100");
scores.push_back(score_entry_t(xact, 100));
break;
}
#if !defined(HAVE_BOOST_REGEX_UNICODE)
string payee = xact->payee;
to_lower(payee);
unistring value_key(payee);
#else
// jww (2010-03-07): Not yet implemented
unistring value_key(xact->payee);
#endif
DEBUG("lookup", "Considering payee: " << value_key.extract());
std::size_t index = 0;
std::size_t last_match_pos = unistring::npos;
int bonus = 0;
int score = 0;
std::size_t pos;
char_positions_map positions;
// Walk each letter in the source identifier
foreach (const uint32_t& ch, lowered_ident.utf32chars) {
int addend = 0;
bool added_bonus = false;
std::size_t value_len = value_key.length();
pos = value_key.find(ch);
// Ensure that a letter which has been matched is not matched twice, so
// that the two x's of Exxon don't both match to the single x in Oxford.
// This part of the loop is very expensive, but avoids a lot of bogus
// matches.
char_positions_map::iterator pi = positions.find(ch);
while (pi != positions.end() &&
pos != unistring::npos && pos <= (*pi).second &&
(*pi).second + 1 < value_len)
pos = value_key.find(ch, (*pi).second + 1);
if (pos != unistring::npos) {
if (pi != positions.end())
(*pi).second = pos;
else
positions.insert(char_positions_map::value_type(ch, pos));
// If it occurs in the same order as the source identifier -- that is,
// without intervening letters to break the pattern -- it's worth 10
// points. Plus, an extra point is added for every letter in chains
// of 3 or more.
if (last_match_pos == unistring::npos ?
index == 0 && pos == 0 : pos == last_match_pos + 1) {
DEBUG("lookup",
" char " << index << " in-sequence match with bonus " << bonus);
addend += 10;
if (bonus > 2)
addend += bonus - 2;
bonus++;
added_bonus = true;
last_match_pos = pos;
}
// If it occurs in the same general sequence as the source identifier,
// it's worth 5 points, plus an extra point if it's within the next 3
// characters, and an extra point if it's preceded by a non-alphabetic
// character.
//
// If the letter occurs at all in the target identifier, it's worth 1
// point, plus an extra point if it's within 3 characters, and an
// extra point if it's preceded by a non-alphabetic character.
else {
bool in_order_match = (last_match_pos != unistring::npos &&
pos > last_match_pos);
DEBUG("lookup", " char " << index << " " <<
(in_order_match ? "in-order" : "out-of-order")
<< " match" << (in_order_match && pos - index < 3 ?
" with proximity bonus of 1" : ""));
if (pos < index)
addend += 1;
else
addend += 5;
if (in_order_match && pos - index < 3)
addend++;
#if !defined(HAVE_BOOST_REGEX_UNICODE)
if (pos == 0 || (pos > 0 && !std::isalnum(value_key[pos - 1])))
addend++;
#else
// jww (2010-03-07): Not yet implemented
#endif
last_match_pos = pos;
}
// If the letter does not appear at all, decrease the score by 1
} else {
last_match_pos = unistring::npos;
DEBUG("lookup", " char " << index << " does not match");
addend--;
}
// Finally, decay what is to be added to the score based on its position
// in the word. Since credit card payees in particular often share
// information at the end (such as the location where the purchase was
// made), we want to give much more credence to what occurs at the
// beginning. Every 5 character positions from the beginning becomes a
// divisor for the addend.
if ((int(index / 5) + 1) > 1) {
DEBUG("lookup",
" discounting the addend by / " << (int(index / 5) + 1));
addend = int(double(addend) / (int(index / 5) + 1));
}
DEBUG("lookup", " final addend is " << addend);
score += addend;
DEBUG("lookup", " score is " << score);
if (! added_bonus)
bonus = 0;
index++;
}
// Only consider payees with a score of 30 or greater
if (score >= 30)
scores.push_back(score_entry_t(xact, score));
}
// Sort the results by descending score, then look at every account ever
// used among the top five. Rank these by number of times used. Lastly,
// "decay" any latter accounts, so that we give recently used accounts a
// slightly higher rating in case of a tie.
std::stable_sort(scores.begin(), scores.end(), score_sorter());
scorecard_t::iterator si = scores.begin();
int decay = 0;
xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
account_use_map account_usage;
for (int i = 0; i < 5 && si != scores.end(); i++, si++) {
DEBUG("lookup.account",
"Payee: " << std::setw(5) << std::right << (*si).second <<
" - " << (*si).first->payee);
foreach (post_t * post, (*si).first->posts) {
if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
post->account != ref_account &&
! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
account_use_map::iterator x = account_usage.find(post->account);
if (x == account_usage.end())
account_usage.insert(account_use_pair(post->account,
((*si).second - decay)));
else
(*x).second += ((*si).second - decay);
}
decay++;
}
}
if (account_usage.size() > 0) {
#if defined(DEBUG_ON)
if (SHOW_DEBUG("lookup.account")) {
foreach (const account_use_pair& value, account_usage) {
DEBUG("lookup.account",
"Account: " << value.second << " - " << value.first->fullname());
}
}
#endif
return std::pair<xact_t *, account_t *>
(best_xact, (*std::max_element(account_usage.begin(), account_usage.end(),
usage_sorter())).first);
} else {
return std::pair<xact_t *, account_t *>(best_xact, NULL);
}
}
} // namespace ledger