Added experimental account lookup logic
This is used by the "xact" command, and the CSV importer. It is very slow O(xacts*records).
This commit is contained in:
parent
e070cdfc8d
commit
1bf0220f24
5 changed files with 406 additions and 24 deletions
|
|
@ -38,7 +38,8 @@
|
|||
#include "iterators.h"
|
||||
#include "report.h"
|
||||
#include "xact.h"
|
||||
#include "output.h"
|
||||
#include "print.h"
|
||||
#include "lookup.h"
|
||||
|
||||
namespace ledger {
|
||||
|
||||
|
|
@ -54,7 +55,8 @@ value_t convert_command(call_scope_t& scope)
|
|||
else
|
||||
bucket_name = "Equity:Unknown";
|
||||
|
||||
account_t * bucket = journal.master->find_account(bucket_name);
|
||||
account_t * bucket = journal.master->find_account(bucket_name);
|
||||
account_t * unknown = journal.master->find_account(_("Expenses:Unknown"));
|
||||
|
||||
// Make an amounts mapping for the account under consideration
|
||||
|
||||
|
|
@ -81,14 +83,15 @@ value_t convert_command(call_scope_t& scope)
|
|||
}
|
||||
}
|
||||
|
||||
// Create a flat list o
|
||||
xacts_list current_xacts(journal.xacts_begin(), journal.xacts_end());
|
||||
|
||||
// Read in the series of transactions from the CSV file
|
||||
|
||||
format_posts formatter(report,
|
||||
report.report_format(report.HANDLER(print_format_)),
|
||||
false);
|
||||
print_xacts formatter(report);
|
||||
ifstream data(path(args.get<string>(0)));
|
||||
csv_reader reader(data);
|
||||
|
||||
ifstream data(path(args.get<string>(0)));
|
||||
csv_reader reader(data);
|
||||
while (xact_t * xact = reader.read_xact(journal, bucket)) {
|
||||
bool matched = false;
|
||||
post_map_t::iterator i = post_map.find(- xact->posts.front()->amount);
|
||||
|
|
@ -111,17 +114,34 @@ value_t convert_command(call_scope_t& scope)
|
|||
DEBUG("convert.csv", "Ignored xact with code: " << *xact->code);
|
||||
delete xact; // ignore it
|
||||
}
|
||||
else if (! journal.add_xact(xact)) {
|
||||
delete xact;
|
||||
throw_(std::runtime_error,
|
||||
_("Failed to finalize derived transaction (check commodities)"));
|
||||
}
|
||||
else {
|
||||
xact_posts_iterator xact_iter(*xact);
|
||||
while (post_t * post = xact_iter())
|
||||
formatter(*post);
|
||||
if (xact->posts.front()->account == NULL) {
|
||||
xacts_iterator xi;
|
||||
xi.xacts_i = current_xacts.begin();
|
||||
xi.xacts_end = current_xacts.end();
|
||||
xi.xacts_uninitialized = false;
|
||||
|
||||
// jww (2010-03-07): Bind this logic to an option: --auto-match
|
||||
if (account_t * acct =
|
||||
lookup_probable_account(xact->payee, xi, bucket).second)
|
||||
xact->posts.front()->account = acct;
|
||||
else
|
||||
xact->posts.front()->account = unknown;
|
||||
}
|
||||
|
||||
if (! journal.add_xact(xact)) {
|
||||
delete xact;
|
||||
throw_(std::runtime_error,
|
||||
_("Failed to finalize derived transaction (check commodities)"));
|
||||
}
|
||||
else {
|
||||
xact_posts_iterator xact_iter(*xact);
|
||||
while (post_t * post = xact_iter())
|
||||
formatter(*post);
|
||||
}
|
||||
}
|
||||
}
|
||||
formatter.flush();
|
||||
|
||||
// If not, transform the payee according to regexps
|
||||
|
||||
|
|
|
|||
26
src/draft.cc
26
src/draft.cc
|
|
@ -38,6 +38,7 @@
|
|||
#include "journal.h"
|
||||
#include "session.h"
|
||||
#include "report.h"
|
||||
#include "lookup.h"
|
||||
#include "print.h"
|
||||
|
||||
namespace ledger {
|
||||
|
|
@ -242,18 +243,25 @@ xact_t * draft_t::insert(journal_t& journal)
|
|||
if (tmpl->payee_mask.empty())
|
||||
throw std::runtime_error(_("xact' command requires at least a payee"));
|
||||
|
||||
xact_t * matching = NULL;
|
||||
xact_t * matching = NULL;
|
||||
|
||||
std::auto_ptr<xact_t> added(new xact_t);
|
||||
|
||||
for (xacts_list::reverse_iterator j = journal.xacts.rbegin();
|
||||
j != journal.xacts.rend();
|
||||
j++) {
|
||||
if (tmpl->payee_mask.match((*j)->payee)) {
|
||||
matching = *j;
|
||||
DEBUG("derive.xact",
|
||||
"Found payee match: transaction on line " << (*j)->pos->beg_line);
|
||||
break;
|
||||
xacts_iterator xi(journal);
|
||||
if (xact_t * xact = lookup_probable_account(tmpl->payee_mask.str(), xi).first) {
|
||||
DEBUG("derive.xact", "Found payee by lookup: transaction on line "
|
||||
<< xact->pos->beg_line);
|
||||
matching = xact;
|
||||
} else {
|
||||
for (xacts_list::reverse_iterator j = journal.xacts.rbegin();
|
||||
j != journal.xacts.rend();
|
||||
j++) {
|
||||
if (tmpl->payee_mask.match((*j)->payee)) {
|
||||
matching = *j;
|
||||
DEBUG("derive.xact",
|
||||
"Found payee match: transaction on line " << (*j)->pos->beg_line);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
296
src/lookup.cc
Normal file
296
src/lookup.cc
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
/*
|
||||
* Copyright (c) 2003-2010, John Wiegley. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* - Neither the name of New Artisans LLC nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <system.hh>
|
||||
|
||||
#include "lookup.h"
|
||||
#include "unistring.h"
|
||||
|
||||
namespace ledger {
|
||||
|
||||
namespace {
|
||||
typedef std::pair<xact_t *, int> score_entry_t;
|
||||
typedef std::deque<score_entry_t> scorecard_t;
|
||||
typedef std::map<uint32_t, std::size_t> char_positions_map;
|
||||
|
||||
struct score_sorter {
|
||||
bool operator()(const score_entry_t& left,
|
||||
const score_entry_t& right) const {
|
||||
return left.second > right.second;
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::map<account_t *, int> account_use_map;
|
||||
typedef std::pair<account_t *, int> account_use_pair;
|
||||
typedef std::vector<account_use_pair> account_use_vector;
|
||||
|
||||
struct usage_sorter {
|
||||
bool operator()(const account_use_pair& left,
|
||||
const account_use_pair& right) const {
|
||||
return left.second > right.second;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
std::pair<xact_t *, account_t *>
|
||||
lookup_probable_account(const string& ident,
|
||||
xacts_iterator& iter_func,
|
||||
account_t * ref_account)
|
||||
{
|
||||
scorecard_t scores;
|
||||
|
||||
#if !defined(HAVE_BOOST_REGEX_UNICODE)
|
||||
string lident = ident;
|
||||
to_lower(lident);
|
||||
unistring lowered_ident(lident);
|
||||
#else
|
||||
// jww (2010-03-07): Not yet implemented
|
||||
unistring lowered_ident(ident);
|
||||
#endif
|
||||
|
||||
DEBUG("lookup.account",
|
||||
"Looking up identifier '" << lowered_ident.extract() << "'");
|
||||
#if defined(DEBUG_ON)
|
||||
if (ref_account != NULL)
|
||||
DEBUG("lookup.account",
|
||||
" with reference account: " << ref_account->fullname());
|
||||
#endif
|
||||
|
||||
while (xact_t * xact = iter_func()) {
|
||||
// Only consider transactions from the last two years (jww (2010-03-07):
|
||||
// make this an option)
|
||||
if ((CURRENT_DATE() - xact->date()).days() > 700)
|
||||
continue;
|
||||
|
||||
// An exact match is worth a score of 100
|
||||
if (ident == xact->payee) {
|
||||
DEBUG("lookup", " we have an exact match, score = 100");
|
||||
scores.push_back(score_entry_t(xact, 100));
|
||||
break;
|
||||
}
|
||||
|
||||
#if !defined(HAVE_BOOST_REGEX_UNICODE)
|
||||
string payee = xact->payee;
|
||||
to_lower(payee);
|
||||
unistring value_key(payee);
|
||||
#else
|
||||
// jww (2010-03-07): Not yet implemented
|
||||
unistring value_key(xact->payee);
|
||||
#endif
|
||||
|
||||
DEBUG("lookup", "Considering payee: " << value_key.extract());
|
||||
|
||||
std::size_t index = 0;
|
||||
std::size_t last_match_pos = unistring::npos;
|
||||
int bonus = 0;
|
||||
int score = 0;
|
||||
std::size_t pos;
|
||||
char_positions_map positions;
|
||||
|
||||
// Walk each letter in the source identifier
|
||||
foreach (const uint32_t& ch, lowered_ident.utf32chars) {
|
||||
int addend = 0;
|
||||
|
||||
pos = value_key.find(ch);
|
||||
|
||||
// If it occurs in the same order as the source identifier -- that is,
|
||||
// without intervening letters to break the pattern -- it's worth 10
|
||||
// points. Plus, an extra point is added for every letter in chains of
|
||||
// 3 or more.
|
||||
|
||||
bool added_bonus = false;
|
||||
if (pos == last_match_pos + 1) {
|
||||
DEBUG("lookup",
|
||||
" char " << index << " in-sequence match with bonus " << bonus);
|
||||
addend += 10;
|
||||
if (bonus > 2)
|
||||
addend += bonus - 2;
|
||||
bonus++;
|
||||
added_bonus = true;
|
||||
|
||||
last_match_pos = pos;
|
||||
}
|
||||
|
||||
// If it occurs in the same general sequence as the source identifier,
|
||||
// it's worth 5 points, plus an extra point if it's within the next 3
|
||||
// characters, and an extra point if it's preceded by a non-alphabetic
|
||||
// character.
|
||||
//
|
||||
// If the letter occurs at all in the target identifier, it's worth 1
|
||||
// point, plus an extra point if it's within 3 characters, and an extra
|
||||
// point if it's preceded by a non-alphabetic character.
|
||||
|
||||
else if (pos != unistring::npos) {
|
||||
// Ensure that a letter which has been matched is not matched twice,
|
||||
// so that the two x's of Exxon don't both match to the single x in
|
||||
// Oxford.
|
||||
char_positions_map::iterator pi = positions.find(ch);
|
||||
if (pi != positions.end()) {
|
||||
while (pi != positions.end() &&
|
||||
pos != unistring::npos && pos <= (*pi).second &&
|
||||
(*pi).second + 1 < value_key.length()) {
|
||||
pos = value_key.find(ch, (*pi).second + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (pos != unistring::npos) {
|
||||
bool in_order_match = (last_match_pos != unistring::npos &&
|
||||
pos > last_match_pos);
|
||||
DEBUG("lookup", " char " << index << " " <<
|
||||
(in_order_match ? "in-order" : "out-of-order")
|
||||
<< " match" << (in_order_match && pos - index < 3 ?
|
||||
" with proximity bonus of 1" : ""));
|
||||
|
||||
if (pi != positions.end())
|
||||
(*pi).second = pos;
|
||||
else
|
||||
positions.insert(char_positions_map::value_type(ch, pos));
|
||||
|
||||
if (pos < index)
|
||||
addend += 1;
|
||||
else
|
||||
addend += 5;
|
||||
|
||||
if (in_order_match && pos - index < 3)
|
||||
addend++;
|
||||
|
||||
#if !defined(HAVE_BOOST_REGEX_UNICODE)
|
||||
if (pos == 0 || (pos > 0 && !std::isalnum(value_key[pos - 1])))
|
||||
addend++;
|
||||
#else
|
||||
// jww (2010-03-07): Not yet implemented
|
||||
#endif
|
||||
|
||||
last_match_pos = pos;
|
||||
} else {
|
||||
last_match_pos = unistring::npos;
|
||||
|
||||
DEBUG("lookup", " char " << index << " does not match here");
|
||||
addend--;
|
||||
}
|
||||
}
|
||||
|
||||
// If the letter does not appear at all, decrease the score by 1
|
||||
|
||||
else {
|
||||
last_match_pos = unistring::npos;
|
||||
|
||||
DEBUG("lookup", " char " << index << " does not match");
|
||||
addend--;
|
||||
}
|
||||
|
||||
// Finally, decay what is to be added to the score based on its position
|
||||
// in the word. Since credit card payees in particular often share
|
||||
// information at the end (such as the location where the purchase was
|
||||
// made), we want to give much more credence to what occurs at the
|
||||
// beginning. Every 5 character positions from the beginning becomes a
|
||||
// divisor for the addend.
|
||||
|
||||
if ((int(index / 5) + 1) > 1) {
|
||||
DEBUG("lookup",
|
||||
" discounting the addend by / " << (int(index / 5) + 1));
|
||||
addend = int(double(addend) / (int(index / 5) + 1));
|
||||
}
|
||||
|
||||
DEBUG("lookup", " final addend is " << addend);
|
||||
score += addend;
|
||||
DEBUG("lookup", " score is " << score);
|
||||
|
||||
if (! added_bonus)
|
||||
bonus = 0;
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
scores.push_back(score_entry_t(xact, score));
|
||||
}
|
||||
|
||||
// Sort the results by descending score, then look at every account ever
|
||||
// used among the top ten. Rank these by number of times used. Lastly,
|
||||
// "decay" any latter accounts, so that we give recently used accounts a
|
||||
// slightly higher rating in case of a tie.
|
||||
|
||||
std::stable_sort(scores.begin(), scores.end(), score_sorter());
|
||||
|
||||
account_use_map account_usage;
|
||||
|
||||
scorecard_t::iterator si = scores.begin();
|
||||
|
||||
int decay = 0;
|
||||
xact_t * best_xact = si != scores.end() ? (*si).first : NULL;
|
||||
|
||||
for (int i = 0; i < 5 && si != scores.end(); i++, si++) {
|
||||
DEBUG("lookup.account",
|
||||
"Payee: " << std::setw(5) << std::right << (*si).second <<
|
||||
" - " << (*si).first->payee);
|
||||
|
||||
// Only consider payees with a score of 20 or greater
|
||||
if ((*si).second >= 50) {
|
||||
foreach (post_t * post, (*si).first->posts) {
|
||||
if (! post->has_flags(ITEM_TEMP | ITEM_GENERATED) &&
|
||||
post->account != ref_account &&
|
||||
! post->account->has_flags(ACCOUNT_TEMP | ACCOUNT_GENERATED)) {
|
||||
account_use_map::iterator x = account_usage.find(post->account);
|
||||
if (x == account_usage.end())
|
||||
account_usage.insert(account_use_pair(post->account,
|
||||
((*si).second - decay)));
|
||||
else
|
||||
(*x).second += ((*si).second - decay);
|
||||
}
|
||||
decay++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (account_usage.size() > 0) {
|
||||
account_use_vector flat_account_usage(account_usage.begin(),
|
||||
account_usage.end());
|
||||
|
||||
std::stable_sort(flat_account_usage.begin(), flat_account_usage.end(),
|
||||
usage_sorter());
|
||||
|
||||
#if defined(DEBUG_ON)
|
||||
if (SHOW_DEBUG("lookup.account")) {
|
||||
foreach (account_use_pair& value, flat_account_usage) {
|
||||
DEBUG("lookup.account",
|
||||
"Account: " << value.second << " - " << value.first->fullname());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return std::pair<xact_t *, account_t *>(best_xact,
|
||||
flat_account_usage.front().first);
|
||||
} else {
|
||||
return std::pair<xact_t *, account_t *>(best_xact, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ledger
|
||||
56
src/lookup.h
Normal file
56
src/lookup.h
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
* Copyright (c) 2003-2010, John Wiegley. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* - Neither the name of New Artisans LLC nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup data
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file lookup.h
|
||||
* @author John Wiegley
|
||||
*
|
||||
* @ingroup data
|
||||
*/
|
||||
#ifndef _LOOKUP_H
|
||||
#define _LOOKUP_H
|
||||
|
||||
#include "iterators.h"
|
||||
|
||||
namespace ledger {
|
||||
|
||||
std::pair<xact_t *, account_t *>
|
||||
lookup_probable_account(const string& ident,
|
||||
xacts_iterator& iter_func,
|
||||
account_t * ref_account = NULL);
|
||||
|
||||
} // namespace ledger
|
||||
|
||||
#endif // _LOOKUP_H
|
||||
|
|
@ -55,6 +55,7 @@ libledger_expr_la_CPPFLAGS = $(lib_cppflags)
|
|||
libledger_expr_la_LDFLAGS = -release $(VERSION)
|
||||
|
||||
libledger_data_la_SOURCES = \
|
||||
src/lookup.cc \
|
||||
src/compare.cc \
|
||||
src/iterators.cc \
|
||||
src/timelog.cc \
|
||||
|
|
@ -129,6 +130,7 @@ pkginclude_HEADERS = \
|
|||
src/timelog.h \
|
||||
src/iterators.h \
|
||||
src/compare.h \
|
||||
src/lookup.h \
|
||||
\
|
||||
src/session.h \
|
||||
src/report.h \
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue