Compare with Previous | Blame | View Log
#!/usr/bin/php
<?php
/**
* memberdirect-parsestatements.php
*
* A PHP script to parse transactions from the PDF files of
* "Electronic Statements" from Canadian Credit Union online banking
* sites that use the MemberDirect (http://www.memberdirect.ca/) system
* and dump transaction information into a MySQL table.
*
* The statements themselves can be automatically downloaded using the
* sister script memberdirect-getstatements.php.
*
* The script assumes you've placed all of your PDF statements in a
* single directory (set $pdfdir below) and will process all PDF files
* in that directory.
*
* Requirements:
* - PHP (http://www.php.net) - on a Mac you have this; on a Linux host you probably do
* - Xpdf (http://www.foolabs.com/xpdf/download.html) - only needed for pdftotext
* - MySQL (http://mysql.com/)
*
* You need to create the MySQL database and table referenced in the user-
* configurable settings below with the following table structure:
*
* CREATE TABLE `transactions` (
* `id` int(11) NOT NULL auto_increment,
* `account` char(15) NOT NULL,
* `date` date NOT NULL,
* `transactiontype` text NOT NULL,
* `item` text,
* `confirmationnumber` text,
* `amount` decimal(10,2) default NULL,
* `balance` decimal(10,2) NOT NULL,
* PRIMARY KEY (`id`)
* );
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* @version 0.1, 8 July 2009
* @link http://ruk.ca/wiki/Credit_Union_MemberDirect_Tools
* @author Peter Rukavina <peter@rukavina.net>
* @copyright Reinvented Inc., 2009
* @license http://www.fsf.org/licensing/licenses/gpl.txt GNU Public License
*/
// ---------------------------------------------------
// User-configurable options
// ---------------------------------------------------
$pdfdir = "/Users/peter/Statements"; // Directory holding PDF files of statements
$pdftotext = "/usr/local/bin/pdftotext"; // location of pdftotext binary
$db['host'] = "localhost"; // MySQL server host name or IP address
$db['user'] = ""; // MySQL username
$db['passwd'] = ""; // MySQL password
$db['database'] = "memberdirect"; // MySQL database name
$db['table'] = "transactions"; // MySQL table name
// ---------------------------------------------------
// End of user-configurable options
// ---------------------------------------------------
if ($handle = opendir($pdfdir)) {
while (false !== ($file = readdir($handle))) {
if (preg_match("/\.pdf/",$file)) {
exec("$pdftotext -layout $pdfdir/$file");
$file = str_replace(".pdf",".txt",$file);
$st = file_get_contents("$pdfdir/$file");
parseFile($st,$db);
}
}
}
/**
* Parse the text extracted from a single PDF file.
* @param string $text The text to parse
* @param array $db Settings for the MySQL database server
*/
function parseFile($text,$db) {
// Split the text into an array of individual lines.
$lines = split("\n",$text);
// Loop through each line, looking for patterns.
foreach($lines as $l) {
// A line that starts with a date (i.e. 'Apr 02 07'). For example:
// Apr 02 07 Interac withdrawal Trace # 004254 81.50- 3,505.43
if (preg_match("/^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d\d\s\d\d/",$l,$m)) {
// Convert Apr 02 07 into 2007-04-02 and set this as the current date.
$current['date'] = strftime("%Y-%m-%d",strtotime($m[0]));
// Remove the date from the line so it looks like any other line.
$l = ltrim(str_replace($m[0],'',$l));
// Parse the components of the line.
parseLine($current,$db,$l);
}
// A line that ends with a balance (but doesn't start with a date). For example:
// Dir Pymt Purchase THAT'S ENTERTAINMENT 092001001040 23.73- 3,381.70
else if (preg_match("/\.\d\d$/",$l)) {
// Trim space from the left-hand side of the line.
$l = ltrim($l);
// Parse the components of the line.
parseLine($current,$db,$l);
}
// A line that contains an account number. For example:
// SHARES - PERSONAL #99999-99-9 Joint: REGULAR SHARE
else if (preg_match("/(#\d\d\d\d\d-\d\d-\d)/",$l,$m)) {
// Set the current account number
$current['account'] = $m[0];
// Remove the '#' sign from the account number.
$current['account'] = str_replace("#",'',$current['account']);
}
// If we reach a line that contains the string 'Transaction totals'
// then we've reach the end of transaction information and can end.
else if (preg_match("/Transaction totals/",$l)) {
break;
}
}
}
/**
* Parse a single transaction.
* @param array $current Settings for current date and account number
* @param array $db Settings for the MySQL database server
* @param string $l The text of the transaction to parse
*/
function parseLine($current,$db,$l) {
$query = '';
// A line that looks like this:
// Dir Pymt Purchase THAT'S ENTERTAINMENT 092001001040 23.73- 3,381.70
if (preg_match("/^(.*)\s{2,}([,|\d]+\.\d\d-?)\s{2,}([,|\d]+\.\d\d)$/",$l,$m)) {
// A line that contains a 12-digit transaction number, for example:
// Dir Pymt Purchase THAT'S ENTERTAINMENT 092001001040 23.73- 3,381.70
if (preg_match("/^(.*)(\d\d\d\d\d\d\d\d\d\d\d\d)$/",trim($m[1]),$t)) {
$confirmationnumber = $t[2];
preg_match("/^(.*)\s{2,}(.*)$/",trim($t[1]),$i);
$transactiontype = trim($i[1]);
$item = trim($i[2]);
}
// A line that contains a 6-digit confirmation number, for example:
// Mbr Direct Bill Pmt ALIANT TELECOM INC / PEI CONF#144108 160.59- 11,607.61
else if (preg_match("/^(.*)CONF#(\d\d\d\d\d\d)$/",trim($m[1]),$t)) {
$confirmationnumber = $t[2];
preg_match("/^(.*)\s{2,}(.*)$/",trim($t[1]),$i);
$transactiontype = trim($i[1]);
$item = trim($i[2]);
}
// A line that contains a 6-digit trace number, for example:
// Apr 23 07 Interac withdrawal Trace # 004606 81.50- 2,936.53
else if (preg_match("/^(.*)Trace # (\d\d\d\d\d\d)$/",trim($m[1]),$t)) {
$confirmationnumber = $t[2];
$transactiontype = trim($t[1]);
$item = '';
}
// Any other sort of transaction, for example:
// Interac serv chg 7.50- 636.19
else {
$transactiontype = trim($m[1]);
$item = '';
$confirmationnumber = '';
}
// If the amount has a '-' as a suffix, move it around to the front.
// So 20.00- becomes -20.00
if (substr($m[2],strlen($m[2])-1,1) == "-") {
$m[2] = str_replace('-','',$m[2]);
$m[2] = "-" . $m[2];
}
// Remove commas from the amount and the balance.
// So 1,000.00 becomes 1000.00
$m[2] = str_replace(",",'',$m[2]);
$m[3] = str_replace(",",'',$m[3]);
// Make the MySQL query needed to insert this transaction into the
// database table.
$query = "INSERT into " . $db['table'] . " (account,date,transactiontype,item,
confirmationnumber,amount,balance) values (
'" . $current['account'] . "',
'" . $current['date'] . "',
'" . addslashes($transactiontype) . "',
'" . addslashes($item) . "',
'" . addslashes($confirmationnumber) . "',
'" . $m[2] . "',
'" . $m[3] . "')";
}
// A line non-transaction information line that looks like this:
// Opening Balance 3,586.93
else if (preg_match("/(.*)\s{2,}([,|\d]+\.\d\d-?)$/",$l,$m)) {
// If the amount has a '-' as a suffix, move it around to the front.
// So 20.00- becomes -20.00
if (substr($m[2],strlen($m[2])-1,1) == "-") {
$m[2] = str_replace('-','',$m[2]);
$m[2] = "-" . $m[2];
}
// Remove commas from the amount and the balance.
// So 1,000.00 becomes 1000.00
$m[2] = str_replace(",",'',$m[2]);
// Make the MySQL query needed to insert this transaction into the
// database table. These transactions don't have item, confirmation
// number nor amount information as they're just "balance" items.
$query = "INSERT into " . $db['table'] . " (account,date,transactiontype,item,
confirmationnumber,amount,balance) values (
'" . $current['account'] . "',
'" . $current['date'] . "',
'" . addslashes(trim($m[1])) . "',
'',
'',
'',
'" . $m[2] . "')";
}
// Print an error message for a transaction that, for some reason, cannot be parsed.
else {
print "-----------------------------------------\n";
print "CANNOT PARSE TRANSACTION:\n";
print $l . "\n";
print "-----------------------------------------\n";
}
// If we have a database query, then execute.
if ($query <> '') {
MYSQL_CONNECT($db['host'],$db['user'],$db['passwd']);
MYSQL_SELECT_DB($db['database']) or die( "Unable to select database");
$result = MYSQL_QUERY($query);
}
}