#
# Second attempt at greylisting for Aber.
#
# N.B. This perl module logs to /var/log/exim/greylist.log
# It's not tied directly to exim, and if you want to turn off 
# the logging, remove the appropriate lines in the module!
#
# Copyright 2003 Alun Jones, auj@aber.ac.uk
#
# This may be freely copied and modified.
#
# This is my version of greylisting. We have been using it for around
# 6 months (as of March 2004) for all inbound mail to Aberystywyth
# and it doesn't seem to cause any significant collateral damage. 
# We handle around 70,000 inbound messages per day. Of these,
# around half pass through undelayed because they've already been 
# whitelisted. The other half get greylisted as new addresses and, of
# these, 15% manage to whitelist themselves later. This suggests that,
# at the cost of delaying under 7.5% of legitimate mail (some spam still
# gets through), we are blocking 85% of the spam hitting our servers.
#
# You need a MySQL database to connect to. The database should contain the
# following tables:
# CREATE TABLE Greylist (
#   relationship varchar(32) NOT NULL default '',
#   IP varchar(20) default NULL,
#   state enum('GREY','WHITE') default NULL,
#   start int(11) default NULL,
#   last int(11) default NULL,
#   PRIMARY KEY  (relationship),
#   KEY IP (IP)
# );
#
# CREATE TABLE IPlist (
#   IP varchar(16) NOT NULL default '',
#   name varchar(255) default NULL,
#   added int(11) default NULL,
#   last int(11) default NULL,
#   keep int(11) default 0,
#   PRIMARY KEY  (IP)
# );
#
# How to use it:
#
# Call Greylist::defercheck($sender_address, $recipient_address, $source_ip)
# If the function returns > 0 then you should defer the message. In fact,
# the return code is the number of seconds until we're willing to accept
# the mail, so you could send a 4xx code with this number in it if you
# wanted.
#
# Every day, you should call Greylist::cleanup from cron. This will drop
# ancient records and RBL'd hosts.
#
# There's also a function 
# Greylist::prewhitelist($sender_address, $recipient_address)
# If you can arrage to call this for all outbound mail, then replies in 
# response will get through without having to wait for the greylist timeout.
#
# How it works
#
# When defercheck is called, the first thing we check is the originating
# IP address in the "IPlist" table. If present, we allow the message
# through (by returning 0) and timestamp the IPlist record for that IP. 
# This lets us manually (and automatically - see later) whitelist IP 
# addresses. N.B. when you add an IP manually, you can choose that
# it should be kept forever by setting keep=1 or allow it to be cleaned
# up automatically, by setting keep=0
#
# If the message hasn't passed the IP address test, we generate an MD5 hash 
# of the sender and recipient addresses (trying to remove VERP from
# addresses first). We use an MD5 so that we don't end up with 
# a database full of sender/recipient pairs and get into dire data 
# protection issues. We then lookup the MD5 sum in the Greylist table. If 
# it is present, we look at the state column. If "GREY", we look at the
# time the hash was registered and decide whether to allow the message 
# (and whitelist future messages between this sender/recipient). If we 
# whitelist or allow, then we update the Greylist record with the new state
# and/or timestamp it.
#
# If the state is "WHITE" then we timestamp the record and allow the message
# through.
# 
# If there is no entry for this MD5, we insert a new record into the
# Greylist table, with state "GREY" and defer the message.
#
# The "prewhitelist" function merely enters a whitelisted entry for the
# MD5 of recipient/sender into the Greylist table (or updates an existing
# entry).
#
# The cleanup function clears out whitelisted entries in Greylist and IPlist
# which haven't been used in $whitelist_timeout seconds. It also drops 
# greylisted entries in Greylist which haven't validated themselves within
# $cleanup_timeout seconds (these are probably the spammers!). 
#
# In addition, it checks all entries in IPlist for DNSBL listing. If present,
# then these are dropped from IPlist so that mail can only get through by
# passing through the Greylisting process. 
#
# Finally, the cleanup function looks at IP addresses in the greylist, and
# finds ones which have managed to whitelist more than $ipminn address pairs
# recently. These addresses are added to IPlist if they aren't currently in
# one of the DNSBLs listed in @rbls. This bit means that places like hotmail
# tend to go straight through the greylist because all their outbound relays
# manage to get into IPlist quite quickly.
#

package Greylist;

use DBI;
use Socket;
use Digest::MD5;

# You should configure these!
my $dbuser = "spamdb";
my $dbpass = "database_password";
my $dbhost = "database_host_name";
my $dbname = "database_name";
my $initial_timeout = 3600;
my $whitelist_timeout = 36*86400;
my $cleanup_timeout = 4*86400;

# This is used in trying to remove VERP from addresses.
my $localdomain = "aber.ac.uk";

my @rbls = (
	"rbl-plus.mail-abuse.ja.net",
	"bl.spamcop.net",
	"sbl.spamhaus.org",
);

my $dbh;
my $check;
my $ipcheck;

# This IP address must be responsible for $ipminn entries before we
# consider it. The earliest of these must have been whitelisted
# more than $iphirange seconds ago and the latest must have been
# whitelisted less than $iplorange seconds ago.
my $ipminn = 24;
my $iplorange = 24*3600;
my $iphirange = 48*3600;

sub defercheck
{
	my ($sender, $recipient, $ip) = @_;
	my $now = time;

	my @t = localtime($now);
	my $tod_log = sprintf("%04d-%02d-%02d %02d:%02d:%02d", 
			1900+$t[5], 1+$t[4], $t[3], $t[2], $t[1], $t[0]);
	my $ret = 1;
	my $md5key = Digest::MD5::md5_hex(&deVERP($sender)."\0".$recipient);

	open(F, ">>/var/log/exim/greylist.log");

	# Make sure we're connected.
	unless (&ensure_connection)
	{
		return 0;
	}

	# Check IP whitelist first
	unless ($ipcheck->execute($ip))
	{
		print F "$tod_log execute($ip): $DBI::errstr\n";
		$dbh->disconnect;
		$dbh = undef;
		close(F);
		return 0;
	}
	my $ipent = $ipcheck->fetchrow_hashref;
	if (defined($ipent))
	{
		$dbh->do("UPDATE IPlist SET last=? WHERE IP=?", undef,
			$now, $ip);
		print F "$tod_log allowing $ip\n";
		return 0;
	}

	# OK, check sender/recipient greylist.
	unless ($check->execute($md5key))
	{
		print F "$tod_log execute($md5key): $DBI::errstr\n";
		$dbh->disconnect;
		$dbh = undef;
		close(F);
		return 0;
	}

	my $greyent = $check->fetchrow_hashref;

	if (defined($greyent))
	{
		if ($greyent->{state} eq "GREY")
		{
			if ($greyent->{start}+$initial_timeout < $now)
			{
				if ($greyent->{start}+$cleanup_timeout > $now)
				{
					# Whitelist relationship.
					print F "$tod_log whitelisting $md5key from $ip\n";
					$dbh->do("UPDATE Greylist SET state='WHITE', last=?, IP=? WHERE relationship=?", undef, $now, $ip, $md5key);
					$ret = 0;
				}
				else
				{
					# Too old - greylist again.
					print F "$tod_log greylisting $md5key from $ip\n";
					$dbh->do("UPDATE Greylist SET state='GREY', IP=?, start=? WHERE relationship=?", undef, $ip, $now, $md5key);
					$ret = $initial_timeout;
				}
			}
			else
			{
				# Stays greylisted.
				print F "$tod_log deferring $md5key from $ip\n";
				$ret = ($greyent->{start}+$initial_timeout - $now);
			}
		}
		elsif ($greyent->{state} eq "WHITE")
		{
			print F "$tod_log allowing $md5key from $ip\n";
			$dbh->do("UPDATE Greylist SET last=?,IP=? WHERE relationship = ?", undef, $now, $ip, $md5key);
			$ret = 0;
		}
	}
	else
	{
		print F "$tod_log greylisting $md5key from $ip\n";
		$dbh->do("INSERT INTO Greylist (relationship, state, IP, start) VALUES (?, 'GREY', ?, ?)", undef, $md5key, $ip, $now);
		$ret = $initial_timeout;
	}

	close(F);
	return $ret;
}

sub ensure_connection
{
	# Connect database.
	unless (defined($dbh))
	{
		$dbh = DBI->connect("DBI:mysql:database=$dbname;host=$dbhost",
			$dbuser, $dbpass, { RaiseError => 0, PrintError => 0, AutoCommit => 1 } );
		unless (defined($dbh))
		{
			print F "$tod_log connect failure: $DBI::errstr\n";
			close(F);
			return 0;
		}
	}

	unless (defined($ipcheck))
	{
		$ipcheck = $dbh->prepare("SELECT * FROM IPlist WHERE IP=?");
		unless (defined($ipcheck))
		{
			print F "$tod_log prepare: $DBI::errstr\n";
			$dbh->disconnect;
			$dbh = undef;
			close(F);
			return 0;
		}
	}

	unless (defined($check))
	{
		$check = $dbh->prepare("SELECT * FROM Greylist WHERE relationship=?");
		unless (defined($check))
		{
			print F "$tod_log prepare: $DBI::errstr\n";
			$dbh->disconnect;
			$dbh = undef;
			close(F);
			return 0;
		}
	}
	return 1;
}

# Drop a reversed record into the database to allow mail
# replies to come in (this is so that if you mail someone
# before they've ever mailed you, their reply can get back
# without being delayed). If you want to use this, you need
# to arrange that it is called for all outbound mail.
sub prewhitelist
{
	my ($local_address, $remote_address) = @_;
	my $md5key = Digest::MD5::md5_hex($remote_address."\0".$local_address);
	my $now = time;

	# Make sure we're connected.
	unless (&ensure_connection)
	{
		return 0;
	}

	# Already there?
	unless ($check->execute($md5key))
	{
		$dbh->disconnect;
		$dbh = undef;
		return 0;
	}

	my $greyent = $check->fetchrow_hashref;
	if (defined($greyent))
	{
		$dbh->do("UPDATE Greylist SET state='WHITE', last=? WHERE relationship=?", undef, $now, $md5key);
	}
	else
	{
		$dbh->do("INSERT INTO Greylist (relationship, state, start, last) VALUES (?, 'WHITE', ?, ?)", undef, $md5key, $now, $now);
	}
	return 0;
}

# Try to drop out VERP addresses.
sub deVERP
{
	my $addr = shift;
	if ($addr =~ /^([a-z0-9-]+?)[-\+][\d\.\+\-]+[\-\+]([a-z0-9]+).$localdomain@(.*)/)
	{
		$addr = "$1-VERP-$2\@$3";
	}
	return $addr;
}

# Cleanup the tables.
sub cleanup
{
	# Make sure we're connected.
	unless (&ensure_connection)
	{
		return 0;
	}

	# Get rid of any IPs in the IP whitelist which are now RBL'd
	my $newrbl = $dbh->prepare("SELECT IP FROM IPlist ORDER BY IP");
	my $del = $dbh->prepare("DELETE FROM IPlist WHERE IP=?");
	$newrbl->execute();
	my $badips = 0;
	while (defined($ipent = $newrbl->fetchrow_hashref))
	{
		my $revip = join('.', reverse(split('\.', $ipent->{IP})));
		my $rblcheck = 0;
		foreach $dom (@rbls)
		{
			my $addr = inet_aton($revip.".".$dom);
			if (defined($addr) && (inet_ntoa($addr) =~ /^127\./))
			{
				$rblcheck = 1;
				last;
			}
		}
		if ($rblcheck)
		{
			$del->execute($ipent->{IP});
			$badips++;
		}
	}
	printf "%d RBL'd IPs deleted from IP list\n", $badips;

	# Maintain the IP whitelist.
	my $ipcount = $dbh->prepare("SELECT IP,COUNT(IP) AS n,MIN(start) AS fl, MAX(last) AS ll FROM Greylist ".
		"WHERE state='WHITE' GROUP BY IP ORDER BY n DESC");
	$ipcount->execute;
	my %whitelist;
	my $ipent;
	while (defined($ipent = $ipcount->fetchrow_hashref))
	{
		# Not enough seen.
		last if ($ipent->{n} < $ipminn);

		# Oldest timestamp
		next unless ($ipent->{fl} < time-$iphirange);

		# Newest timestamp
		next unless ($ipent->{ll} > time-$iplorange);

		# We're not whitelisting IPs that don't have
		# a reverse lookup.
		my $name = gethostbyaddr(inet_aton($ipent->{IP}), AF_INET);
		next unless defined($name);

		# We're not whitelisting IPs which are RBL'd
		my $revip = join('.', reverse(split('\.', $ipent->{IP})));
		my $rblcheck = 0;
		foreach $dom (@rbls)
		{
			my $addr = inet_aton($revip.".".$dom);
			if (defined($addr) && (inet_ntoa($addr) =~ /^127\./))
			{
				$rblcheck = 1;
				last;
			}
		}
		last if ($rblcheck);

		$whitelist{$ipent->{IP}} = $name;
	}
	$ipcount->finish;

	# Now check they aren't already in the IP whitelist.
	my $dups = $dbh->prepare("SELECT IP FROM IPlist WHERE IP IN ('".
		join("','", keys(%whitelist))."')");
	$dups->execute;
	while (defined($ipent = $dups->fetchrow_hashref))
	{
		delete $whitelist{$ipent->{IP}};
	}
	$dups->finish;

	# OK, %whitelist now contains IPs that we're willing to add.
	my $ins = $dbh->prepare("INSERT INTO IPlist (IP, name, added) VALUES (?, ?, ?)");
	while (($ip, $name) = each(%whitelist))
	{
		$ins->execute($ip, $name, time);
	}
	$ins->finish;
	printf "%d IP%s added to IP whitelist\n", scalar keys(%whitelist), 
		scalar(keys(%whitelist)) == 1 ? "" : "s";

	# Zap old data.
	my $t1 = time - $cleanup_timeout;
	my $t2 = time - $whitelist_timeout;
	my $rows = $dbh->do("delete from Greylist where (!isnull(last) and last < $t2) ".
			" or (isnull(last) and start < $t1)");
	printf "%d row%s deleted from greylist\n", $rows, $rows == 1 ? "" : "s";

	$rows = $dbh->do("delete from IPlist where keep=0 AND ((last < $t2) or isnull(last) and added < $t2)");
	printf "%d obsoleterow%s deleted from IP list\n", 
		$rows, $rows == 1 ? "" : "s";
}

# Forget about an IP address. If someone's spammed us from somewhere,
# drop all record of that place from the Grey and Whitelists.
# Returns the number of greylisted and whitelisted rows dropped.
sub forget
{
	my $ip = shift;

	# Make sure we're connected.
	unless (&ensure_connection)
	{
		return 0;
	}

	my ($grows, $wrows);
	if ($ip =~ /^\d+\.\d+\.\d+\.\d+$/)
	{
		$grows = $dbh->do("delete from Greylist where IP='$ip'");
		$wrows = $dbh->do("delete from IPlist where IP='$ip'");
	}
	return ($grows, $wrows);
}

1;

