#!/usr/bin/env perl
#
# mboxpurge.pl, a program for purge old messages from an mbox file.
#
# Copyright (c) 2006 Marcus Libäck <marcus@terminal.se>
#
# This software is provided 'as-is', without any express or implied
# warranty.  In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
#    claim that you wrote the original software. If you use this software
#    in a product, an acknowledgment in the product documentation would be
#    appreciated but is not required.
# 2. Altered source versions must be plainly marked as such, and must not be
#    misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#

use strict;
use warnings;
use Time::Local;
use Getopt::Std;
use Switch;

# Variables
# ---------------------------------------------------------------------
my (
    %opts,
    $month,
    $day,
    @time,
    $year,
    @messages,
    $timestamp,
    $age,
    $infile,
    $outfile,
);

my $fromline = '^From\s+.*\s+\w{3}\s(\w{3})\s+(\d{1,2})\s+(\d\d:\d\d:\d\d)\s+(\d{4})';
my $purge_count = 0;
my $dry_run = 0;
my $extract = 0;
my $archive = 0;
my $append = 0;
my $verbose = 0;
my $first_message = 1;
my $messagecount = 0;
my $current_time = time();
my $version = "1.0.1-20060829";

# Options
# ---------------------------------------------------------------------
getopts('aAdhm:vx', \%opts);

if (exists $opts{'h'}) {
    help();
}

if (exists $opts{'d'}) {
    $dry_run = 1;
}

if (exists $opts{'v'}) {
    if (exists $opts{'v'}) {
        $verbose++;
    }
}

if (exists $opts{'a'}) {
    if (exists $opts{'x'}) {
        print ("You cannot use -a (archive) and -x (extract) at the same time.\n");
        exit (1);
    } else {
        $archive = 1;
    }
}

if (exists $opts{'x'}) {
    if (exists $opts{'a'}) {
        print ("You cannot use -a (archive) and -x (extract) at the same time.\n");
        exit (1);
    } else {
        $extract = 1;
    }
}

if (exists $opts{'A'}) {
    if (exists $opts{'a'} || exists $opts{'x'}) {
        $append = 1;
    } else {
        print ("-A (append) must be used in combination with either -a (archive) or -x (extract).\n");
        exit (1);
    }
}

# -m must be present and in the specified format.
if (exists $opts{'m'}) {
    if ($opts{'m'} =~ m/^(\d+)([h|d|w|m|y])$/) {
        switch ($2) {
            case 'h' { $age = $1*3600 }
            case 'd' { $age = $1*86400 }
            case 'w' { $age = $1*604800 }
            case 'm' { $age = $1*2592000 }
            case 'y' { $age = $1*31536000 }
        }
    } elsif (isvaliddate($opts{'m'})) {
        $age = time()-isvaliddate($opts{'m'});
        if ($age < 1) {
            print ("$opts{'m'} is in the future");
            exit (1);
        }
    } else {
        print ("Invalid max age $opts{'m'}");
        exit (1);
    }
} else {
    help();
}

if ($archive || $extract) {
    if ($#ARGV != 1) {
        print "You must specify two files when using -a (archive) or -x (extract)\n";
        exit (1);
    }

    $infile = $ARGV[0];
    $outfile = $ARGV[1];
} elsif (!$archive && !$extract) {
    if ($#ARGV != 0) {
        print "You must specify one file when purging messages\n";
        exit (1);
    }

    $infile = $outfile = $ARGV[0];
}

# Main
# ---------------------------------------------------------------------

# Read INFILE and separate each message into the @messages array.
open (INFILE, '<', $infile);
while (<INFILE>)
{
    if ($first_message) {
        if (/$fromline/) {
            $messages[$messagecount] .= $_;
            $first_message = 0;
            next;
        } else {
            die "$infile does not seem to be a valid mbox\n";
        }
    } elsif (/^From /) {
        $messagecount++;
        $messages[$messagecount] .= $_;
        next;
    } else {
        $messages[$messagecount] .= $_;
        next;
    }
}
close INFILE;

# Extract timestamp from each message and compare to specified age.
open (ORIGINAL, '>', $infile) or die $! if ($archive && !$dry_run);
flock (ORIGINAL, 2) if (defined fileno ORIGINAL);

if ($append) {
    open (OUTFILE, '>>', $outfile) or die $! unless ($dry_run);
} else {
    open (OUTFILE, '>', $outfile) or die $! unless ($dry_run);
}
flock OUTFILE, 2 if (defined fileno ORIGINAL);

foreach (@messages) {
    m/Subject:\s+(.*)/;
    my $subject = $1;

    #print "$purge_count\n";

    if (/$fromline/) {
        # Convert month to numerical value.
        switch ($1) {
            case "Jan" { $month = 0 }
            case "Feb" { $month = 1 }
            case "Mar" { $month = 2 }
            case "Apr" { $month = 3 }
            case "May" { $month = 4 }
            case "Jun" { $month = 5 }
            case "Jul" { $month = 6 }
            case "Aug" { $month = 7 }
            case "Sep" { $month = 8 }
            case "Oct" { $month = 9 }
            case "Nov" { $month = 10 }
            case "Dec" { $month = 11 }
        }

        $day = $2;
        @time = split (/:/, $3);
        $year = $4;
        $timestamp = timelocal("$time[2]","$time[1]","$time[0]",$day,$month,$year);
    }

    # Archive mode, old messages go into the OUTFILE and the
    # newer are written back to the original file.
    if ($archive) {
        if ($current_time-$timestamp > $age) {
            print OUTFILE $_ if (!$dry_run);
            $purge_count++;
            print "Archived $subject\n" if ($verbose);
        } else {
            print ORIGINAL $_ if (!$dry_run);
        }
    }
    # Extract mode, old messages are extracted and written to OUTFILE
    # original file is kept intact.
    elsif ($extract) {
        if ($current_time-$timestamp > $age) {
            print OUTFILE $_ if (!$dry_run);
            $purge_count++;
            print "Extracted $subject\n" if ($verbose);
        }
    }
    # Purge mode, old messages are purged, kept messages either go into
    # the original file or a new file.
    else {
        if ($current_time-$timestamp < $age) {
            if (!$dry_run) {
                print OUTFILE $_;
            }
        } else {
            $purge_count++;
            print "Purged $subject\n" if ($verbose);
        }
    }
}

flock ORIGINAL, 8 if (defined fileno ORIGINAL);
flock OUTFILE, 8 if (defined fileno ORIGINAL);
close ORIGINAL;
close OUTFILE;

print "$purge_count messages affected.\n";


# Subroutines
# ---------------------------------------------------------------------

# Date validation, credits to regular-expressions.info.
sub isvaliddate {
    my $input = shift;
    if ($input =~ m/^((?:19|20)\d\d)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])$/) {
        # At this point, $1 holds the year, $2 the month and $3 the day of the date entered
        if (($3 == 31) and ($2 == 4 or $2 == 6 or $2 == 9 or $2 == 11)) {
            return 0; # 31st of a month with 30 days
        } elsif ($3 >= 30 and $2 == 2) {
            return 0; # February 30th or 31st
        } elsif ($2 == 2 and $3 == 29 and not ($1 % 4 == 0 and ($1 % 100 > 0 or $1 % 400 == 0))) {
            return 0; # February 29th outside a leap year
        } else {
            return timelocal(0,0,0,$3,$2-1,$1);
        }
    } else {
        return 0; # Not a date
    }
}

# Help function.
sub help {
    print "Version $version

mboxpurge.pl purges, archives or extracts old messages out of mbox-style
mailboxes. If nothing else is specified it defaults to just purging
messages rather than archiving or extracting them into another mailbox.

Usage: $0 [options] [infile] [outfile]

Options:
 -h        Display this text.
 -a        Archive mode, archive messages into the second
           mbox.
 -A        Append to the second mbox instead of overwriting it.
 -d        Dry run, run the program without actually writing to
           any files.
 -m NN     Maximum age of messages to keep in hours,
           days, weeks, months or years (e.g. 3d, 4y).
 -m <date> Keep messages newer than this date.
           (dates must in ISO format (YYYY-MM-DD)
 -v        Print the subjects of purged, archived or extracted
           messages.
 -x        Extract mode, extract messages into the second
           mbox. (Keeps the original mbox untouched).\n";

    exit (1);
}