#! /usr/bin/perl -w
use strict;

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# Written by Theo Van Dinter <felicity@apache.org>
# Please feel free to mail with any questions. :)

# This goes with the run-masses script to take the ham/spam directories
# and spit out the appropriate spam:mbox:path statements for mass-check.
# The directory structure is assumed to look something like this:
#
# $CORPUS		(this script, run-masses, etc)
# |-- ham		(dir with mbox files for ham)
# |   |-- hamtrap	(dirs split into YYYY/MM/DD)
# |   `-- personal	(dirs split into YYYY/MM/DD)
# `-- spam		(no mbox files)
#     |-- personal	(dirs split into YYYY/MM/DD)
#     `-- spamtrap	(dirs split into YYYY/MM/DD)


# If you don't have {ham,spam}trap mail broken out, set this to 0.
# When true, ham/hamtrap and spam/spamtrap are scanned in addition to
# ham/personal and spam/personal.
my $include_traps = 1;

# Which dirs have mbox files?  Each entry must be an existing directory
# (relative to the corpus directory) or the script dies.  The first path
# component of an entry is used as the message type in the output line.
my @dirs = ( 'ham' );

# How many days should we limit to searching for the dir areas?
# ie: assuming we have years of messages in YYYY/MM/DD directories, only look
# at the most recent X so that mass-check will go faster in the scan stage.
# The limit is applied to each personal/trap area separately.
# To disable the limit, leave the variable undefined ("my $RECENT;").  Do not
# comment the line out: the variable is referenced later and "use strict"
# requires it to be declared.
my $RECENT = 120;


# Corpus base directory.  Defaults to the current directory; an optional
# first command-line argument overrides it.  The trailing "/" is kept because
# the value is prepended verbatim to every path that gets printed.
my $actualdir = "./";
if (@ARGV) {
  $actualdir = shift(@ARGV) . "/";

  # Every later -d/opendir uses paths relative to the corpus directory, so
  # carrying on after a failed chdir would scan the wrong tree while still
  # printing paths under the requested one.
  chdir($actualdir) or die "Can't chdir to $actualdir: $!\n";
}

# Areas laid out as YYYY/MM/DD that will be scanned: the "personal" tree of
# each message class, followed by its "<class>trap" tree when
# $include_traps is true.
my @do_dirs = map {
  my $class = $_;
  ( "$class/personal", ( $include_traps ? "$class/${class}trap" : () ) );
} qw(ham spam);

# mbox laden areas
#
# Print one "<type>:mbox:<path>" line for each configured mbox directory,
# where <type> is the first path component of the entry.  @dirs is drained
# while doing so, which leaves it empty for the entries collected below.
# Dies if an entry is not an existing directory.
#
# The explicit defined() matters: a plain "while (my $dir = shift @dirs)"
# would stop early at an entry whose name is false in Perl (e.g. "0").
while (defined(my $dir = shift @dirs)) {
  die "$dir isn't a directory!\n" unless -d $dir;

  # Capture into a checked lexical rather than reading $1 blindly; an entry
  # with no leading component (e.g. an absolute path) has no usable type.
  my ($type) = $dir =~ m@^([^/]+)@
    or die "Can't determine message type from $dir\n";

  print "$type:mbox:$actualdir$dir\n";
}

# Ok, now figure out the most recent X days of ham and spam ...
#
# Every area in @do_dirs is expected to be laid out as <area>/YYYY/MM/DD.
# For each area that can be opened, build one "<type>:dir:<path>" entry per
# day directory, keep only the $RECENT newest ones (when $RECENT is defined)
# and append them to @dirs.  Areas, years or months that cannot be opened are
# silently skipped, as are entries whose names are not purely numeric.
foreach my $pdir ( @do_dirs ) {
  # The message type is the first path component of the area.
  my ($type) = $pdir =~ m@^([^/]+)@ or next;

  opendir(my $area_dh, $pdir) or next;

  # One [ year, month, day, output line ] record per day directory found.
  my @found;

  while (defined(my $year = readdir($area_dh))) {
    next unless $year =~ /^\d+$/;

    my $year_dir = "$pdir/$year";
    opendir(my $year_dh, $year_dir) or next;

    while (defined(my $month = readdir($year_dh))) {
      next unless $month =~ /^\d+$/;

      my $month_dir = "$year_dir/$month";
      opendir(my $month_dh, $month_dir) or next;

      foreach my $day (readdir($month_dh)) {
        # Cheap name check first, so non-numeric entries are never stat()ed.
        next unless $day =~ /^\d+$/ && -d "$month_dir/$day";
        push(@found,
             [ $year, $month, $day, "$type:dir:$actualdir$month_dir/$day" ]);
      }
      closedir($month_dh);
    }
    closedir($year_dh);
  }
  closedir($area_dh);

  # Newest first.  Components are compared numerically so that names which
  # are not zero-padded (2004/9/5 vs. 2004/10/1) still order by date; for a
  # consistently padded layout this equals a reverse string sort.  The final
  # string comparison only makes the order of numeric ties deterministic.
  my @dlist = map { $_->[3] }
              sort {
                   $b->[0] <=> $a->[0]
                || $b->[1] <=> $a->[1]
                || $b->[2] <=> $a->[2]
                || $b->[3] cmp $a->[3]
              } @found;

  splice @dlist, $RECENT if (defined $RECENT && @dlist > $RECENT);
  push(@dirs, @dlist);
}

# Write out the collected entries, one per line, then finish.
print "$_\n" foreach @dirs;
exit;
