#!/usr/bin/perl -T --

# This script does not need to run under taint perl since it will be
# run from cron with the same uid as tinderbox.


# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/NPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is the Tinderbox build tool.
#
# The Initial Developer of the Original Code is Netscape Communications
# Corporation. Portions created by Netscape are
# Copyright (C) 1998 Netscape Communications Corporation. All
# Rights Reserved.
#

# complete rewrite by Ken Estes for contact info see the
#     mozilla/webtools/tinderbox2/Contact file.
# Contributor(s): 

# $Revision: 1.13 $
# $Date: 2005/11/25 21:57:08 $
# $Author: timeless%mozdev.org $
# $Name:  $


use File::Basename;
use Sys::Hostname;
use Getopt::Long;


sub usage {

$usage =<<"EOF";

Usage
	$PROGRAM [--help] [--version] \
		[--min-archive-files num] [--max-archive-days num] \
				[--] \ 
		dir1 [ [dir2] . . . ]


Optional Arguments

version			Show the version number of this program.

help			View this page.

min-archive-files	Change the minimum number of files which must be in
			the directory before files are prunned. 
			The default is: $MIN_ARCHIVE_FILES

max-archive-days	Change the maximum number of days that files are kept 
			in the archive. The default is: $MAX_ARCHIVE_DAYS

index-only		Add index files to each directory but do not cull 
			any files. 

All arguments following the named arguments are treated as a list of
directories which needs pruning.

Summary

A simple script to check delete old files which accumulate in a set of
directories.  

I use this script to remove the tar'ed up results of tinderbox builds.
Each tinderbox build stores its resulting binary files in a directory
whose name is the name of the branch.  I keep tar filenames
encode the date cvs checkout timestamp date (cvs -D timestamp) like:

 		 SEAMONKEY.2001-08-21.200500.tar.gz

The source files used to build this tar file can be recoved with this
command:

	cvs checkout -r SEAMONKEY -D '2001-08-21 20:05:00' all


A new build is completed about once an hour and this script is used
to ensure that recent files and a few historical files are saved.

The files are sorted by date and kept according to this rule: The most
recent min-archive-files are kept and if there are more then
min-archive-files then additionally one file each day is kept for the
previous max-archive-days.  All other files are deleted.

Install in each directory a index.html file.  This is necessary
because browsers truncate long filenames and the naming convention
causes our file names to be considered long.


EOF

    ;

print $usage;

}




sub untaint_filename {
  my ($str) = @_;

  my $out;

  # This may be the output of a glob, make it taint safe.
  if ( $str =~ m/([0-9a-zA-Z\.\-\_\/\:]+)/ ) {
      $out = $1;
  } else {
      $out = '';
  }

  return $out;
}


sub untaint_digits {
  my ($str) = @_;

  my $out;

  if ( $str =~ m/(\d+)/ ) {
      $out = $1;
  } else {
      $out = '';
  }

  return $out;
}

sub format_digits_human_readable {
    my ($num) = @_;
    my $char = "G";

    my $break = 1000_000_000;
    $char = "G";
    if ($num >= $break) {
	my $trunc = ( $num - ($num % $break) ) / $break;
	$out = $trunc;
	$padding = ' ' x (3-length($out));
	return $padding.$out.$char;
    }

    $break = $break / 1000;
    $char = "M";
    if ($num >= $break) {
	my $trunc = ( $num - ($num % $break) ) / $break;
	$out = $trunc;
	$padding = ' ' x (3-length($out));
	return $padding.$out.$char;
    }

    $break = $break / 1000;
    $char = "K";
    if ($num >= $break) {
	my $trunc = ( $num - ($num % $break) ) / $break;
	$out = $trunc;
	$padding = ' ' x (3-length($out));
	return $padding.$out.$char;
    }

    $char='';
    $out = $num;
    $padding = ' ' x (3-length($out));
    return $padding.$out.$char;
}


sub get_mtime {
    my ($file_name) = @_;
    
    ($file_name) ||
	die("Function get_mtime requires a non null filename. \n");

    # use a cached value if we have it.
    ($MTIME{$file_name}) &&
	return $MTIME{$file_name};

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks)
	= stat($file_name);
    
    ($mtime) or 
	die("Could not stat file: $file: $! \n");

    $MTIME{$file_name} = $mtime;
    return $mtime;
}


sub get_size {
    my ($file_name) = @_;
    
    ($file_name) ||
	die("Function get_mtime requires a non null filename. \n");

    # use a cached value if we have it.
    ($size{$file_name}) &&
	return $size{$file_name};

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks)
	= stat($file_name);
    
    ($mtime) or 
	die("Could not stat file: $file: $! \n");

    $SIZE{$file_name} = $size;
    return $size;
}


sub get_day_of_year {
    my ($time) = @_;

    ($time) ||
	die("Function get_day_of_year requires a non zero time. \n");

    # use a cached value if we have it.
    ($YDAY{$time}) &&
	return $YDAY{$time};

    my ($sec,$min,$hour,$mday,$mon,$year,
	$wday,$yday,$isdst) = localtime($time);
    
    $mon++;
    $year += 1900;

    defined($yday) or 
	die("Could not convert time: $time: $! \n");

    $YDAY{$time} = $yday;
    return $yday;
}


sub same_day {
    my ($file_name, $old_file_name) = @_;

    # by definition if one file does not exist then the files are not
    # on the same day.

    my $both_file_names_provided = ( ($file_name) && ($old_file_name) );

    ( $both_file_names_provided ) ||
	return 0;

    my ($mtime) = get_mtime($file_name);
    my ($old_mtime) = get_mtime($old_file_name);

    my ($day_of_year) = get_day_of_year($mtime);
    my ($old_day_of_year) = get_day_of_year($old_mtime);

    my $same_day = ($day_of_year eq $old_day_of_year);

    return $same_day;
}

sub really_old {
    my ($file_name) = @_;

    ($file_name) ||
	die("Function really_old requires a non null filename. \n");

    my ($mtime) = get_mtime($file_name);
    my ($really_old) = (time() - $mtime) > $MAX_ARCHIVE_SECONDS;

    return $really_old;
}

sub time_sorted_dir_files {
    my ($dir) = @_;
    my @file_set;
    
    opendir(DIR, $dir) || 
	die("can't opendir $dir: $! \n");
    
    @file_set = grep { !/^\./ } readdir(DIR);
    
    closedir(DIR)  || 
	die("can't close $dir: $! \n");
    
    @file_set = map { untaint_filename("$dir/$_") } @file_set;
    @file_set = sort { 
			get_mtime($b) 
			    <=>
			get_mtime($a) 
		} @file_set;
    
    return @file_set;
}


# We assume each directory contains the output of some process at
# various times. Remove older files according to the policy.

sub purge_dirs {
    my @dir_list = @_;

    foreach $dir (@dir_list) {
	
	my $dir_ok = (-d $dir) && (-r $dir) && (-w $dir);

	($dir_ok) ||
	    die("Directory: '$dir', is not a writable directory\n");	

	my @file_set = time_sorted_dir_files($dir);

	my $num_files_seen = 0; 
	my $old_file_name = '';
	foreach $file_name (@file_set) {
	    $num_files_seen++; 
	    if ($num_files_seen > $MIN_ARCHIVE_FILES) {
		if ( 
		     same_day($file_name, $old_file_name) || 
		     really_old($file_name) 
		     ) {

		    unlink($file_name) || 
			die("can't unlink $file_name: $! \n");;

		}
	    }
	    $old_file_name = $file_name;
	} # foreach $file_name
    } # foreach $dir

    return ;
}


# create an index.html file for each directory given as input.

# this is necessary because our filenames are so long (embedded
# date/time) that by default browsers do not show the full name in a
# directory listing. However if we put the names in an index.html file
# then the bwoser displays the filenames, no matter how long they are.


sub print_dirs_index {
    my @dir_list = @_;

    foreach $dir (@dir_list) {

	my $dir_ok = (-d $dir) && (-r $dir) && (-w $dir);

	($dir_ok) ||
	    die("Directory: '$dir', is not a writable directory\n");	

	my $index_file = "$dir/index.html";
	open (INDEX, ">$index_file") ||
	    die("Could not open indexfile: $index_file\n");

	print INDEX "<html>\n";
	print INDEX "<pre>\n";
	print INDEX "Files in Directory: $dir\n";
	print INDEX "as of: ".localtime(time())."\n";
	print INDEX "\n";

	my @file_set = time_sorted_dir_files($dir);
	
	# we do not want the index file in the output.  All the other
	# files listed are tar files so the output should appear as a list
	# of tar files.

	@file_set = grep (!/^$index_file$/, @file_set);

	foreach $file_name (@file_set) {
	    my $basename = basename($file_name);
	    my $href = "<A  HREF=\"./$basename\" >$basename</A>";	
	    print INDEX (
			 $href.
			 " (".
			 format_digits_human_readable(get_size($file_name)).
			 ") ".
			 localtime(get_mtime($file_name)).
			 "\n"
			 );
	    print INDEX "\n";
		    
	} # foreach $file_name
	
	print INDEX "\n";
	print INDEX "</pre>\n";
	print INDEX "</html>\n";
	close(INDEX) ||
	    die("Could not close indexfile: $index_file\n");


    } # foreach $dir

    return ;
}

sub set_static_vars {

  # This functions sets all the static variables which are often
  # configuration parameters.  Since it only sets variables to static
  # quantites it can not fail at run time. Some of these variables are
  # adjusted by parse_args() but aside from that none of these
  # variables are ever written to. All global variables are defined here
  # so we have a list of them and a comment of what they are for.
  
  $MIN_ARCHIVE_FILES = 8;
  $MAX_ARCHIVE_DAYS = 8;

  $ERROR_LOG= "/var/log/tinderbox2/build.log";

  @ORIG_ARGV = @ARGV;

  # accept the path from our environment.
  
  #  $ENV{'PATH'}= (
  #                 '/bin'.
  #                 ':/usr/bin'.
  #                 ':/usr/local/bin'.
  #                 
  #                 ':/opt/gnu/bin'.
  #                 ':/usr/ucb'.
  #                 ':/usr/ccs/bin'.
  #                 '');
  
  # taint perl requires we clean up these bad environmental variables
  # and it dos not hurt to do it here.

  delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV', 'LD_PRELOAD'};

  # sudo deletes these variables as well

  delete @ENV{'KRB_CONF', 'KRB5_CONFIG'};
  delete @ENV{'LOCALDOMAIN', 'RES_OPTIONS', 'HOSTALIASES'};

  # the version number of this tinderbox release.

  $VERSION = '0.09';

  return 1;
}


sub get_env {

# this function sets variables similar to set_static variables.  This
# function may fail only if the OS is in a very strange state.  after
# we leave this function we should be all set up to give good error
# handling, should things fail.

# This function should run as early as possible (directly after
# set_static_vars) so that the error environment is setup incase of
# problems.

  umask 0022; 
  $| = 1;
  $PROGRAM = File::Basename::basename($0);
  $PID = $$; 
  $TIME = time();
  $LOCALTIME = localtime($main::TIME);

  open (LOG , ">>$ERROR_LOG") ||
    die("Could not open logfile: $ERROR_LOG\n");

  my($old_fh) = select(LOG);
  $| = 1;
  select($old_fh);

  $SIG{'__DIE__'} = \&fatal_error;
  $SIG{'__WARN__'} = \&log_warning;

  return 1;
}



sub fatal_error {
  my  @error = @_;
  foreach $_ (@error) {
    print LOG "[$LOCALTIME] $_";
  }
  print LOG "\n";
  die("\n");
}


sub log_warning {
  my  @error = @_;

  foreach $_ (@error) {
    print LOG "[$LOCALTIME] $_";
  }

  return ;
}


sub parse_args {

  Getopt::Long::config('require_order', 'auto_abbrev', 'ignore_case');
    
    if( !GetOptions (\%Prog_Args,
		     "version", "help", 
		     "min-archive-files=s", "max-archive-days=s",
		     "index-only!",
		     ) ) {
	print("Illegal options in \@ARGV: '@ARGV'\n");
	usage();
	exit 1 ;
    }

    if($Prog_Args{'version'}) {
	print "Version: $VERSION\n";
	exit 0;  
    }
    
    if ($Prog_Args{'help'}) {
	usage();
    }
    
    
    if ( ($Prog_Args{'min-archive-days'}) &&
	 ($Prog_Args{'min-archive-days'} !~ m/\d+/)
	 ) {
	die("Argument min-archive-days must be a number. \n".
	    "It is: '$Prog_Args{'min-archive-days'}'\n");
    }
    if ( ($Prog_Args{'max-archive-days'}) &&
	 ($Prog_Args{'max-archive-days'} !~ m/\d+/)
	 ) {
	die("Argument max-archive-days must be a number. ".
	    "It is: '$Prog_Args{'max-archive-days'}'\n");
    }

    $MIN_ARCHIVE_FILES = $Prog_Args{'min-archive-files'} || 
	$MIN_ARCHIVE_FILES;

    $MAX_ARCHIVE_DAYS = $Prog_Args{'max-archive-days'} || 
	$MAX_ARCHIVE_DAYS;

    $MIN_ARCHIVE_FILES = untaint_digits($MIN_ARCHIVE_FILES);

    $MAX_ARCHIVE_DAYS = untaint_digits($MAX_ARCHIVE_DAYS);

    $MAX_ARCHIVE_SECONDS = $MAX_ARCHIVE_DAYS*24*60*60;
    @ARCHIVE_DIRS = map { untaint_filename($_) } @ARGV;

    foreach $dir (@ARCHIVE_DIRS) {

	my $dir_ok = (-d $dir) && (-r $dir) && (-w $dir);

	($dir_ok) ||
	    die("Directory: '$dir', is not a writable directory\n");	
    }

    return ;
}



#-------------- 
# Main function
#-------------- 

{
  set_static_vars();
  get_env();
  parse_args();

  print_dirs_index(@ARCHIVE_DIRS);

  if( !($Prog_Args{'index-only'}) ) {
      purge_dirs(@ARCHIVE_DIRS);	
  }

  exit 0;
}

