#!/usr/bin/perl

# This script optimizes ARC files by eliminating unchanged content from
# multiple crawls of the same URI. The weeded results are stored as ARCO
# files (ARC-Optimized) which are identical in format to ARC files, except
# that they lack the header information found in a typical ARC file.

# You can also specify a size limit to eliminate very large files from
# the archive. This was an addition to fit into our current server space,
# not made on the basis of any archival principle.

# If ARCO files are found in the destination directory, they will be
# indexed prior to optimizing the new ARC files.

# NOTE: This script will NOT decompress ARC files. If your ARC file ends
# with ".arc.gz" you need to decompress it before running this script.
# You can also prevent Heritrix from compressing it in the first place
# through the crawl settings.

# For more information about ARC files, see the official documentation:
# http://www.archive.org/web/researcher/ArcFileFormat.php

# Written by Nick Baker, http://www-personal.si.umich.edu/~nnnickor
# http://www.cogulus.com. No rights reserved, credits appreciated.

#------------------------------------------------------------------------------
# SET YOUR VARIABLES

# Derive the base directory from the path this script was started with,
# so the same settings work on both pc and mac.
$baseDir = $0;
for ($baseDir) {
    s/\\/\//g;                  # windows "\" becomes "/"
    # Cut off everything that follows ".../mirror/".
    # Adjust this to match your own directory layout.
    s/mirror\/.+$/mirror\//;
}

# Directory holding the uncompressed arc files to read,
# directory receiving the ARCO output files, and
# directory holding the indexes of previous crawls (see arco_indexer.pl).
($arcDir, $arcoDir, $arcoIndexDir) = (
    $baseDir . "Private/arcs/",
    $baseDir . "HTML/arcos/",
    $baseDir . "HTML/arco_indexes/",
);

# Maximum size of a single ARCO file, in bytes.
$arcoMaxSize = 50_000_000;

# Maximum size of a single internal file: 2% of the ARCO file size.
$fileMaxSize = $arcoMaxSize * .02;

# Name stem of the ARCO files
# (fileStem.1.arco, fileStem.2.arco, etc)
$fileStem = "UM";

# Set to a true value to show debugging info.
$debug = 0;

#------------------------------------------------------------------------------
# EDIT BELOW THIS LINE AT YOUR OWN RISK

# Main program: report the configured directories, load the indexes of
# earlier runs, collect every *.arc file below $arcDir, and run each one
# through Extractor() while appending to the current ARCO file.

print "arc files: " . $arcDir . "\n" . "arco files: " . $arcoDir . "\n";


# Initialize some variables...

$arcoCount = 1;  # number of the next ARCO file to open
%arco = ();      # uri => [start, length, arc file]
$fileSize = 0;   # bytes accounted to the ARCO file currently being written

# gather info on which url-date combos have already been optimized
%alreadyOptimized = (); # keys are date . uri
&Reindex();


# Gather the arc files to optimize.
# @Dirs doubles as a work queue: subdirectories are appended while it is
# being walked. An index loop is used because growing an array inside a
# foreach over that same array is not reliable.
@Files = ();
@Dirs = ($arcDir);
for (my $i = 0; $i < @Dirs; $i++) {

    my $dir = $Dirs[$i];
    print "$dir\n";

    # An unreadable directory is reported and skipped, not silently ignored.
    my $dirHandle;
    unless (opendir($dirHandle, $dir)) {
        warn "Could not read directory $dir: $!\n";
        next;
    }
    my @items = readdir($dirHandle);
    closedir($dirHandle);

    foreach my $item (@items) {
        next if $item =~ /^\./;  # ".", ".." and other dot entries

        my $path = $dir . $item;
        if (-d $path) {
            push(@Dirs, "$path/");
        }
        elsif ($path =~ /\.arc$/i) {
            push(@Files, $path);
        }
    }
}
@Files = sort @Files;

# Open the first arco file. It is opened for appending so that a file
# left over from an earlier run is continued rather than overwritten.
print "\nwriting $fileStem.$arcoCount.arco\n";
my $arcoPath = $arcoDir . $fileStem . "." . $arcoCount++ . ".arco";
open(ARCO, '>>', $arcoPath)
    or die "Could not open $arcoPath for appending: $!\n";
binmode(ARCO);

# Process each arc file.
# $arcFile stays a package variable because Extractor() reads it.
foreach $arcFile (@Files) {
    print "\nreading $arcFile\n\n";
    &Extractor();
}

# Buffered write errors only show up when the handle is closed.
close(ARCO) or warn "Error while closing the ARCO file: $!\n";

print "\n Done \n\n press enter to exit "; <STDIN>; exit;

#------------------------------------------------------------------------------
# Reads every "*.index.txt" file in $arcoIndexDir and records what earlier
# runs already stored, so that this run can continue where they stopped.
#
# Takes no arguments. Updates these globals:
#   $arcoCount         set to the number of index files found
#   $fileSize          size of the ARCO file with that number (0 if absent)
#   %alreadyOptimized  date . uri => 1 for every indexed entry
#   %arco              uri => [start, length, arc file] for every entry
# When the index directory cannot be read or holds no index files, the
# globals are left untouched.
sub Reindex() {

    print "Reading indexes of ARCO files...\n";

    my $dirHandle;
    unless (opendir($dirHandle, $arcoIndexDir)) {
        warn "Could not read index directory $arcoIndexDir: $!\n";
        return;
    }
    my @Items = sort grep(/\.index\.txt$/, readdir($dirHandle));
    closedir($dirHandle);

    return unless @Items;

    # Set the arcocount and filesize so we can restart where we left off.
    # The file name is built from $fileStem, the same stem used when the
    # ARCO files are written. -s yields undef for a missing file.
    $arcoCount = scalar(@Items);
    $fileSize = (-s $arcoDir . $fileStem . "." . $arcoCount . ".arco") || 0;

    # Put the index files into numeric order
    @Items = &SortFileList(@Items);

    foreach my $item (@Items) {

        print " $item\n";

        my $indexPath = $arcoIndexDir . $item;
        my $indexHandle;
        unless (open($indexHandle, '<', $indexPath)) {
            warn "Could not open $indexPath: $!\n";
            next;
        }

        while (my $entry = <$indexHandle>) {

            chomp($entry);

            # Split the tab-delimited line into separate items
            # list key: (see arco_indexer.pl for more info)
            # 0 - uri
            # 2 - date
            # 5 - start
            # 6 - length
            # 8 - arc
            my @fields = split(/\t/, $entry);

            # A line without both uri and date cannot identify an entry.
            next if @fields < 3;

            my ($uri, $date) = @fields[0, 2];

            # create a date/uri combo
            $alreadyOptimized{"$date$uri"} = 1;

            # store the start, length, and arc file under the uri taken
            # from this index line
            $arco{$uri} = [ @fields[5, 6, 8] ];

        } # end while

        close($indexHandle);

    } # end foreach

    return;
}

#------------------------------------------------------------------------------

# Reads the ARC file named by the global $arcFile and appends every record
# that counts as new content to the open ARCO filehandle.
#
# A record is not copied when
#   - its date . uri combination is in %alreadyOptimized,
#   - its content is larger than $fileMaxSize, or
#   - %arco already has an entry for the uri with the same content length
#     (the content is then assumed to be unchanged).
#
# Takes no arguments. Reads the globals $arcFile, $fileMaxSize, $debug and
# %alreadyOptimized, updates %arco, and calls TestArcoSize() before each
# write. Dies when the ARC file cannot be opened or a record cannot be
# written.
sub Extractor () {

    open(my $arcHandle, '<', $arcFile)
        or die "Could not open $arcFile: $!\n";
    binmode($arcHandle); # necessary for Windows, optional for Mac

    # Special case for ARC file header...
    my $line = <$arcHandle>;
    unless (defined $line) {
        # empty file: nothing to extract
        close($arcHandle);
        return;
    }
    chomp($line);
    my $headerLength = (split(" ", $line))[4] || 0;
    # Skip the header record (pos + len + 2)
    seek($arcHandle, tell($arcHandle) + $headerLength + 2, 0);

    # Now read each internal file...
    while (!eof($arcHandle)) {

        $line = <$arcHandle>;

        # A blank line is not a record header. Treating it as one would
        # write a bogus record to the ARCO file.
        next unless $line =~ /\S/;

        # Keep the raw header line and the position right after it, so the
        # record can be copied verbatim later.
        my $optiHeader = $line;
        my $optiStart = tell($arcHandle);

        chomp($line);

        # header fields: uri, ip, date, content type, record length
        my ($uri, undef, $date, undef, $recordLength) = split(" ", $line);

        # Calculate the position of the end of this file (pos + len)
        my $end = $optiStart + $recordLength;

        # read the information about the file, up to the first empty line
        while (length($line) > 0) {
            $line = <$arcHandle>;
            last unless defined $line; # ran off the end of the file
            $line =~ s/\r?\n\z//;      # lines may end in \r\n, so chomp is not enough
        }

        # The content is whatever lies between here and the record end.
        # Max() keeps the length at 0 when the scan above ran past $end.
        my $content_start = tell($arcHandle);
        my $content_length = Max($end - $content_start, 0);

        my $newContent = 1;

        # skip if it's already been optimized
        if ($alreadyOptimized{"$date$uri"}) {
            $newContent = 0;
            $arco{$uri} = [$content_start, $content_length, $arcFile];
            print "* $uri\n" if($debug);
        }
        # skip if it's too big
        elsif ($content_length > $fileMaxSize) {
            $newContent = 0;
            print "o $uri $content_length\n";
        }
        # if the uri has been archived before and the file length is the
        # same, assume that it hasn't changed
        elsif ($arco{$uri} && $content_length == $arco{$uri}->[1]) {
            $newContent = 0;
            print "* $uri\n";
        }

        # if we aren't skipping, add to the arco file.
        if ($newContent) {

            print " $uri\n" if($debug);
            $arco{$uri} = [$content_start, $content_length, $arcFile];

            # Copy the record body plus the one byte that follows it.
            seek($arcHandle, $optiStart, 0);
            my $body = '';
            read($arcHandle, $body, $recordLength + 1);

            &TestArcoSize(length($optiHeader) + length($body));
            print ARCO $optiHeader . $body
                or die "Could not write to the ARCO file: $!\n";
        }

        # position at start of next file
        seek($arcHandle, $end + 1, 0);
    }

    close($arcHandle);
    return;
}

#------------------------------------------------------------------------------

# Accounts for $newSize more bytes about to be written to the ARCO handle.
#
# Parameter: $_[0] - number of bytes of the record about to be written.
#
# When those bytes would push the current file past $arcoMaxSize, the
# current ARCO file is closed, the next one ($fileStem.$arcoCount.arco in
# $arcoDir) is opened for appending, $arcoCount is incremented and
# $fileSize restarts at $newSize. Otherwise $newSize is added to $fileSize.
# Dies when the next ARCO file cannot be opened, because every following
# record would otherwise be lost without notice.
#
# The former empty prototype was dropped: this sub takes one argument.
# Calls written as &TestArcoSize(...) behave as before.
sub TestArcoSize {

    my ($newSize) = @_;

    if ($newSize + $fileSize > $arcoMaxSize) {

        # Buffered write errors only show up when the handle is closed.
        close(ARCO) or warn "Error while closing the ARCO file: $!\n";

        print "\nwriting $fileStem.$arcoCount.arco\n\n";
        my $arcoPath = $arcoDir . $fileStem . "." . $arcoCount++ . ".arco";
        open(ARCO, '>>', $arcoPath)
            or die "Could not open $arcoPath for appending: $!\n";
        binmode(ARCO);

        $fileSize = $newSize;

    } else {
        $fileSize += $newSize;
    }

    return;
}

#------------------------------------------------------------------------------

# Returns the given file names ordered by their numeric component.
#
# This function changes a file list like this:
# UM.1.arco
# UM.10.arco
# UM.11.arco
# UM.2.arco
# UM.3.arco
#
# to this:
# UM.1.arco
# UM.2.arco
# UM.3.arco
# UM.10.arco
# UM.11.arco
#
# Parameters: the list of file names.
# Returns:    a new list holding the same names. Names of the form
#             stem.NUMBER.rest come first, ordered by NUMBER and then by
#             name. All other names follow in string order.
#
# Names are returned exactly as passed in, and the caller's variables are
# not modified. The former empty prototype was dropped: this sub takes a
# list. Calls written as &SortFileList(...) behave as before.
sub SortFileList {

    # Pair every name with its number (undef when it has none)
    my @decorated;
    foreach my $name (@_) {
        my $number = ($name =~ /^\w+\.(\d+)\./) ? $1 : undef;
        push(@decorated, [$name, $number]);
    }

    my @sorted = sort {
        my ($numA, $numB) = ($a->[1], $b->[1]);

        # numbered names before unnumbered ones
        (defined($numA) ? 0 : 1) <=> (defined($numB) ? 0 : 1)
            # then by number (both are defined or both undefined here)
            or (defined($numA) ? $numA <=> $numB : 0)
            # then by name, to keep the order deterministic
            or $a->[0] cmp $b->[0];
    } @decorated;

    return map { $_->[0] } @sorted;
}


#------------------------------------------------------------------------------

# Returns the numerically largest of the values passed in.
# The first argument is the starting candidate; the result is undef when
# no arguments are given.
sub Max {

    my ($largest, @others) = @_;

    for my $candidate (@others) {
        if ($candidate > $largest) {
            $largest = $candidate;
        }
    }

    return $largest;
}