#!/usr/bin/perl
# This script optimizes ARC files by eliminating unchanged content from
# multiple crawls of the same URI. The weeded results are stored as ARCO
# files (ARC-Optimized) which are identical in format to ARC files, except
# that they lack the header information found in a typical ARC file.
# You can also specify a size limit to eliminate very large files from
# the archive. This was an addition to fit into our current server space,
# not made on the basis of any archival principle.
# If ARCO files are found in the destination directory, they will be
# indexed prior to optimizing the new ARC files.
# NOTE: This script will NOT decompress ARC files. If your ARC file ends
# with ".arc.gz" you need to decompress it before running this script.
# You can also prevent Heritrix from compressing it in the first place
# through the crawl settings.
# For more information about ARC files, see the official documentation:
# http://www.archive.org/web/researcher/ArcFileFormat.php
# Written by Nick Baker, http://www-personal.si.umich.edu/~nnnickor
# http://www.cogulus.com. No rights reserved, credits appreciated.
#------------------------------------------------------------------------------
# SET YOUR VARIABLES
# Work out the installation root from the path this script was started
# with, so the same settings work on a pc or a mac.
($baseDir = $0) =~ s/\\/\//g; # start from our own location, with windows \ turned into /
# Keep everything up to and including /mirror/, drop whatever follows it.
# This will vary for you...
$baseDir =~ s/mirror\/.+$/mirror\//;
# Directory holding your uncompressed arc files
$arcDir = "${baseDir}Private/arcs/";
# Directory that receives the output ARCO files
$arcoDir = "${baseDir}HTML/arcos/";
# Directory holding the indexes from previous crawls (see arco_indexer.pl)
$arcoIndexDir = "${baseDir}HTML/arco_indexes/";
# Size at which a new ARCO file is started
$arcoMaxSize = 50 * 1000 * 1000; # bytes
# Largest internal file that will be kept
$fileMaxSize = 0.02 * $arcoMaxSize; # 2% of the arco file size
# Name stem of the ARCO files
# (fileStem.1.arco, fileStem.2.arco, etc)
$fileStem = "UM";
# Set to 1 to show debugging info
$debug = 0;
#------------------------------------------------------------------------------
# EDIT BELOW THIS LINE AT YOUR OWN RISK
print "arc files: " . $arcDir . "\n" . "arco files: " . $arcoDir . "\n";
# Initialize the state shared with the subroutines below...
$arcoCount = 1;          # number of the next ARCO file to open
%arco = ();              # uri => [start, length, arc file]
$fileSize = 0;           # bytes accounted to the current ARCO file
# gather info on which url-date combos have already been optimized
%alreadyOptimized = ();  # "date" . "uri" => 1
&Reindex();
# Gather the arc files to optimize. @Dirs is a work list: sub-directories
# are appended while it is being walked, so it is traversed by index
# (growing an array inside a foreach over that same array is unsupported).
@Files = ();
@Dirs = ($arcDir);
for (my $i = 0; $i < @Dirs; $i++) {
    my $dir = $Dirs[$i];
    print "$dir\n";
    my $dh;
    unless (opendir($dh, $dir)) {
        # Report the unreadable directory and carry on with the others.
        warn "Could not read directory $dir: $!\n";
        next;
    }
    my @items = readdir($dh);
    closedir($dh);
    foreach my $item (@items) {
        next if ($item =~ /^\./);    # skips ".", ".." and hidden entries
        my $path = $dir . $item;
        if (-d $path) {
            push(@Dirs, "$path/");
        }
        elsif ($path =~ /\.arc$/i) {
            push(@Files, $path);
        }
    }
}
@Files = sort @Files;
# Open the first arco file. Append mode, so a partially filled file from
# an earlier run (see Reindex) is continued rather than overwritten.
print "\nwriting $fileStem.$arcoCount.arco\n";
my $firstArco = $arcoDir . $fileStem . "." . $arcoCount++ . ".arco";
open(ARCO, '>>', $firstArco)
    or die "Could not open $firstArco for writing: $!\n";
binmode(ARCO);
# Process each arc file. $arcFile must stay a package variable here:
# Extractor() takes no arguments and reads it directly.
foreach $arcFile (@Files) {
    print "\nreading $arcFile\n\n";
    &Extractor();
}
# Buffered write errors only surface at close, so check it.
close(ARCO) or die "Could not finish writing the ARCO file: $!\n";
print "\n Done \n\n press enter to exit "; <STDIN>; exit;
#------------------------------------------------------------------------------
sub Reindex {
    # Loads the "*.index.txt" files found in $arcoIndexDir so that a new
    # run continues where the previous one stopped.
    # Takes no arguments and returns nothing useful. Updates the globals:
    #   $arcoCount        - set to the number of index files found
    #   $fileSize         - size in bytes of the matching ARCO file
    #                       (0 when that file does not exist)
    #   %alreadyOptimized - "date" . "uri" => 1 for every indexed record
    #   %arco             - uri => [start, length, arc]
    # All globals are left untouched when there is nothing to read.
    print "Reading indexes of ARCO files...\n";
    # A missing or unreadable index directory is treated as "no indexes".
    opendir(my $dh, $arcoIndexDir) or return;
    my @Items = sort grep(/\.index\.txt$/, readdir($dh));
    closedir($dh);
    return unless (@Items);

    # Set the arcocount and filesize so we can restart where we left off.
    # NOTE(review): this assumes one index file per ARCO file, numbered
    # from 1 without gaps - confirm against arco_indexer.pl.
    $arcoCount = scalar(@Items);
    my $lastArco = $arcoDir . $fileStem . "." . $arcoCount . ".arco";
    $fileSize = (-s $lastArco) || 0;

    # Read the indexes in numeric order so that, for a uri indexed more
    # than once, the entry from the highest-numbered file is the one kept.
    foreach my $item (&SortFileList(@Items)) {
        print " $item\n";
        my $fh;
        unless (open($fh, '<', $arcoIndexDir . $item)) {
            warn "Could not open $arcoIndexDir$item: $!\n";
            next;
        }
        while (my $row = <$fh>) {
            chomp($row);
            # Split the tab-delimited line into separate items
            # list key: (see arco_indexer.pl for more info)
            # 0 - uri
            # 2 - date
            # 5 - start
            # 6 - length
            # 8 - arc
            my @items = split(/\t/, $row);
            # Ignore blank lines and rows that carry no uri.
            next unless (defined($items[0]) && length($items[0]));
            # create a date/uri combo
            $alreadyOptimized{"$items[2]$items[0]"} = 1;
            # store the start, length, and arc file by uri
            $arco{$items[0]} = [ @items[5, 6, 8] ];
        } # end while
        close($fh);
    } # end foreach
    return;
}
#------------------------------------------------------------------------------
sub Extractor {
    # Copies every record of the ARC file named by the global $arcFile
    # that counts as new content to the already open ARCO handle.
    # Takes no arguments and returns nothing useful.
    # A record is NOT copied when
    #   - its date/uri combination is in %alreadyOptimized,
    #   - its content is larger than $fileMaxSize, or
    #   - %arco already holds an entry for the uri with the same length.
    # %arco is updated for every record that is copied or was already
    # optimized. Dies when the ARC file cannot be opened or the ARCO file
    # cannot be written.
    open(my $arc, '<', $arcFile) or die "Could not open $arcFile: $!\n";
    binmode($arc); # necessary for Windows, optional for Mac

    # Special case for ARC file header: it is skipped, never copied.
    my $fileHeader = <$arc>;
    if (!defined($fileHeader)) { # empty file, nothing to extract
        close($arc);
        return;
    }
    chomp($fileHeader);
    # The length is the fifth space-separated field of the header line.
    my $headerLength = (split(" ", $fileHeader))[4] || 0;
    # NOTE(review): the header is skipped with "+ 2" while ordinary records
    # use "+ 1" below; kept as found - confirm against real ARC files.
    seek($arc, tell($arc) + $headerLength + 2, 0);

    # Now read each internal file...
    while (!eof($arc)) {
        my $optiHeader = <$arc>;    # record header line, copied verbatim
        my $optiStart = tell($arc); # offset of the first byte after it
        chomp(my $header = $optiHeader);
        my ($uri, $ip, $date, $content_type, $recordLength)
            = split(" ", $header);

        # A blank or malformed line cannot be a record header. Skip just
        # that line instead of treating it as a zero-length record, which
        # would write junk and swallow the first byte of the next header.
        unless (defined($recordLength) && $recordLength =~ /^\d+$/) {
            print "? skipping malformed record header\n" if($debug);
            next;
        }

        # Calculate the position of the end of this record (pos + len)
        my $end = $optiStart + $recordLength;

        # Skip the lines in front of the content, up to the first empty
        # line. Two chops strip "\r\n"; chomp would leave the "\r".
        my $scan = $header;
        while (length($scan) > 0) {
            $scan = <$arc>;
            last unless (defined($scan)); # ran into the end of the file
            chop($scan); chop($scan);
        }

        # The content is whatever is left of the record. The scan above
        # can run past $end when the record has no empty line, hence the
        # lower bound of 0.
        my $content_start = tell($arc);
        my $content_length = Max($end - $content_start, 0);
        my $newContent = 1;

        # skip if it's already been optimized
        if ($alreadyOptimized{"$date$uri"}) {
            $newContent = 0;
            $arco{$uri} = [$content_start, $content_length, $arcFile];
            print "* $uri\n" if($debug);
        }
        # skip if it's too big
        elsif ($content_length > $fileMaxSize) {
            $newContent = 0;
            print "o $uri $content_length\n";
        }
        # if the uri has been archived before and the length is the same,
        # assume that it hasn't changed...
        elsif ($arco{$uri} && $content_length == $arco{$uri}[1]) {
            $newContent = 0;
            print "* $uri\n";
        }

        # if we aren't skipping, add to the arco file.
        if ($newContent) {
            print " $uri\n" if($debug);
            $arco{$uri} = [$content_start, $content_length, $arcFile];
            # Re-read the whole record body plus the byte that follows it.
            seek($arc, $optiStart, 0);
            my $temp = '';
            read($arc, $temp, $recordLength + 1);
            # May close the current ARCO file and open the next one.
            &TestArcoSize(length($optiHeader) + length($temp));
            (print ARCO $optiHeader . $temp)
                or die "Could not write to the ARCO file: $!\n";
        }

        # position at start of next file
        seek($arc, $end + 1, 0);
    }
    close($arc);
    return;
}
#------------------------------------------------------------------------------
sub TestArcoSize {
    # Accounts for $newSize bytes that are about to be written to ARCO.
    #   $_[0] - number of bytes the caller is going to write
    # When they still fit within $arcoMaxSize, $fileSize just grows.
    # Otherwise the current ARCO file is closed, the next one
    # ($arcoDir$fileStem.$arcoCount.arco) is opened on the same ARCO
    # handle, $arcoCount is incremented and $fileSize restarts at $newSize.
    # Dies when the old file cannot be closed or the new one cannot be
    # opened, because every later write would otherwise be lost silently.
    my ($newSize) = @_;

    if ($newSize + $fileSize <= $arcoMaxSize) {
        $fileSize += $newSize;
        return;
    }

    # Buffered write errors only surface at close, so check it.
    close(ARCO) or die "Could not finish writing the ARCO file: $!\n";
    print "\nwriting $fileStem.$arcoCount.arco\n\n";
    my $nextArco = $arcoDir . $fileStem . "." . $arcoCount++ . ".arco";
    open(ARCO, '>>', $nextArco)
        or die "Could not open $nextArco for writing: $!\n";
    binmode(ARCO);
    $fileSize = $newSize;
    return;
}
#------------------------------------------------------------------------------
sub SortFileList {
    # Returns the given file names ordered by the number that follows the
    # stem, so that a list like this:
    # UM.1.arco
    # UM.10.arco
    # UM.11.arco
    # UM.2.arco
    # UM.3.arco
    # comes back like this:
    # UM.1.arco
    # UM.2.arco
    # UM.3.arco
    # UM.10.arco
    # UM.11.arco
    # Names with the same number are ordered alphabetically. Names that do
    # not start with "stem.number." are returned unchanged, after all the
    # numbered ones. The arguments are only read: @_ aliases the caller's
    # variables, so they must not be edited in place.
    my @keyed;
    foreach my $name (@_) {
        if ($name =~ /^\w+\.(\d+)\./) {
            push(@keyed, [0, $1, $name]);
        } else {
            push(@keyed, [1, 0, $name]);
        }
    }
    # Numbered names first, then by number, then by name.
    my @sorted = sort {
        $a->[0] <=> $b->[0]
            or $a->[1] <=> $b->[1]
            or $a->[2] cmp $b->[2]
    } @keyed;
    return map { $_->[2] } @sorted;
}
#------------------------------------------------------------------------------
sub Max {
    # Returns the largest of its arguments, compared numerically with ">".
    # Returns undef when called without arguments.
    my $max = $_[0];
    foreach my $candidate (@_) {
        $max = $candidate if ($candidate > $max);
    }
    return $max;
}