#!/usr/bin/perl
# This script indexes ARCO files (Optimized ARC files).
# The index is created as a tab-delimited text file.
# These files can then be used by arc_optimizer.pl to restart, or by
# arco_sql.pl to create an sql command file to insert the info into
# a database.
# Written by Nick Baker, http://www-personal.si.umich.edu/~nnnickor
# http://www.cogulus.com. No rights reserved, credits appreciated.
#------------------------------------------------------------------------------
# SET YOUR VARIABLES
# Figure out where we are located on pc or mac...
# Start from the path this script was invoked with, using forward slashes
# even on Windows.
($baseDir = $0) =~ tr{\\}{/};
# Keep everything up to and including the first "mirror/" and discard the
# rest. This will vary for you...
$baseDir =~ s{mirror/.+$}{mirror/};
# Directory that holds the ARCO files to be indexed.
$dir = "${baseDir}HTML/arcos/";
# Directory that receives the index files.
# (if set to the empty string, the ARCO file directory is used instead)
$destination = "${baseDir}HTML/arco_indexes/";
#------------------------------------------------------------------------------
# EDIT BELOW THIS LINE AT YOUR OWN RISK
# Main driver: find every *.arco file in $dir, put the names in numeric
# order, and (re)build the index of each file whose index is missing or
# older than the ARCO file itself.
print "Indexing ARC files...\n\n";
$destination = $dir if($destination eq "");
# Fail loudly when the ARCO directory cannot be read; otherwise the script
# would report "Done" without having looked at a single file.
opendir(DIR, $dir) or die "Could not open directory $dir: $!";
@Items = grep(/\.arco$/i, readdir(DIR));
closedir(DIR);
@Items = SortFileList(@Items);
# $arcoFile is deliberately a package variable: Extractor reads it.
foreach $arcoFile (@Items) {
    my $arcoPath  = "$dir$arcoFile";
    my $indexPath = "$destination$arcoFile.index.txt";
    # -M is the file's age, so a larger value means an older file.
    # Re-index when there is no index yet or the index is older than the
    # ARCO file.
    if (!(-e $indexPath) || ((-M $indexPath) > (-M $arcoPath))) {
        print "$arcoFile\n";
        Extractor();
    }
}
print "\nDone\n\npress enter to exit ";
# Wait for the user so a double-clicked console window stays open.
$junk = <STDIN>;
exit;
#------------------------------------------------------------------------------
sub Extractor {
    # Build the tab-delimited index for one ARCO file.
    #
    # Takes no arguments. Reads the package variables $dir, $destination and
    # $arcoFile, parses "$dir$arcoFile" record by record, and writes one row
    # per record to "$destination$arcoFile.index.txt". Dies with the system
    # error if either file cannot be opened or the index cannot be written.
    #
    # "Programs that need to read the file without an index (such as to unpack
    # the whole file) should use buffered I/O. The URL record can then be read
    # with an fgets(), and the objects can be read with an fread() of <size>
    # bytes."
    # -archive.org
    my $arcoPath  = $dir . $arcoFile;
    my $indexPath = $destination . $arcoFile . ".index.txt";

    # The columns written to the index, in order. Other header fields are
    # collected per record but only these columns are written out.
    my @fields = ("uri", "ip", "time-stamp", "content-type", "extension",
        "content-start", "content-length", "response-code", "arc-file");
    my @records = ();

    open(my $arco, "<", $arcoPath) or die "Could not open $arcoPath: $!";
    binmode($arco); # necessary for Windows, optional for Mac
    while (!eof($arco)) {
        # URL record: "uri ip time-stamp content-type length"
        my $line = <$arco>;
        chomp($line);
        # A stray blank line is not a record; parsing it as one would emit an
        # empty row and seek into the middle of the following record.
        next unless $line =~ /\S/;
        my ($uri, $ip, $date, $content_type, $recordLength) = split(" ", $line);
        my %record = (
            "uri"          => $uri,
            "ip"           => $ip,
            "time-stamp"   => $date,
            "content-type" => $content_type,
            "arc-file"     => $arcoFile,
            "extension"    => FileType($content_type, $uri),
        );
        # Position just past this record's data (the record length is counted
        # from the end of the URL record line).
        my $end = tell($arco) + ($recordLength || 0);

        # The first line of the record body is expected to be the HTTP status
        # line; pick the response code out of it when it is one.
        my $header = <$arco>;
        if (defined $header) {
            if ($header =~ m|HTTP/\d\.\d\s(\d\d\d)\D+|) {
                $record{"response-code"} = $1;
            }
            # Read "Name: value" header lines up to the first empty line.
            # NOTE: a header may overwrite a key set above (for example
            # "content-type"), exactly as the values are stored by name.
            while (defined($header = <$arco>)) {
                # Strip the line ending whether it is CRLF or a bare LF.
                $header =~ s/\r?\n\z//;
                last if $header eq "";
                if ($header =~ /:\s/) {
                    # Limit of 2 keeps values that themselves contain ": ".
                    my ($name, $value) = split(/:\s/, $header, 2);
                    $record{lc($name)} = $value;
                }
            }
        }

        # Whatever lies between the end of the headers and $end is content.
        my $content_start = tell($arco);
        $record{"content-start"}  = $content_start;
        $record{"content-length"} = Max($end - $content_start, 0);
        push(@records, \%record);

        # position at start of next record (skip the one separator byte)
        seek($arco, $end + 1, 0);
    }
    close($arco);

    # Create Index File
    # eliminate duplicate field listings
    @fields = UniqueList(@fields);
    open(my $out, ">", $indexPath) or die "Could not open $indexPath: $!";
    binmode($out);
    # field names in the first row, then one row per record
    print $out join("\t", @fields), "\n";
    foreach my $record (@records) {
        my @row = map { defined $record->{$_} ? $record->{$_} : "" } @fields;
        print $out join("\t", @row), "\n";
    }
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Could not write $indexPath: $!";
}
#------------------------------------------------------------------------------
sub FileType {
    # Choose a file extension for a record.
    #
    # Arguments: the MIME content type (e.g. "text/html") and the record's URI.
    # Returns:   the extension without a dot; "txt" when nothing better is
    #            known.
    #
    # Rules: text/plain -> "txt"; text/<sub> and image/<sub> -> <sub>, with
    # "jpeg" written as "jpg"; any other type takes a three-character suffix
    # from the end of the URI if there is one.
    my ($content_type, $uri) = @_;
    my $extension = "txt";
    $content_type = "" unless defined $content_type;
    $uri = "" unless defined $uri;

    # Drop MIME parameters (";charset=...") and whitespace so they cannot end
    # up in the extension, and compare case-insensitively.
    $content_type =~ s/;.*//s;
    $content_type =~ s/\s+//g;
    my ($type, $subtype) = split("/", lc($content_type));
    $type = "" unless defined $type;
    $subtype = "" unless defined $subtype;

    if ($type eq "text") {
        # An empty subtype keeps the "txt" default.
        $extension = $subtype if $subtype ne "" && $subtype ne "plain";
    } elsif ($type eq "image") {
        if ($subtype ne "") {
            $extension = $subtype;
            $extension =~ s/jpeg/jpg/;
        }
    } elsif ($uri =~ m/\.(\w\w\w)$/) {
        $extension = $1;
    }
    return $extension;
}
#------------------------------------------------------------------------------
sub SortFileList {
    # Return the given file names sorted by their numeric component, so that
    #     UM.1.arco UM.10.arco UM.11.arco UM.2.arco UM.3.arco
    # comes back as
    #     UM.1.arco UM.2.arco UM.3.arco UM.10.arco UM.11.arco
    #
    # The numeric component is the digits between the first and second dot
    # ("prefix.NUMBER.rest"). Names with the same number are ordered by name.
    # Names without such a component are placed after the numbered ones, in
    # plain string order.
    #
    # The names themselves are never rewritten: the argument list is left
    # untouched and every name is returned exactly as it was passed in.
    my @decorated = ();
    foreach my $name (@_) {
        my $number = ($name =~ /^\w+\.(\d+)\./) ? $1 : undef;
        push(@decorated, [$name, $number]);
    }
    my @sorted = sort {
        # numbered names first ...
        (defined $a->[1] ? 0 : 1) <=> (defined $b->[1] ? 0 : 1)
            # ... by number (both sides are numbered when this is reached
            # with a defined left-hand side) ...
            or (defined $a->[1] ? $a->[1] <=> $b->[1] : 0)
            # ... and finally by name
            or $a->[0] cmp $b->[0]
    } @decorated;
    my @names = map { $_->[0] } @sorted;
    return @names;
}
#------------------------------------------------------------------------------
sub UniqueList {
    # Return the arguments with repeated items dropped; the first occurrence
    # of each item is kept and the original order is preserved.
    my %already_seen = ();
    my @kept = grep { !$already_seen{$_}++ } @_;
    return @kept;
}
#------------------------------------------------------------------------------
sub Max {
    # Return the numerically largest of the arguments (undef when called
    # with no arguments).
    my $max = shift;    # first argument is the starting candidate
    foreach my $candidate (@_) {
        $max = $candidate if ($candidate > $max);
    }
    return $max;
}