#!/usr/bin/perl

# This script indexes ARCO files (Optimized ARC files).
# The index is created as a tab-delimited text file.

# These files can then be used by arc_optimizer.pl to restart, or by
# arco_sql.pl to create an sql command file to insert the info into
# a database.

# Written by Nick Baker, http://www-personal.si.umich.edu/~nnnickor
# http://www.cogulus.com. No rights reserved, credits appreciated.


#------------------------------------------------------------------------------
# SET YOUR VARIABLES

# Work out the base directory from the path this script was started with
# (works for both PC and Mac style paths)...
$baseDir = $0;             # path of the running script
$baseDir =~ tr{\\}{/};     # normalise Windows \ separators to /
# keep everything up to and including mirror/. This will vary for you...
$baseDir =~ s{mirror/.+$}{mirror/};


# Directory that holds your ARCO files
$dir = "${baseDir}HTML/arcos/";

# Directory the index files are written to
# (when left empty, the ARCO file directory is used instead)
$destination = "${baseDir}HTML/arco_indexes/";


#------------------------------------------------------------------------------
# EDIT BELOW THIS LINE AT YOUR OWN RISK

print "Indexing ARC files...\n\n";

# fall back to the ARCO directory when no destination was configured
$destination = $dir if ($destination eq "");

# collect every *.arco file (case-insensitive) found in the ARCO directory;
# stop with a clear message instead of silently indexing nothing when the
# directory cannot be read
opendir(my $arcoDir, $dir) or die "Could not open directory $dir: $!";
@Items = grep(/\.arco$/i, readdir($arcoDir));
closedir($arcoDir);

@Items = &SortFileList(@Items);

# NOTE: $arcoFile is deliberately a package variable rather than a "my"
# variable: Extractor() takes no arguments and reads $arcoFile to learn
# which file it has to index.
foreach $arcoFile (@Items) {

    my $indexPath = "$destination$arcoFile.index.txt";

    # (re)build the index when it does not exist yet, or when the ARCO file
    # was modified more recently than the index. -M yields a file's age in
    # days, so the larger value belongs to the older file.
    if (!(-e $indexPath) || (-M $indexPath > -M "$dir$arcoFile")) {

        print "$arcoFile\n";
        &Extractor();

    }
}

# keep the console window open until the user confirms
print "\nDone\n\npress enter to exit ";
$junk = <STDIN>;

exit;



#------------------------------------------------------------------------------


sub Extractor {

# Indexes one ARCO file and writes the result as a tab-delimited text file.
#
# Takes no arguments; it reads these package variables set by the caller:
#   $arcoFile    - name of the ARCO file to index
#   $dir         - directory that contains $arcoFile
#   $destination - directory the index file is written to
#
# Output: "$destination$arcoFile.index.txt" holding one header row with the
# field names followed by one row per record found in the ARCO file.
#
# Dies when the ARCO file cannot be opened or the index cannot be written.

# "Programs that need to read the file without an index (such as to unpack the
# whole file) should use buffered I/O. The URL record can then be read with an
# fgets(), and the objects can be read with an fread() of <size> bytes."
# -archive.org


    # explicitly define the fields that every record has; these are also the
    # columns of the index file, in this order.
    # (header names found in the records are stored per record as well, but
    # adding them as columns was causing problems with weird fields)
    my @fields = ("uri", "ip", "time-stamp", "content-type", "extension",
                  "content-start", "content-length", "response-code", "arc-file");

    # one hash reference per record, in file order
    my @records = ();

    open(my $arco, "<", $dir . $arcoFile)
        or die "Could not open $dir$arcoFile: $!";
    binmode($arco); # necessary for Windows, optional for Mac

    while (!eof($arco)) {

        my %record = ();

        # the URL record: five whitespace-separated fields on one line
        my $line = <$arco>;
        chomp($line);

        my ($uri, $ip, $date, $content_type, $recordLength) = split(" ", $line);
        $recordLength = 0 unless defined $recordLength;

        $record{"uri"}          = $uri;
        $record{"ip"}           = $ip;
        $record{"time-stamp"}   = $date;
        $record{"content-type"} = $content_type;
        $record{"arc-file"}     = $arcoFile;

        # determine the file type...
        $record{"extension"} = FileType($content_type, $uri);

        # position just past the last byte of this record
        my $end = tell($arco) + $recordLength;

        # determine the response code from the status line
        $line = <$arco>;
        $line = "" unless defined $line;
        if ($line =~ m|HTTP/\d\.\d\s(\d\d\d)\D+|) {
            $record{"response-code"} = $1;
        }

        # read the header lines, up to the first empty line. Never read past
        # the end of the record, so a record without an empty line cannot
        # pick up the headers of the record that follows it.
        while (length($line) > 0 && tell($arco) < $end) {
            $line = <$arco>;
            last unless defined $line;

            # the file is read in binary mode, so lines may end in CR LF;
            # strip either ending without eating a character of the value
            $line =~ s/\r?\n\z//;

            if ($line =~ /:\s/) {
                # limit of 2 keeps values that themselves contain ": " whole
                my ($name, $value) = split(/:\s/, $line, 2);
                $record{lc($name)} = $value;
            }
        }

        # whatever lies between the headers and the end of the record is the
        # content; the length is clamped at zero in case reading overshot
        my $content_start = tell($arco);
        $record{"content-start"}  = $content_start;
        $record{"content-length"} = Max($end - $content_start, 0);

        push(@records, \%record);

        # position at start of next record (skipping the separator byte)
        seek($arco, $end + 1, 0);
    }

    close($arco);

    # Create Index File

    # eliminate duplicate field listings
    @fields = UniqueList(@fields);

    # field names in the first row, then one row of data per record;
    # fields a record does not have are left empty
    my $indexFile = join("\t", @fields) . "\n";
    foreach my $record (@records) {
        my @row = map { defined $record->{$_} ? $record->{$_} : "" } @fields;
        $indexFile .= join("\t", @row) . "\n";
    }

    # write the file
    my $indexPath = $destination . $arcoFile . ".index.txt";
    open(my $out, ">", $indexPath)
        or die "Could not open $indexPath: $!";
    binmode($out);
    print $out $indexFile;
    # buffered write errors (e.g. a full disk) only surface at close
    close($out) or die "Could not write $indexPath: $!";

}

#------------------------------------------------------------------------------

sub FileType {

# Chooses a file extension for a record.
#
# Arguments:
#   1. the MIME content type of the record, e.g. "text/html"
#   2. the URI of the record
#
# Returns the extension without a leading dot:
#   - text/plain gives "txt", any other text/<subtype> gives <subtype>
#   - image/<subtype> gives <subtype>, with "jpeg" replaced by "jpg"
#   - otherwise the three-character extension at the end of the URI, if any
#   - otherwise "txt"
# Parameters after a ";" in the content type are ignored, and a content
# type without a subtype is treated like an unknown type.

    my ($content_type, $uri) = @_;
    $content_type = "" unless defined $content_type;
    $uri          = "" unless defined $uri;

    # drop MIME parameters ("text/html;charset=...") and surrounding blanks
    # so they cannot end up inside the extension
    $content_type =~ s/;.*$//s;
    $content_type =~ s/^\s+//;
    $content_type =~ s/\s+$//;

    my ($type, $subtype) = split("/", $content_type);
    $type = "" unless defined $type;
    my $has_subtype = (defined($subtype) && length($subtype) > 0);

    my $extension = "txt";

    if ($type eq "text" && $has_subtype) {
        $extension = $subtype unless ($subtype eq "plain");
    }
    elsif ($type eq "image" && $has_subtype) {
        $extension = $subtype;
        $extension =~ s/jpeg/jpg/i;
    }
    elsif ($uri =~ m/\.(\w\w\w)$/) {
        $extension = $1;
    }

    return $extension;
}


#------------------------------------------------------------------------------

sub SortFileList {

# Sorts file names of the form <prefix>.<number>.<rest> by their number.
#
# This function changes a file list like this:
# UM.1.arco
# UM.10.arco
# UM.11.arco
# UM.2.arco
# UM.3.arco

# to this:
# UM.1.arco
# UM.2.arco
# UM.3.arco
# UM.10.arco
# UM.11.arco

# Names are ordered by number first, then by prefix, then by the rest of
# the name. Names that do not have the <prefix>.<number>.<rest> form are
# returned unchanged, after all numbered names, in plain string order.
#
# Returns the sorted list. The names are only read, never rewritten, so the
# caller's own list is left untouched.

    # split each name once into [name, is_numbered, number, prefix, rest]
    my @decorated = map {
        my $name = $_;
        ($name =~ /^(\w+\.)(\d+)\.(.*)$/)
            ? [ $name, 1, $2, $1, $3 ]
            : [ $name, 0, 0,  "", "" ];
    } @_;

    my @sorted = sort {
        $b->[1] <=> $a->[1]         # numbered names come first
            or $a->[2] <=> $b->[2]  # numeric, so 2 sorts before 10
            or $a->[3] cmp $b->[3]
            or $a->[4] cmp $b->[4]
            or $a->[0] cmp $b->[0]  # stable result for e.g. "01" vs "1"
    } @decorated;

    my @names = map { $_->[0] } @sorted;
    return @names;
}

#------------------------------------------------------------------------------

sub UniqueList {

# Returns the arguments with repeated values removed. The first occurrence
# of each value is kept, so the remaining items stay in their original order.

    my %already_seen = ();
    my @distinct = grep { !$already_seen{$_}++ } @_;
    return @distinct;

}


#------------------------------------------------------------------------------

sub Max {

# Returns the numerically largest of its arguments, or undef when it is
# called without arguments.

    # a single element is written $_[0]; @_[0] is a one-element array slice
    my $max = $_[0];
    foreach my $candidate (@_) {
        $max = $candidate if ($candidate > $max);
    }
    return $max;

}