#!/usr/local/bin/perl
##
## urlindex.pl (C) 1995 Andrew Scherpbier
##
## This program will build an index of all the URLs in the
## htdig document database.
##
use GDBM_File;
require('SDSU_www.pl');
##
## Configuration.  $dbfile is assigned twice; the second assignment
## wins, so the database that is actually opened is /tmp/db.docdb.
## $exclude is used as a regular expression against each database key.
##
$dbfile = "/gopher/www/htdig/sdsu3d.docdb";
$dbfile = "/tmp/db.docdb";
$exclude = "rohan.sdsu.edu\\/home\\/";

tie(%docdb, GDBM_File, $dbfile, GDBM_READER, 0)
    || die "Unable to open $dbfile: $!";

print "Reading...\n";

##
## Walk every entry of the database and collect, for each document
## that is kept, a lower-cased sort key in @unsorted and the pair
## "title\001key" in %titles (indexed by that sort key).
##
while (my($doc_key, $packed) = each %docdb)
{
    ## Keys starting with nextDocID are never treated as documents
    ## (presumably a counter entry -- confirm against the database).
    next if $doc_key =~ /^nextDocID/;

    ## The record is decoded before the exclude test on purpose:
    ## parse_ref_record prints a line for every record it decodes.
    my %fields = parse_ref_record($packed);

    ## Drop records whose STATE is 1 and keys matching $exclude.
    next if $fields{"STATE"} eq 1 || $doc_key =~ /$exclude/;

    ##
    ## Strip leading and trailing whitespace from the title.
    ##
    my $title = $fields{"TITLE"};
    $title =~ s/^[ \t\n\r]*//;
    $title =~ s/[ \t\n\r]*$//;

    ##
    ## Move well-known leading words to the end of the title so the
    ## entry sorts under the word that follows them, e.g.
    ## "The Homepage of X" becomes "Homepage of X, The".  The three
    ## rules are applied one after another, in this order.
    ##
    $title = substr($title, 4) . ", " . substr($title, 0, 3)
        if $title =~ /^the /i;
    $title = substr($title, 5) . ", " . substr($title, 0, 4)
        if $title =~ /^SDSU /;
    $title = substr($title, 27) . ", " . substr($title, 0, 26)
        if $title =~ /^San Diego State University /i;

    ##
    ## The sort key is the title with A-Z folded to lower case.
    ## Titles with the same sort key share one %titles slot; the
    ## entry stored last replaces earlier ones.
    ##
    (my $sort_key = $title) =~ tr/A-Z/a-z/;
    $titles{$sort_key} = "$title\001$doc_key";
    push(@unsorted, $sort_key);
}
##
## Write phase: produce the main index (index.html) and one page per
## initial letter (indexA.html, indexB.html, ...) from the sort keys
## in @unsorted and the "title\001key" entries in %titles.
##
## $current holds the upper-case letter of the page that is open on
## handle F; the initial value " " means no letter page is open yet.
##
$current = " ";
open(M, ">index.html") || die "Unable to create index.html: $!";
print M "\nIndex of all documents at SDSU\n";
print M "\n";
print M &www_logo_2("Index of all documents at SDSU");
print M "This is a list of WWW documents that were found while indexing all\n";
print M "the publicly available WWW servers at San Diego State University.\n";
print M "The documents are indexed by their titles.\n";
print M "\n\n";
$previous = "";
print "Writing...\n";
foreach $value (sort @unsorted)
{
    ##
    ## @unsorted can hold the same sort key more than once while
    ## %titles has a single entry per key, so each key is emitted once.
    ##
    next if $value eq $previous;
    $previous = $value;

    ## Only titles that start with a letter are indexed.
    next if !($value =~ /^[a-zA-Z]/);

    ## $url is split off here but is not referenced by the output below.
    ($title, $url) = split('\001', $titles{$value}, 2);

    ## The check above guarantees the first character is a letter,
    ## so a plain upper-case comparison identifies the letter page.
    $first = substr($title, 0, 1);
    $first =~ tr/a-z/A-Z/;
    if ($first eq $current)
    {
        print F "\n$title\n";
    }
    else
    {
        ##
        ## New letter.  Finish the previous letter page, but only if
        ## one was actually opened; then open a new file for it
        ##
        if ($current ne " ")
        {
            print F "\n";
            close(F) || die "Unable to write index$current.html: $!";
        }
        $current = $first;
        open(F, ">index$current.html")
            || die "Unable to create index$current.html: $!";
        print F "Index for $current\n";
        print F "\n";
        print F &www_logo_2("Index for $current");
        print F "\n";
        print F "- $title\n\n";
        ##
        ## Add a reference to the main index for this letter
        ##
        print M " $current\n";
        print "Index of $current\n";
    }
}
##
## Finish the last letter page with the same footer line the other
## pages get.  Nothing to do when no page was ever opened.
##
if ($current ne " ")
{
    print F "\n";
    close(F) || die "Unable to write index$current.html: $!";
}
print M "\n";
close(M) || die "Unable to write index.html: $!";
##
## parse_ref_record(VALUE)
##
## Decode one packed document record and return it as a hash (flat
## key/value list) of named fields.
##
## VALUE is a byte string of consecutive fields.  Every field starts
## with a one-byte type code followed by a payload whose layout
## depends on the code:
##
##   integer field   one native int (unpack "i"), consumed as 4 bytes
##   string field    a native int length, then that many bytes
##   list field      a native int count, then that many
##                   length-prefixed strings
##
## A byte that is not a known type code is skipped and parsing goes on
## with the next byte.  Strings are extracted with the "A" template,
## so trailing spaces and NULs are stripped from them.  The strings of
## a list field are concatenated into a single value.
##
## Side effect: prints a "title = ..." line to STDOUT for every call.
##
sub parse_ref_record
{
    my($value) = @_;
    my(%rec, $what, $name, $length, $count, @items);

    ##
    ## Type code => key under which the field is returned.  The tables
    ## are built inside the sub because this definition sits after the
    ## main program: a file-level initialisation placed here would not
    ## have run yet when the main program calls the sub.
    ##
    my %int_field = (
        0  => "ID",
        1  => "TIME",
        2  => "ACCESSED",
        3  => "STATE",
        4  => "SIZE",
        5  => "LINKS",
        6  => "IMAGESIZE",
        7  => "HOPCOUNT",
        18 => "BACKLINKS",
        19 => "SIG",
    );
    my %string_field = (
        8  => "URL",
        9  => "HEAD",
        10 => "TITLE",
        13 => "EMAIL",
        14 => "NOTIFICATION",
        15 => "SUBJECT",
        16 => "STRING",
        17 => "METADSC",
    );
    my %list_field = (
        11 => "DESCRIPTIONS",
        12 => "ANCHORS",
    );

    while (length($value) > 0)
    {
        ## Consume the one-byte type code.
        $what = unpack("C", $value);
        $value = substr($value, 1);

        if (defined($name = $int_field{$what}))
        {
            $rec{$name} = unpack("i", $value);
            $value = substr($value, 4);
        }
        elsif (defined($name = $string_field{$what}))
        {
            $length = unpack("i", $value);
            $rec{$name} = unpack("x4 A$length", $value);
            $value = substr($value, 4 + $length);
        }
        elsif (defined($name = $list_field{$what}))
        {
            $count = unpack("i", $value);
            $value = substr($value, 4);
            @items = ();
            foreach (1 .. $count)
            {
                $length = unpack("i", $value);
                push(@items, scalar(unpack("x4 A$length", $value)));
                $value = substr($value, 4 + $length);
            }
            ##
            ## The entries are joined with an empty separator, which is
            ## what the previous code appended after each entry.  That
            ## code then chopped one character off the result, which
            ## with an empty separator cut off the last character of
            ## the data; join needs no such trimming.
            ## NOTE(review): the old chop suggests a one-character
            ## separator was intended -- confirm and set it here.
            ##
            $rec{$name} = join("", @items);
        }
        ## Any other code: the byte was already consumed; keep going.
    }
    print "title = $rec{'TITLE'}\n";
    return %rec;
}