#!/usr/bin/perl

# dcbot.pl
#
# Creator: andy powell <a.powell@ukoln.ac.uk>
# [bodged around a bit by Martin Hamilton <bodger@martinh.net>]
#
# $Id: dcbot.pl,v 1.4 2002/09/30 12:04:54 martin Exp $
# (For copyright statement and acknowledgements see end)
#
# Perl script to generate Dublin Core elements in various formats for the
# Web page identified by user-supplied URL.  Some tags are generated
# automatically.

require HTML::Parser;
package P; @ISA = qw(HTML::Parser);
use LWP::Simple;
use LWP::UserAgent;
use HTML::Entities ();
use URI::URL;
use MD5;
use Metadata::Marc;

$ENV{'HARVEST_HOME'} = "/usr/local/harvest" if (!defined($ENV{'HARVEST_HOME'}));
unshift(@INC, "$ENV{'HARVEST_HOME'}/lib");      # use local files
require 'soif.pl';
use Time::Local;

use Getopt::Std;
getopts("df:l:pu:w:");
# set from the command line...
$debug = $opt_d || 0;
$guesspublisher = $opt_p || 0;
$ldat = $opt_l || "ldat"; # MS file parser
$output_format = $opt_f;
$url = $opt_u;
$whoiscmd = $opt_w || "whois -h whois.arin.net"; # if -p

# List of tags that go into DC.Subject...
@keytags = (
    'b',
    'strong',
    'i',
    'a',
    'em',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
);

# Keyword stop-list
@stoplist = (
'a', 'about', 'above', 'across', 'after', 'again', 'against', 'all', 'almost',
'alone', 'along', 'already', 'also', 'although', 'always', 'among', 'an',
'and', 'another', 'any', 'anybody', 'anyone', 'anything', 'anywhere',
'are', 'area', 'areas', 'around', 'as', 'ask', 'asked', 'asking',
'asks', 'at', 'away', 'b', 'back', 'backed', 'backing', 'backs', 'be',
'became', 'because', 'become', 'becomes', 'been', 'before', 'began',
'behind', 'being', 'beings', 'best', 'better', 'between', 'big', 'both',
'but', 'by', 'c', 'came', 'can', 'cannot', 'case', 'cases', 'certain',
'certainly', 'clear', 'clearly', 'come', 'could', 'd', 'did', 'differ',
'different', 'differently', 'do', 'does', 'done', 'down', 'down', 'downed',
'downing', 'downs', 'during', 'e', 'each', 'early', 'either', 'end',
'ended', 'ending', 'ends', 'enough', 'even', 'evenly', 'ever', 'every',
'everybody', 'everyone', 'everything', 'everywhere', 'f', 'face', 'faces',
'fact', 'facts', 'far', 'felt', 'few', 'find', 'finds', 'first', 'for',
'four', 'from', 'full', 'fully', 'further', 'furthered', 'furthering',
'furthers', 'g', 'gave', 'general', 'generally', 'get', 'gets', 'give',
'given', 'gives', 'go', 'going', 'good', 'goods', 'got', 'great', 'greater',
'greatest', 'group', 'grouped', 'grouping', 'groups', 'h', 'had', 'has',
'have', 'having', 'he', 'her', 'here', 'herself', 'high', 'high',
'high', 'higher', 'highest', 'him', 'himself', 'his', 'how', 'however',
'i', 'if', 'important', 'in', 'interest', 'interested', 'interesting',
'interests', 'into', 'is', 'it', 'its', 'itself', 'j', 'just', 'k',
'keep', 'keeps', 'kind', 'knew', 'know', 'known', 'knows', 'l', 'large',
'largely', 'last', 'later', 'latest', 'least', 'less', 'let', 'lets',
'like', 'likely', 'long', 'longer', 'longest', 'm', 'made', 'make', 'making',
'man', 'many', 'may', 'me', 'member', 'members', 'men', 'might', 'more',
'most', 'mostly', 'mr', 'mrs', 'much', 'must', 'my', 'myself', 'n',
'necessary', 'need', 'needed', 'needing', 'needs', 'never', 'new',
'new', 'newer', 'newest', 'next', 'no', 'nobody', 'non', 'noone', 'not',
'nothing', 'now', 'nowhere', 'number', 'numbers', 'o', 'of', 'off',
'often', 'old', 'older', 'oldest', 'on', 'once', 'one', 'only', 'open',
'opened', 'opening', 'opens', 'or', 'order', 'ordered', 'ordering',
'orders', 'other', 'others', 'our', 'out', 'over', 'p', 'part', 'parted',
'parting', 'parts', 'per', 'perhaps', 'place', 'places', 'point', 'pointed',
'pointing', 'points', 'possible', 'present', 'presented', 'presenting',
'presents', 'problem', 'problems', 'put', 'puts', 'q', 'quite', 'r',
'rather', 'really', 'right', 'right', 'room', 'rooms', 's', 'said', 'same',
'saw', 'say', 'says', 'second', 'seconds', 'see', 'seem', 'seemed',
'seeming', 'seems', 'sees', 'several', 'shall', 'she', 'should', 'show',
'showed', 'showing', 'shows', 'side', 'sides', 'since', 'small', 'smaller',
'smallest', 'so', 'some', 'somebody', 'someone', 'something', 'somewhere',
'state', 'states', 'still', 'still', 'such', 'sure', 't', 'take', 'taken',
'than', 'that', 'the', 'their', 'them', 'then', 'there', 'therefore',
'these', 'they', 'thing', 'things', 'think', 'thinks', 'this', 'those',
'though', 'thought', 'thoughts', 'three', 'through', 'thus', 'to', 'today',
'together', 'too', 'took', 'toward', 'turn', 'turned', 'turning',
'turns', 'two', 'u', 'under', 'until', 'up', 'upon', 'us', 'use',
'used', 'uses', 'v', 'very', 'w', 'want', 'wanted', 'wanting', 'wants',
'was', 'way', 'ways', 'we', 'well', 'wells', 'went', 'were', 'what',
'when', 'where', 'whether', 'which', 'while', 'who', 'whole', 'whose',
'why', 'will', 'with', 'within', 'without', 'work', 'worked', 'working',
'works', 'would', 'x', 'y', 'year', 'years', 'yet', 'you',
'young', 'younger', 'youngest', 'your', 'yours', 'z'
);

# Maximum number of auto-generated keyword terms...
$maxterms = 50;

# <meta> tags that describe DC-dot
$metatags = <<EOF;
<meta name="DC.Title" content="DC-dot">
<meta name="DC.Creator" content="Andy Powell - a.powell\@ukoln.ac.uk">
<meta name="DC.Subject" content="Dublin Core; DC; generator; editor;
Warwick Framework; SOIF; TEI; USMARC; XML; GILS; ROADS; RDF; IMS">
<meta name="DC.Description" content="A Web-based Dublin Core metadata
editor">
<meta name="Keywords" content="Dublin Core; DC; generator; editor;
Warwick Framework; SOIF; TEI; USMARC; XML; GILS; ROADS; RDF; IMS">
<meta name="Description" content="A Web-based Dublin Core metadata
editor">
<meta name="DC.Format" content="text/html">
<meta name="DC.Identifier" content="http://www.ukoln.ac.uk/metadata/dcdot/">
<meta name="DC.Publisher" content="UKOLN, University of Bath">
<meta name="DC.Rights" content="http://www.ukoln.ac.uk/metadata/dcdot/COPYING">
EOF

# Acknowledgements
$ack = <<EOF;
DC-dot was developed by UKOLN, University of Bath.
<p>
UKOLN is funded by Resource: The Council for Museums, Archives & Libraries,
<br>
the Joint Information Systems Committee of the Higher Education Funding Councils,
<br>
as well as by project funding from the JISC and the European Union.
<br>
UKOLN also receives support from the University of Bath where it is based.
EOF

# end of configurable stuff...

# Version
$version = 'DC-dot $Id: dcbot.pl,v 1.4 2002/09/30 12:04:54 martin Exp $';

print STDERR "url = $url\n" if $debug;

$dc'identifier = $url;
    
#$LWP::Simple::ua->proxy(['http', 'ftp'], "http://wwwcache.lut.ac.uk:3128/");
#$LWP::Simple::ua->agent('DC-dot/1.0');
@head = head($url);
$dc'format = shift(@head);
$mimetype = $dc'format;
$dc'type = 'Text' if ($dc'format =~ /^text\//);
$dc'type = 'Dataset' if ($dc'format =~ /^application\//);
$ukoln'size = shift(@head) if ($#head == 3); # head() doesn't always return
                                             # an array of 5 elements?!
$date = shift(@head);
print STDERR "get($url)\n" if $debug;
$html = get($url);
$ukoln'size = length($html) unless ($ukoln'size);
print "Got HTML - ". $ukoln'size . " bytes\n" if $debug;
&error('Page not available') unless $html;

$dc'format .= " || $ukoln'size bytes" if ($ukoln'size);

if ($date) {
    @date = localtime($date);
    $date[5] += 1900;
    $date[5] += 100 if ($date[5] < 1970);
    $date[4]++;
    $date[4] = '0' . $date[4] if ($date[4] < 10);
    $date[3] = '0' . $date[3] if ($date[3] < 10);
    $dc'date = $date[5] . '-' . $date[4] . '-' . $date[3];
}
#$dc'date = localtime($date) if ($date);
#print "$dc'form\n$dc'date\n";
    
$ukoln'md5 = &MakeMD5($html);
    
if ($mimetype eq 'application/powerpoint' || $mimetype eq 'application/msword') {
    &ParseMS($html);
    $dc'type = 'Text';
} elsif ($mimetype eq 'application/pdf') {
    &ParsePDF($html);
    $dc'type = 'Text';
} elsif ($mimetype =~ /text\/html/) {
    # Parse the HTML file
    #
    @warn = (); # set up array for warning messages
    $emetacount = 0; # Count and array for existing <meta> tags
    @emeta = ();
    $p = new P;
    $html =~ s/\s*\/>/>/g; # horrible hack to convert XHTML back to HTML!
    $p->parse($html);
}

$s = $ukoln'size;

# Clean up <meta> tag text
foreach (keys %meta) {
    $meta{$_} =~ s/\s+/ /gs;
}

# Extract DC element values
#
$dc'title = shift(@headings);
$dc'title = join(" ", @dc'title) if @dc'title;
$dc'title = $meta{'dc.title'} if $meta{'dc.title'};
$dc'creator = $meta{'author'} if $meta{'author'};
$dc'creator = $meta{'dc.author'} if $meta{'dc.author'};
$dc'creator = $meta{'dc.creator'} if $meta{'dc.creator'};
#    $dc'creator'email = $meta{'author.email'} if $meta{'author.email'};
#    $dc'creator'email = $meta{'dc.author.email'} if $meta{'dc.author.email'};
#    $dc'creator'email = $meta{'dc.creator.email'} if $meta{'dc.creator.email'};

foreach $k (keys %dc'subject) {
    next if ($k =~ /^http:\/\//);
    next if ($k =~ /^ftp:\/\//);
    next if ($k =~ /^gopher:\/\//);
    next if ($k =~ /^mailto:/);

    # strip leading and trailing punctuation...
    $k =~ s/^[^a-zA-Z0-9]*//;
    $k =~ s/[^a-zA-Z0-9]*$//;
    next if eval " grep(/^$k\$/i, \@stoplist); ";
    push(@dc'subject, $k);
}

if ($#dc'subject > $maxterms) {
    foreach $k (@dc'subject) {
        foreach $t (split(/ /, $k)) {
	    $t =~ s/^[^a-zA-Z0-9]*//;
            $t =~ s/[^a-zA-Z0-9]*$//;
            next if ($t =~ /'/);
            next if ($t =~ /"/);
            next if ($t =~ /^\s*$/);
            next if ($t =~ /^[0-9,\.\s-]*$/);
            next if eval " grep(/^$t\$/i, \@stoplist); ";
            $newsubject{$t}++;
        }
    }

    undef @dc'subject;
    foreach $k (keys %newsubject) {
        push(@dc'subject, $k);
    }
}

@dc'subject = @metakeywords if (@metakeywords);
$dc'subject = join("; ", @dc'subject) if (@dc'subject);
$dc'subject = $meta{'dc.subject'} if $meta{'dc.subject'};
$dc'description = $meta{'description'} if $meta{'description'};
$dc'description = $meta{'dc.description'} if $meta{'dc.description'};
if ($meta{'dc.publisher'}) {
    $dc'publisher = $meta{'dc.publisher'};
} else {
    if ($whoiscmd && $guesspublisher) {
        $host = url($url)->host;
        ($name, $aliases, $addrtype, $length, @addrs) = gethostbyname($host);
        ($a, $b, $c, $d) = unpack('C4', $addrs[0]);
        $b = $c = $d = 0 if ($a >= 1 && $a <= 127);
        $c = $d = 0 if ($a >= 128 && $a <= 191);
        $d = 0 if ($a >= 192 && $a <= 223);

        print STDERR "Running... $whoiscmd $a.$b.$c.$d\n" if $debug;
        if (open(WHOIS, "$whoiscmd $a.$b.$c.$d |")) {
            $whois = <WHOIS>; # only need the first line...
            print STDERR "Result... $whois\n" if ($debug);
            close(WHOIS);
            # No match for "138.38.32.1".
            # Bath University (NET-BATH-AC-UK)
            undef $whois if ($whois =~ /^No match for/);
            $whois =~ s/\s*\(.*\)\s*//;
        }
    }
    $dc'publisher = "$whois" if ($whois);
}

$dc'contributor = $meta{'dc.contributor'} if $meta{'dc.contributor'};
$dc'date = $meta{'dc.date'} if $meta{'dc.date'};
$dc'type = $meta{'dc.type'} if $meta{'dc.type'};
$dc'format = $meta{'dc.format'} if $meta{'dc.format'};
$dc'identifier = $meta{'dc.identifier'} if $meta{'dc.identifier'};
$dc'source = $meta{'dc.source'} if $meta{'dc.source'};
$dc'language = $meta{'dc.language'} if $meta{'dc.language'};
$dc'relation'isparentof = join(', ', @dc'relation'isparentof);
#    $dc'relation'isparentof = $meta{'dc.relation.isparentof'} if $meta{'dc.relation.isparentof'};
#    $dc'relation'ischildof = $meta{'dc.relation.ischildof'} if $meta{'dc.relation.ischildof'};
$dc'relation = $meta{'dc.relation'} if $meta{'dc.relation'};
$dc'coverage = $meta{'dc.coverage'} if $meta{'dc.coverage'};
$dc'rights = $meta{'dc.rights'} if $meta{'dc.rights'};

&dump_outputs;

exit 0;


# Error during auto-generation of metadata
#
sub error {
    local($msg) = @_;

    print <<EOF;
Sorry, an error occured while processing your request.
Error: $msg
EOF
    exit 1;
}

# Called for each opening HTML tag
#
sub start
{
    my($self, $tag, $attr) = @_;
    my($name, $content);
    print STDERR "<$tag>\n" if $debug;
    if ($tag eq 'title') {
	$titletext++;
    }
    if ($tag =~ /^h[1-6]$/) {
        $headingstext++;
    }
    if (grep(/^$tag$/, @keytags)) {
	$keywordstext++;
    }
    if ($tag eq 'meta') {
	for (keys %$attr) {
	    $name = $attr->{$_} if (/^name$/i);
	    $httpequiv = $attr->{$_} if (/^http-equiv$/i);
	    $content = $attr->{$_} if (/^content$/i);
	}

	# keywords is the only http-equiv <meta> tag we're interested in...
	$httpequiv =~ tr/A-Z/a-z/;
	if ($httpequiv eq 'keywords') {
	    push(@metakeywords, $content) unless (@metakeywords);
	    next;
	}
	next unless $name;

	$emetacount++;
	push(@emeta, $name);
	&Validate($name, $content);
	$name =~ tr/A-Z/a-z/; # ignore case of name

	if ($content =~ /\(type=email\)/i) {
	    $content =~ s/\s*\(type=email\)\s*//i;
	    $name .= '.email';
	}
	if ($content =~ /\(type=name\)/i) {
	    $content =~ s/\s*\(type=name\)\s*//i;
	}
	if ($name eq 'dc.identifier' && $content =~ /\(type=url\)/i) {
	    $content =~ s/\s*\(type=url\)\s*//i;
	}
	if ($name eq 'keywords') { # Treat this specially - we'll use it
				   # later to over-ride generated value
	    push(@metakeywords, split(/\s*,\s*/, $content));
	}
	else {
	    push(@metaname, $name) unless $meta{$name};
	    $meta{$name} .= " || " if $meta{$name};
	    $meta{$name} .= "$content\n";
	}
    }
    if ($tag eq 'a') {
	for (keys %$attr) {
	    $href = $attr->{$_} if (/^href$/i);
	}
	push(@dc'relation'isparentof, $href);
    }
    if ($tag eq 'script') {
	$inscript++;
    }
}

# Called for each closing HTML tag
#
sub end
{
    my($self, $tag) = @_;
    print STDERR "</$tag>\n" if $debug;
    if ($tag eq 'title') {
	$titletext--;
	undef $titletext if ($titletext < 0);
    }
    if ($tag =~ /^h[1-6]$/) {
	$headingstext--;
	undef $headingstext if ($headingstext < 0);
    }
    if (grep(/^$tag$/, @keytags)) {
	$keywordstext--;
	undef $keywordstext if ($keywordstext < 0);
    }
    if ($tag eq 'script') {
	$inscript--;
	undef $inscript if ($inscript < 0);
    }
}

# Called for each bit of text
#
sub text
{
    my $self = shift;
    my $t = $_[0];
    return if $inscript; # Chuck out any script text
    # Zap embedded CTRL-M's
    $t =~ s/\x0d/\n/mg;
    # Strip leading newline
    $t =~ s/^\n+//m;
    # Multiple spaces or tabs to single space...
    $t =~ s/[ \t]+/ /mg;
    # Multiple newlines to single newline...
    $t =~ s/\n+/\n/mg;
    # Throw out whitespace only lines
    return if ($t =~ /^\s*$/);
    $t = HTML::Entities::decode_entities($t);
    if ($t) {
        # Remove leading and trailing whitespace...
        $t =~ s/^\s*//;
        $t =~ s/\s*$//;
        push(@dc'title, $t) if $titletext;
        push(@headings, $t) if $headingstext;
        $dc'subject{$t}++ if $keywordstext;
        push(@dc'relation'isparentof, $t) if $urltext;
    }
}

# Called for each HTML comment
#
sub comment
{
}

# Save SOIF record
#
sub SaveSOIF {
    return unless ($soifdir);
    my $md5 = &MakeMD5($dc'identifier);
    if (open(SOIF, ">$soifdir/$md5.soif")) {
	$soif'output = \*SOIF;
	&WriteSOIFRecord;
	close(SOIF);
	$soif'output = \*STDOUT;
    }
}

# Generate MD5 hash of string
#
sub MakeMD5
{
    my ($str) = @_;
    my ($digest, $hex);
    my $md5 = new MD5;

    $md5->add($str);
    $digest = $md5->digest();
    $hex = unpack("H*", $digest);

    return($hex);
}

# Write out a SOIF record
#
sub WriteSOIFRecord {
    print $soif'output "\@FILE { $dc'identifier\n";
    soif'print_item('DC.Title', $dc'title);
    soif'print_item('DC.Creator', $dc'creator);
#    soif'print_item('DC.Creator.Address', $dc'creator'email);
    soif'print_item('DC.Subject', $dc'subject);
    soif'print_item('DC.Description', $dc'description);
    soif'print_item('DC.Publisher', $dc'publisher);
    soif'print_item('DC.Contributor', $dc'contributor);
    soif'print_item('DC.Date', $dc'date);
    soif'print_item('DC.Type', $dc'type);
    soif'print_item('DC.Format', $dc'format);
    soif'print_item('DC.Identifier', $dc'identifier);
    soif'print_item('DC.Source', $dc'source);
    soif'print_item('DC.Language', $dc'language);
    #soif'print_item('DC.relation.isparentof', $dc'relation'isparentof);
    #soif'print_item('DC.relation.ischildof', $dc'relation'ischildof);
    soif'print_item('DC.Relation', $dc'relation);
    soif'print_item('DC.Coverage', $dc'coverage);
    soif'print_item('DC.Rights', $dc'rights);
    soif'print_item('UKOLN.Size', $ukoln'size);
    soif'print_item('UKOLN.MD5', $ukoln'md5);
    print $soif'output "}\n";
}

# Write out a SOIF record suitable for use by Harvest
#
sub WriteHarvestSOIFRecord {
    print "\@FILE { $dc'identifier\n";
    #soif'print_item('Update-Time', $dc'date);
    soif'print_item('Description', $dc'description);
    if ($dc'date =~ /^\d\d\d\d\d\d\d\d$/ || $dc'date =~ /^\d\d\d\d-\d\d-\d\d$/) {
	my $date = $dc'date;
	$date =~ s/-//g;
	$year = substr($date, 0, 4);
	$mon = substr($date, 4, 2);
	$mon--;
	$mday = substr($date, 6, 2);
	if ($year > 1900 && $year < 2999 && $mon >= 0 && $mon < 12 &&
	    $mday > 0 && $mday <= 31) {
	    $year -= 1900;
	    my $time = &timelocal((0,0,0,$mday,$mon,$year,0,0,0));
	    soif'print_item('Last-Modification-Time', $time);
	}
    }
    soif'print_item('Time-to-Live', '2419200');
    soif'print_item('Refresh-Rate', '604800');
    soif'print_item('Gatherer-Name', 'DC-dot');
    soif'print_item('Type', $dc'format);
    soif'print_item('File-Size', $ukoln'size);
    soif'print_item('MD5', $ukoln'md5);
    my $keywords = $dc'subject;
    $keywords =~ s/^\(SCHEME=[^)]*\)\s*//i;
    $keywords =~ s/\s*,\s*/\n/g;
    soif'print_item('Keywords', $keywords);
    soif'print_item('Title', $dc'title);
    soif'print_item('Creator', $dc'creator);
#    soif'print_item('Creator-Email', $dc'creator'email);
    soif'print_item('Publisher', $dc'publisher);
    soif'print_item('Contributor', $dc'contributor);
    soif'print_item('Type', $dc'type);
    soif'print_item('Source', $dc'source);
    soif'print_item('Language', $dc'language);
    soif'print_item('Relation', $dc'relation);
    soif'print_item('Coverage', $dc'coverage);
    soif'print_item('Rights', $dc'rights);
    print "}\n";
}

# Validate an existing <meta> tag
#
sub Validate {
    my ($name, $content) = @_;
    @dc_elements = ('Title', 'Creator', 'Subject', 'Description', 'Publisher',
	'Contributor', 'Date', 'Type', 'Format', 'Identifier', 'Source',
	'Language', 'Relation', 'Coverage', 'Rights');

    @nc = split(/\./, $name); # split name into components
    # General
    if (length($content) > 1024) {
	push(@warn, "$name - long CONTENT (>1024 bytes), may cause problems for some HTML tools (warning)");
    }

    # Dublin Core
    if ($nc[0] =~ /dc/i) {
	push(@warn, "$name - \"$nc[0]\" should be written \"DC\" (warning)") unless ($nc[0] eq 'DC');
	if (grep(/^$nc[1]$/i, @dc_elements)) {
	    push(@warn, "$name - incorrect case for element name - use uppercase first letter for \"$nc[1]\" (warning)") unless (grep(/^$nc[1]$/, @dc_elements));
	}
	else {
	    push(@warn, "$name - \"$nc[1]\" is not a valid DC element (error)");
	}
	if ($nc[2]) {
	    push(@warn, "$name - use of \"$nc[2]\" sub-element in \"$name\" may be non-standard (warning)");
	}
	if ($nc[1] =~ /^subject$/i) {
	    push(@warn, "$name - words/phrases possibly not separated by semi-colons (warning)") unless ($content =~ /;/);
	}
	if ($nc[1] =~ /^identifier$/i && $content =~ /^http:\/\//) {
	    push(@warn, "$name - embedded URL different from resource URL (warning)") unless ($content eq $url);
	}
	if ($nc[1] =~ /^date$/i) {
	    unless ($content =~ /^\d\d\d\d-\d\d-\d\d$/ ||
		    $content =~ /^\d\d\d\d-\d\d$/ ||
		    $content =~ /^\d\d\d\d$/) {
		push(@warn, "$name - date possibly not in ISO 8601 format");
	    }
	}
    }
}

# Write out a USMARC record
#
sub WriteUSMARCRecord {
    $MarcRecord->{data}{245}[$count245++]="00\x1f"."a$dc'title" if $dc'title;
    $MarcRecord->{data}{110}[$count110++]="00\x1f"."a$dc'creator" if $dc'creator;
# No place for email
#    $dc'creator'email;
    $MarcRecord->{data}{653}[$count653++] = "00\x1f"."a$dc'subject" if $dc'subject;
    $MarcRecord->{data}{520}[$descriptioncount++] = "00\x1f"."a$dc'description" if $dc'description;
    $MarcRecord->{data}{260}[$count260++] = "00\x1f"."b$dc'publisher" if $dc'publisher;
    $MarcRecord->{data}{720}[$count720++]="00\x1f"."a$dc'contributor" if $dc'contributor;
    $MarcRecord->{data}{260}[$count260++]="00\x1f"."c$dc'date" if $dc'date;
    $MarcRecord->{data}{516}[$count516++]="00\x1f"."a$dc'type" if $dc'type;
    $MarcRecord->{data}{516}[$count516++]="00\x1f"."a$dc'format" if $dc'format;
    $MarcRecord->{data}{"856"}[$count856++]="70\x1f"."u$dc'identifier" if $dc'identifier;
    $MarcRecord->{data}{"786"}[$count786++]="00\x1f"."t$dc'source" if $dc'source;
    $MarcRecord->{data}{"546"}[$count546++]="00\x1f"."a$dc'language" if $dc'language;
#    $dc'relation'isparentof;
#    $dc'relation'ischildof;
    $MarcRecord->{data}{"787"}[$count787++]="00\x1f"."n$dc'relation" if $dc'relation;
    $MarcRecord->{data}{"500"}[$count500++]="00\x1f"."a$dc'coverage" if $dc'coverage;
    $MarcRecord->{data}{"506"}[$count506++]="00\x1f"."a$dc'rights" if $dc'rights;
    $MarcRecord->{marc_type} = 'USMARC';
    &WriteMarcRecord(*STDOUT);
}

# Write out a UNIMARC record
# Not correct!!!  DO NOT USE!!!!
#
sub WriteUNIMARCRecord {
    $MarcRecord->{data}{245}[$count245++]="00\x1e"."a$dc'title" if $dc'title;
    $MarcRecord->{data}{110}[$count110++]="00\x1e"."a$dc'creator" if $dc'creator;
# No place for email
#    $dc'creator'email;
    $MarcRecord->{data}{653}[$count653++] = "00\x1e"."a$dc'subject" if $dc'subject;
    $MarcRecord->{data}{520}[$descriptioncount++] = "00\x1e"."a$dc'description" if $dc'description;
    $MarcRecord->{data}{260}[$count260++] = "00\x1e"."b$dc'publisher" if $dc'publisher;
    $MarcRecord->{data}{720}[$count720++]="00\x1e"."a$dc'contributor" if $dc'contributor;
    $MarcRecord->{data}{260}[$count260++]="00\x1e"."c$dc'date" if $dc'date;
    $MarcRecord->{data}{516}[$count516++]="00\x1e"."a$dc'type" if $dc'type;
    $MarcRecord->{data}{516}[$count516++]="00\x1e"."a$dc'format" if $dc'format;
    $MarcRecord->{data}{"856"}[$count856++]="70\x1e"."u$dc'identifier" if $dc'identifier;
    $MarcRecord->{data}{"786"}[$count786++]="00\x1e"."t$dc'source" if $dc'source;
    $MarcRecord->{data}{"546"}[$count546++]="00\x1e"."a$dc'language" if $dc'language;
#    $dc'relation'isparentof;
#    $dc'relation'ischildof;
    $MarcRecord->{data}{"787"}[$count787++]="00\x1e"."n$dc'relation" if $dc'relation;
    $MarcRecord->{data}{"500"}[$count500++]="00\x1e"."a$dc'coverage" if $dc'coverage;
    $MarcRecord->{data}{"506"}[$count506++]="00\x1e"."a$dc'rights" if $dc'rights;
    $MarcRecord->{marc_type} = 'UNIMARC';
    &WriteMarcRecord(*STDOUT);
}

# Generate TEI header
#
sub WriteTEIHeader {
    my $notes = "      <note>\n        contributor: $dc'contributor\n      </note>\n" if ($dc'contributor);
    $notes .= "      <note>\n        format: $dc'format\n      </note>\n" if ($dc'format);
    $notes .= "      <note>\n        relation: $dc'relation\n      </note>\n" if ($dc'relation);
    $notes .= "      <note>\n        coverage: $dc'coverage\n      </note>\n" if ($dc'coverage);
    $notes .= "      <note>\n        rights: $dc'rights\n      </note>\n" if ($dc'rights);

    print "<teiHeader>\n";
    if ($dc'title || $dc'creator || $dc'publisher || $dc'date ||
        $dc'identifier || $dc'description || $dc'source || $notes) {
	print "  <fileDesc>\n";
	if ($dc'title || $dc'creator) {
	    print "    <titleStmt>\n";
	    if ($dc'title) {
		print "      <title>\n";
		&fprint("	", $dc'title);
		print "      </title>\n";
	    }
	    if ($dc'creator) {
		print "      <author>\n";
		&fprint("	", $dc'creator);
		print "      </author>\n";
	    }
	    print "    </titleStmt>\n";
	}
	if ($ukoln'size) {
	    print "    <extent>\n      $ukoln'size\n    </extent>\n";
	}
	if ($dc'publisher || $dc'date || $dc'identifier) {
	    print "    <publicationStmt>\n";
	    if ($dc'publisher) {
		print "      <publisher>\n";
		&fprint("	", $dc'publisher);
		print "      </publisher>\n";
	    }
	    if ($dc'date) {
		print "      <date>\n";
		&fprint("	", $dc'date);
		print "      </date>\n";
	    }
	    if ($dc'identifier) {
		print "      <idno>\n";
		&fprint("	", $dc'identifier);
		print "      </idno>\n";
	    }
	    print "    </publicationStmt>\n";
	}
	if ($dc'description || $dc'source) {
	    print "    <sourceDesc>\n";
	    if ($dc'description) {
		print "      <description>\n";
		&fprint("	", $dc'description);
		print "      </description>\n";
	    }
	    if ($dc'source) {
		print "      <source>\n";
		&fprint("	", $dc'source);
		print "      </source>\n";
	    }
	    print "    </sourceDesc>\n";
	}
	if ($notes) {
	    print "    <notesStmt>\n";
	    print $notes;
	    print "    </notesStmt>\n";
	}
	print "  </fileDesc>\n";
    }
    if ($dc'language || $dc'type || $dc'subject) {
	print "  <profileDesc>\n";
	if ($dc'language) {
	    print "    <langUsage>\n";
	    print "      $dc'language\n";
	    print "    </langUsage>\n";
	}
	if ($dc'type || $dc'subject) {
	    print "    <textClass>\n";
	    print "      <keywords>\n";
	    print "        <list>\n";
	    foreach ($dc'type, split(/\s*,\s*/, $dc'type)) {
		&fprint("          ", "<item>$_</item>");
	    }
	    print "        </list>\n";
	    print "      </keywords>\n";
	    print "    </textClass>\n";
	}
	print "  </profileDesc>\n";
    }
    print "</teiHeader>\n";
}

# Formated print
#
sub fprint {
    my ($prefix, $val) = @_;

    eval "format STDOUT =
$prefix^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
\$val
.
";
    write;
}

# Write out a ROADS/IAFA record based on the DOCUMENT template
#
sub WriteIAFADocument {
    my $md5 = &MakeMD5($dc'identifier);
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdat) = gmtime();
    my $day = (Sun,Mon,Tue,Wed,Thu,Fri,Sat)[$wday];
    my $month = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$mon];
#    my $entrydate = sprintf ("%s, %2.2d %s 19%2.2d %2.2d:%2.2d:%2.2d +0000",
#        ($day,$mday,$month,$year,$hour,$min,$sec));
    $year += 1900;
    $year += 100 if ($year < 1970);
    my $entrydate = "$mday $month $year";
    my $user = $ENV{REMOTE_USER} || $ENV{REMOTE_IDENT} || 'unknown';
    my $useremail = "$user\@$ENV{REMOTE_HOST}";

    print "Template-Type: DOCUMENT\n";
    print "Handle: $md5\n";
    print "Title: $dc'title\n" if $dc'title;
    print "Category: $dc'type\n" if $dc'type;
    print "URI-v1: $dc'identifier\n";
    print "Author-Name-v1: $dc'creator\n" if $dc'creator;
#    print "Author-Email-v1: $dc'creator'email\n" if $dc'creator'email;
    print "Source: $dc'source\n" if $dc'source;
    if ($dc'description) {
	print "Description:";
	&fprint(" ", $dc'description);
    }
    print "Publisher-Name-v1: $dc'publisher\n" if $dc'publisher;
    print "Copyright: $dc'rights\n" if $dc'rights;
    if ($dc'subject) {
	print "Keywords:";
	&fprint(" ", $dc'subject);
    }
    print "Format-v1: $dc'format\n" if $dc'format;
    print "Size-v1: $ukoln'size\n" if $ukoln'size;
    print "Language-v1: $dc'language\n" if $dc'language;
    # Need to do something with date format here!
    # Change to... 18 Jun 1993
    if ($dc'date =~ /^\d\d\d\d\d\d\d\d$/ || $dc'date =~ /^\d\d\d\d-\d\d-\d\d$/) {
	my $date = $dc'date;
	$date =~ s/-//g;
	$year = substr($date, 0, 4);
	$mon = substr($date, 4, 2);
	$mon--;
	$mday = substr($date, 6, 2);
	if ($year > 1900 && $year < 2999 && $mon >= 0 && $mon < 12 &&
	    $mday > 0 && $mday <= 31) {
	    $month = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$mon];
	    $dc'date = "$mday $month $year";
	}
    }
    print "Last-Revision-Date-v1: $dc'date\n" if $dc'date;
    my $comment = "Contributor - $dc'contributor. " if $dc'contributor;
    $comment .= "Coverage - $dc'coverage. " if $dc'coverage;
    $comment .= "Relation - $dc'relation. " if $dc'relation;
    if ($comment) {
	print "Comments:";
	&fprint(" ", $comment);
    }
    print "Record-Last-Modified-Date: $entrydate\n";
    print "Record-Last-Modified-Email: $useremail\n";
    print "Record-Created-Date: $entrydate\n";
    print "Record-Created-Email: $useremail\n";
}

# Generate GILS record using SGML-like syntax suitable for loading
# into Zebra.
#
sub WriteGILSRecord {
    print "<gils>\n";
    if ($dc'title) {
	print "  <Title>\n";
	&fprint("    ", $dc'title);
	print "  </Title>\n";
    }
    if ($dc'creator) {
	print "  <Originator>\n";
	&fprint("    ", $dc'creator);
	print "  </Originator>\n";
    }
    if ($dc'contributor) {
	print "  <Contributor>\n";
	&fprint("    ", $dc'contributor);
	print "  </Contributor>\n";
    }
    if ($dc'subject) {
	# Uncontrolled-Term ???
	# Local-Subject-Index ???
	# Subject-Terms-Uncontrolled ???
#	my $subject = $dc'subject;
#	$subject =~ s/,/;/g;
#	print "  <Local-Subject-Index>\n";
#	&fprint("    ", $subject);
#	print "  </Local-Subject-Index>\n";
	print GILS "  <Subject-Terms-Uncontrolled>\n";
	foreach (split(/,\s*/, $Current{'keywords'})) {
	    print "    <Uncontrolled-Term> $_ </Uncontrolled-Term>\n";
	}
	print "  </Subject-Terms-Uncontrolled>\n";
    }
    if ($dc'description) {
	print "  <Abstract>\n";
	&fprint("    ", $dc'description);
	print "  </Abstract>\n";
    }
    if ($dc'publisher || $dc'format || $dc'identifier) {
	print "  <Availability>\n";
	if ($dc'publisher) {
	    print "    <Distributor>\n";
	    print "      <Distributor-Name>\n";
	    &fprint("        ", $dc'publisher);
	    print "      </Distributor-Name>\n";
	    print "    </Distributor>\n";
	}
	if ($dc'format || $dc'identifier) {
	    print "    <Available-Linkage>\n";
	    if ($dc'format) {
	        print "      <Linkage-Type>\n";
	        &fprint("        ", $dc'format);
	        print "      </Linkage-Type>\n";
	    }
	    if ($dc'identifier) {
	        print "      <Linkage>\n";
	        &fprint("        ", $dc'identifier);
	        print "      </Linkage>\n";
	    }
	    print "    </Available-Linkage>\n";
	}
	print "  </Availability>\n";
    }
    if ($dc'date) {
	print "  <Date-of-Publication>\n";
	&fprint("    ", $dc'date);
	print "  </Date-of-Publication>\n";
    }
    if ($dc'type) {
	print "  <Medium>\n";
	&fprint("    ", $dc'type);
	print "  </Medium>\n";
    }
    if ($dc'source) {
	print "  <Sources-of-Data>\n";
	&fprint("    ", $dc'source);
	print "  </Sources-of-Data>\n";
    }
    if ($dc'language) {
	print "  <Language-of-Resource>\n";
	&fprint("    ", $dc'language);
	print "  </Language-of-Resource>\n";
    }
    if ($dc'relation) {
	# Cross-Reference-Linkage ?
	print "  <Cross-Reference>\n";
	print "    <Cross-Reference-Linkage>\n";
	print "      <Linkage>\n";
	&fprint("        ", $dc'relation);
	print "      </Linkage>\n";
	print "    </Cross-Reference-Linkage>\n";
	print "  </Cross-Reference>\n";
    }
    if ($dc'coverage) {
	print "  <Supplemental-Information>\n";
	&fprint("    ", "Coverage: $dc'coverage");
	print "  </Supplemental-Information>\n";
    }
    if ($dc'rights) {
	print "  <Use-Constraints>\n";
	&fprint("    ", $dc'rights);
	print "  </Use-Constraints>\n";
    }
    print "</gils>\n";
}

# Generate DC record in XML
#
sub WriteXML {
    print "<?xml version=\"1.0\"?>\n";
    print <<EOF;
<metadata
  xmlns:dc="http://purl.org/dc/elements/1.1/">
EOF
    if ($dc'title) {
	print "  <dc:title>\n";
	&fprint("    ", $dc'title);
	print "  </dc:title>\n";
    }
    if ($dc'creator) {
	print "  <dc:creator>\n";
	&fprint("    ", $dc'creator);
#	if ($dc'creator'email) {
#	    print "    <Email>\n";
#	    &fprint("      ", $dc'creator'email);
#	    print "    </Email>\n";
#	}
	print "  </dc:creator>\n";
    }
    if ($dc'subject) {
	print "  <dc:subject>\n";
	&fprint("    ", $dc'subject);
	print "  </dc:subject>\n";
    }
    if ($dc'description) {
	print "  <dc:description>\n";
	&fprint("    ", $dc'description);
	print "  </dc:description>\n";
    }
    if ($dc'publisher) {
	print "  <dc:publisher>\n";
	&fprint("    ", $dc'publisher);
	print "  </dc:publisher>\n";
    }
    if ($dc'contributor) {
	print "  <dc:contributor>\n";
	&fprint("    ", $dc'contributor);
	print "  </dc:contributor>\n";
    }
    if ($dc'date) {
	print "  <dc:date>\n";
	&fprint("    ", $dc'date);
	print "  </dc:date>\n";
    }
    if ($dc'type) {
	print "  <dc:type>\n";
	&fprint("    ", $dc'type);
	print "  </dc:type>\n";
    }
    if ($dc'format) {
	print "  <dc:format>\n";
	&fprint("    ", $dc'format);
	print "  </dc:format>\n";
    }
    if ($dc'identifier) {
	print "  <dc:identifier>\n";
	&fprint("    ", $dc'identifier);
	print "  </dc:identifier>\n";
    }
    if ($dc'source) {
	print "  <dc:source>\n";
	&fprint("    ", $dc'source);
	print "  </dc:source>\n";
    }
    if ($dc'language) {
	print "  <dc:language>\n";
	&fprint("    ", $dc'language);
	print "  </dc:language>\n";
    }
    if ($dc'relation) {
	print "  <dc:relation>\n";
	&fprint("    ", $dc'relation);
	print "  </dc:relation>\n";
    }
    if ($dc'coverage) {
	print "  <dc:coverage>\n";
	&fprint("    ", $dc'coverage);
	print "  </dc:coverage>\n";
    }
    if ($dc'rights) {
	print "  <dc:rights>\n";
	&fprint("    ", $dc'rights);
	print "  </dc:rights>\n";
    }
    print "</metadata>\n";
}

# Generate IMS/XML record
#
sub WriteIMS {
    my $l = '<LANGSTRING>';
    my $ll = '<LANGSTRING>';
    my $le = '</LANGSTRING>';
    if ($lang ne "unknown") {
	$ll = '<LANGSTRING lang="'.$lang.'">';
    }

    print <<EOF;
<?xml version=\"1.0\"?>
<!DOCTYPE RECORD SYSTEM "http://www.imsproject.org/XML/IMS-MD01.dtd">
<!--DOCTYPE RECORD SYSTEM "IMS-MD01.dtd" -->
<RECORD xmlns="http://www.imsproject.org/metadata/">
  <METAMETADATA>
    <LANGUAGE>en-UK</LANGUAGE>
  </METAMETADATA>
  <GENERAL>
EOF
    foreach (split(/\s*\|\|\s*/, $dc'title)) {
	print <<EOF;
    <TITLE>
      $ll$_$le
    </TITLE>
EOF
    }
    if ($dc'description) {
	print <<EOF;
    <DESCRIPTION>
      $ll$dc'description$le
    </DESCRIPTION>
EOF
    }
    print "    <KEYWORDS>\n" if $dc'subject;
    foreach (split(/\s*;\s*/, $dc'subject)) {
	print <<EOF;
      $ll$_$le
EOF
    }
    print "    </KEYWORDS>\n" if $dc'subject;
    foreach (split(/\s*\|\|\s*/, $dc'language)) {
	print <<EOF;
    <LANGUAGE>$_</LANGUAGE>
EOF
    }
    foreach (split(/\s*||\s*/, $dc'coverage)) {
	print <<EOF;
    <COVERAGE>
      $l$_$le
    </COVERAGE>
EOF
    }
    print <<EOF;
  </GENERAL>
  <LIFECYCLE>
EOF
    if ($dc'publisher || $dc'date) {
      print <<EOF;
    <CONTRIBUTE>
      <ROLE>
	<LANGSTRING lang="en">Publisher</LANGSTRING>
      </ROLE>
EOF
      foreach (split(/\s*\|\|\s*/, $dc'publisher)) {
	print <<EOF;
      <CENTITY>
	<VCARD>
BEGIN:vCard
ORG:$_
END:vCard
	</VCARD>
      </CENTITY>
EOF
      }
      if ($dc'date) {
	print <<EOF;
      <DATE>
	<DATETIME>$dc'date</DATETIME>
      </DATE>
EOF
      }
      print <<EOF;
    </CONTRIBUTE>
EOF
    }
    foreach (split(/\s*\|\|\s*/, $dc'contributor)) {
	print <<EOF;
    <CONTRIBUTE>
      <ROLE>
	<LANGSTRING lang="en">Contributor</LANGSTRING>
      </ROLE>
      <CENTITY>
	<VCARD>
BEGIN:vCard
FN:$_
END:vCard
	</VCARD>
      </CENTITY>
    </CONTRIBUTE>
EOF
    }
    foreach (split(/\s*\|\|\s*/, $dc'creator)) {
	print <<EOF;
    <CONTRIBUTE>
      <ROLE>
	<LANGSTRING lang="en">Author</LANGSTRING>
      </ROLE>
      <CENTITY>
	<VCARD>
BEGIN:vCard
FN:$_
END:vCard
	</VCARD>
      </CENTITY>
    </CONTRIBUTE>
EOF
    }
    print <<EOF;
  </LIFECYCLE>
  <TECHNICAL>
EOF
    foreach (split(/\s*\|\|\s*/, $dc'format)) {
	next unless (/^text/ || /^application/ || /^image/);
	print <<EOF;
    <FORMAT>$_</FORMAT>
EOF
    }
    foreach (split(/\s*\|\|\s*/, $dc'identifier)) {
	next unless (/^http:/ || /^ftp:/);
	print <<EOF;
    <LOCATION type="URI">$_</LOCATION>
EOF
    }
    print <<EOF;
  </TECHNICAL>
  <EDUCATIONAL>
EOF
    foreach (split(/\s*\|\|\s*/, $dc'type)) {
      print <<EOF;
    <LEARNINGRESOURCETYPE>
      $l$_$le
    </LEARNINGRESOURCETYPE>
EOF
    }
    print <<EOF;
  </EDUCATIONAL>
EOF
    if ($dc'rights) {
	print <<EOF;
  <RIGHTS>
    <DESCRIPTION>
      $l$dc'rights$le
    </DESCRIPTION>
  </RIGHTS>
EOF
    }
    if ($dc'source || $dc'relation) {
      print <<EOF;
  <RELATION>
EOF
      foreach (split(/\s*\|\|\s*/, $dc'source)) {
	print <<EOF;
    <KIND>
      <LANGSTRING lang="en">IsBasedOn</LANGSTRING>
    </KIND>
    <RESOURCE>
      <DESCRIPTION>
	$l$_$le
      </DESCRIPTION>
    </RESOURCE>
EOF
      }
      print <<EOF;
  </RELATION>
EOF
    }
    print "</RECORD>\n";
}

# Generate OLIB OLSTF record format.  For input into NewsAgent system.
#
sub WriteOLSTFRecord {
    print "TI $dc'title\n" if ($dc'title);
    $s =~ s/[,\s]*$/, /;
    $s .= $dc'subject if ($dc'subject);
    print "SU $s\n" if ($s);
    print "AB $dc'description\n" if ($dc'description);
    print "AS $dc'creator\n" if ($dc'creator);
#    print "AE $dc'creator'email\n" if ($dc'creator'email && $dc'creator);
#    print "MT $dc'type\n" if ($dc'type);
#    print "DF $dc'format\n" if ($dc'format);
    print "UL $dc'identifier\n" if ($dc'identifier);
    print "LA $dc'language\n" if ($dc'language);
    print "PU $dc'publisher\n" if ($dc'publisher);
    print "OT WWW\n";
    print "*\n";
}

# Generate DC record in RDF
#
sub WriteRDF {
#    return unless $dc'identifier;
    my($url) = $dc'identifier;
    chomp($url);
    $url =~ s/\s//g;
    my($lt) = '<';
    my($gt) = '>';
    if ($quoting_required) {
	$lt = '&lt;';
	$gt = '&gt;';
	print "<pre>\n";
    }
    print <<EOF;
${lt}?xml version="1.0"?${gt}
${lt}!DOCTYPE rdf:RDF SYSTEM "http://purl.org/dc/schemas/dcmes-xml-20000714.dtd"${gt}

${lt}rdf:RDF
EOF
    print <<EOF;
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:dc="http://purl.org/dc/elements/1.1/">
  ${lt}rdf:Description about="$url"${gt}
EOF
    &WriteRDFProperty("dc:title", $dc'title) if ($dc'title);
    &WriteRDFProperty("dc:creator", $dc'creator) if ($dc'creator);
    &WriteRDFProperty("dc:subject", $dc'subject) if ($dc'subject);
    &WriteRDFProperty("dc:description", $dc'description) if ($dc'description);
    &WriteRDFProperty("dc:publisher", $dc'publisher) if ($dc'publisher);
    &WriteRDFProperty("dc:contributor", $dc'contributor) if ($dc'contributor);
    &WriteRDFProperty("dc:date", $dc'date) if ($dc'date);
    &WriteRDFProperty("dc:type", $dc'type) if ($dc'type);
    &WriteRDFProperty("dc:format", $dc'format) if ($dc'format);
    &WriteRDFProperty("dc:source", $dc'source) if ($dc'source);
    &WriteRDFProperty("dc:language", $dc'language) if ($dc'language);
    &WriteRDFProperty("dc:relation", $dc'relation) if ($dc'relation);
    &WriteRDFProperty("dc:coverage", $dc'coverage) if ($dc'coverage);
    &WriteRDFProperty("dc:rights", $dc'rights) if ($dc'rights);
    print <<EOF;
  ${lt}/rdf:Description${gt}
${lt}/rdf:RDF${gt}
EOF
    print "</pre>\n" if $quoting_required;
}

# write one or more RDF properties
sub WriteRDFProperty {
    my ($element, $value) = @_;
    my($lt) = '<';
    my($gt) = '>';
    if ($quoting_required) {
	$lt = '&lt;';
	$gt = '&gt;';
    }
    foreach $v (split(/\s*\|\|\s*/, $value)) {
	print "    ${lt}$element${gt}\n";
	&fprint("      ", $v);
	print "    ${lt}/$element${gt}\n";
    }
}

# Generate DC record in RDF/XML abbreviated syntax
#
sub WriteAbbrevRDF {
#    return unless $dc'identifier;
    my(@url) = split(/\s*\|\|\s*/, $dc'identifier);
    my $url = shift @url;
    my $identifier = join ' || ', @url;
    my @bag;
    my $bag;
    my($lt) = '<';
    my($gt) = '>';
    print <<EOF;
${lt}rdf:RDF
EOF
    print <<EOF;
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:dc="http://purl.org/dc/elements/1.1/">
  ${lt}rdf:Description about="$url"
EOF
#    if ($dc'title) {
#	&fprint("    ", "dc:Title=\"".$dc'title."\"");
#    }
    push(@bag, &WriteAbbrevRDFElement('dc:title', $dc'title));
    push(@bag, &WriteAbbrevRDFElement('dc:creator', $dc'creator));
    push(@bag, &WriteAbbrevRDFElement('dc:subject', $dc'subject));
    push(@bag, &WriteAbbrevRDFElement('dc:description', $dc'description));
    push(@bag, &WriteAbbrevRDFElement('dc:publisher', $dc'publisher));
    push(@bag, &WriteAbbrevRDFElement('dc:contributor', $dc'contributor));
    push(@bag, &WriteAbbrevRDFElement('dc:date', $dc'date));
    push(@bag, &WriteAbbrevRDFElement('dc:type', $dc'type));
    push(@bag, &WriteAbbrevRDFElement('dc:format', $dc'format));
    push(@bag, &WriteAbbrevRDFElement('dc:identifier', $identifier));
    push(@bag, &WriteAbbrevRDFElement('dc:source', $dc'source));
    push(@bag, &WriteAbbrevRDFElement('dc:language', $dc'language));
    push(@bag, &WriteAbbrevRDFElement('dc:relation', $dc'relation));
    push(@bag, &WriteAbbrevRDFElement('dc:coverage', $dc'coverage));
    push(@bag, &WriteAbbrevRDFElement('dc:rights', $dc'rights));
    foreach (@bag) {
	$bag .= $_;
    }
    if ($bag) {
	print "    $gt\n";
	print $bag;
	print "  ${lt}/rdf:Description${gt}\n";
    }
    else {
	print "  /$gt\n";
    }
    print <<EOF;
${lt}/rdf:RDF${gt}
EOF
}

# Write abbreviated RDF element or return RDF for rdf:Bag in the case of
# multiple values
#
sub WriteAbbrevRDFElement {
    my($element, $value) = @_;
    return unless $value;
    my($lt) = '<';
    my($gt) = '>';
    $value =~ s/\s*$//;
    if ($value =~ /\|\|/) {
	my $ret = "    ".$lt.$element.$gt."\n      ".$lt."rdf:Bag\n";
	my $c = 1;
	foreach $v (split(/\s*\|\|\s*/, $value)) {
	    $ret .= "        rdf:_$c=\"$v\"\n";
	    $c++;
	}
	$ret .= "      /".$gt."\n    ".$lt."/".$element.$gt."\n";
	return($ret);
    }
    else {
	&fprint("    ", $element."=\"".$value."\"");
	return('');
    }
}

# Write a plain text formated BIBLINK mail message
#
sub WriteBIBLINK {
    print ("create\n");
    &bfprint("Title", $dc'title) if ($dc'title);
    &bfprint("Creator", $dc'creator) if ($dc'creator);
    &bfprint("Creator.Email", $dc'creator'email) if ($dc'creator'email);
    &bfprint("Subject", $dc'subject) if ($dc'subject);
    &bfprint("Description", $dc'description) if ($dc'description);
    &bfprint("Publisher", $dc'publisher) if ($dc'publisher);
    &bfprint("Cobtributor", $dc'contributor) if ($dc'contributor);
    &bfprint("Date", $dc'date) if ($dc'date);
    &bfprint("Type", $dc'type) if ($dc'type);
    &bfprint("Format", $dc'format) if ($dc'format);
    &bfprint("Identifier", $dc'identifier) if ($dc'identifier);
    &bfprint("Source", $dc'source) if ($dc'source);
    &bfprint("Language", $dc'language) if ($dc'language);
    &bfprint("Relation", $dc'relation) if ($dc'relation);
    &bfprint("Coverage", $dc'coverage) if ($dc'coverage);
}

# Write a Scout Portal Toolkit bulk import/export dump
sub WriteSPT {
    print join("\t", $dc'title, "", $dc'description, $url, $dc'source,
        $dc'relation, $dc'coverage, $dc'rights, $dc'creator, $dc'date,
        $dc'date, $dc'date, $dc'date, $dc'date, "0", "", "", $dc'subject,
        "") . "\n";
}

sub bfprint {
    my ($name, $value) = @_;
    print "set this $name $value\n";
}

# Parse Microsoft file using ldat
sub ParseMS {
    my ($file) = @_;

# Processing "ukolug98.ppt"
# # (unknown, <undef>, rev 12)
#   Title: Metadata for the Web - RDF and the Dublin Core
#   Authress: Andy Powell
#   Organization: UKOLN
#   Application: Microsoft PowerPoint
#   Template: C:\Program Files\Microsoft Office\Templates\Presentations\UKOLN\ukol
# n-standard-slide.pot
#   Created: 30.06.1998, 08:46:25
#   Last saved: 21.07.1998, 11:12:05
# Done.

    my ($tmpfile) = "/tmp/dcdot.pl.$$";

    return unless ($ldat && -x $ldat);

    if (open(TMP, ">$tmpfile")) {
	print TMP $file;
	close(TMP);
    }
    if (open(TMP, "$ldat $tmpfile|")) {
	while (<TMP>) {
	    chomp;
	    if (/^ *Title: /) {
		s/^ *Title: //;
		$meta{'dc.title'} = $_;
	    }
	    if (/^ *Authress: /) {
		s/^ *Authress: //;
		$meta{'dc.creator'} = $_;
	    }
	    if (/^ *Organization: /) {
		s/^ *Organization: //;
		$meta{'dc.publisher'} = $_;
	    }
	    if (/^ *Created: /) {
		s/^ *Created: //;
		s/, .*$//;
		my ($d, $m, $y) = split(/\./);
		$meta{'dc.date'} = "$y-$m-$d";
	    }
	}
	close(TMP);
    }
    unlink($tmpfile);
}

# Parse PDF file - suggested by James A Uren <James.A.Uren@jpl.nasa.gov>
# dunno how reliable this will be!?
sub ParsePDF {
  my ($pdf) = @_;

# Look for stuff like this...
# >>
# endobj
# 307 0 obj
# <<
# /CreationDate (D:19990303143632)
# /Producer (Acrobat Distiller 3.0 for Power Macintosh)
# /Title (NASA's Deep-Space Telecommunications Road Map)
# /Subject (NASA's Deep Space Telecommunications Road Map)
# /Author (C. D. Edwards, Jr., C. T. Stelzried, L. J. Deutsch, L. Swanson)
# /Keywords (deep space telecommunications, road map, Ka-band, 32 GHz, TMOD, wideband\
# , high data-rate communications, planetary distances, rapid infusion of \
# Ka-band and optical communicatrions, cost effective growth, future growt\
# h, faster, cheaper, better)
# /ModDate (D:19990303144118)
# >>
#
# See http://partners.adobe.com/supportservice/devrelations/PDFS/TN/PDFSPEC.PDF
# for details...

  $pdf =~ s/\cM/\n/g;
  $pdf =~ s/\n\n/\n/g;
  $pdf =~ s/\\\n//g;

  foreach (split(/\n/, $pdf)) {
    if (/^\/Author /) {
      s/^\/Author //;
      s/^\s*\(//;
      s/\)\s*$//;
      $meta{'dc.creator'} = $_;
    }
    if (/^\/Title /) {
      s/^\/Title //;
      s/^\s*\(//;
      s/\)\s*$//;
      $meta{'dc.title'} = $_;
    }
    if (/^\/Keywords /) {
      s/^\/Keywords //;
      s/^\s*\(//;
      s/\)\s*$//;
      $meta{'dc.subject'} = $_;
    }
  }
}

sub GuessTypeScheme {
    my ($type) = @_;

    return('DCMIType') if ($type =~ /^Collection$/i
	|| $type =~ /^Dataset$/i
	|| $type =~ /^Event$/i
	|| $type =~ /^Image$/i
	|| $type =~ /^InteractiveResource$/i
	|| $type =~ /^Service$/i
	|| $type =~ /^Software$/i
	|| $type =~ /^Sound$/i
	|| $type =~ /^Text$/i);
}

sub GuessFormatScheme {
    my ($format) = @_;

    return if ($format =~ /\s/);
    return('IMT') if ($format =~ /^text\//i
	|| $format =~ /^application\//i
	|| $format =~ /^sound\//i
	|| $format =~ /^image\//i);
}

sub GuessDateScheme {
    my ($date) = @_;

    return('W3CDTF') if ($date =~ /^\d\d\d\d$/ ||
	$date =~ /^\d\d\d\d-\d\d$/ ||
	$date =~ /^\d\d\d\d-\d\d-\d\d$/);
    return('');
}

sub GuessLanguageScheme {
    my ($language) = @_;

    return('RFC1766') if ($language =~ /^[a-zA-Z][a-zA-Z]$/ ||
	$language =~ /^[a-zA-Z][a-zA-Z]-[a-zA-Z][a-zA-Z]$/);
    return('ISO639-2') if ($language =~ /^[a-zA-Z][a-zA-Z][a-zA-Z]$/);
    return('');
}


# Gubbins for dumping out DC fields in different formats
sub dump_outputs {
    if ($Form{'output_format'} eq 'UNIMARC') {
        &WriteUNIMARCRecord;
    } elsif ($output_format eq 'USMARC') {
        &WriteUSMARCRecord;
    } elsif ($output_format eq 'SOIF') {
        &WriteHarvestSOIFRecord;
    } elsif ($output_format eq 'TEI') {
        &WriteTEIHeader;
    } elsif ($output_format eq 'IAFA') {
        &WriteIAFADocument;
    } elsif ($output_format eq 'GILS') {
        &WriteGILSRecord;
    } elsif ($output_format eq 'XML') {
        &WriteXML;
    } elsif ($output_format eq 'OLSTF') {
        &WriteOLSTFRecord;
    } elsif ($output_format eq 'RDF') {
        &WriteRDF;
    } elsif ($output_format eq 'AbbreviatedRDF') {
        &WriteAbbrevRDF;
    } elsif ($output_format eq 'BIBLINK') {
        &WriteBIBLINK;
    } elsif ($output_format eq 'IMS') {
        &WriteIMS;
    } elsif ($output_format eq 'TEXT') {
        &WriteBIBLINK;
    } elsif ($output_format eq 'SPT') {
	&WriteSPT;
    } else {
	&WriteBIBLINK;
    }
}


=head1 NAME

B<dcbot.pl> - Robot to harvest Dublin Core metadata

=head1 SYNOPSIS

  dcbot.pl [-d] [-f format] [-g] [-l ldat-path] [-p] [-u URL]
           [-w WHOIS-command]

=head1 DESCRIPTION

B<dcbot.pl> - is a Perl program to generate Dublin Core elements in
various formats from the metadata embedded in a nominated HTML page.

It was derived from the original UKOLN DC-dot program, which is
intended to run as a CGI script under harness to a Web server.  By
removing the Web functionality the resulting code can be used
trivially in a batch scripting environment, e.g. to harvest large
volumes of Dublin Core metadata for later indexing.

Note that the Web functionality of DC-dot also includes a Dublin Core
editor component, which for obvious reasons is not present in this
program!

=head1 OPTIONS

=over 4

=item B<-d>

Indicates whether debugging should be enabled - it's switched off by
default.

=item B<-f> I<format>

Indicates the output format which should be used for the results
generated by B<dcbot.pl>.  The available choices are listed below.

=item B<-g>

Attempt to "guess" the Publisher of the resource using WHOIS to look
up their domain contact details.

=item B<-l> I<path-to-ldat>

The path to the b<ldat> program, used to extract metadata from
Microsoft Office documents.  This defaults to "ldat".

=item B<-p>

Whether to attempt to use WHOIS to look up the domain registration
information for the site in question, if it is not provided in the
embedded Dublin Core META tags.

=item B<-u> I<URL>

The URL of the HTML page which is to be fetched and parsed for
embedded Dublin Core metadata

=item B<-w> I<WHOIS-command>

The path to the B<whois> command, used if the B<-g> option is given.
This defaults to "whois -h whois.arin.net".

=back

=head1 OUTPUT FORMATS

=over 4

=item I<UNIMARC>

UNIMARC format - not fully operational yet, so use with caution!

=item I<USMARC>

US MARC

=item I<SOIF>

The Harvest system's Summary Object Interchange Format

=item I<TEI>

Using the Text Encoding Initiative's TEI SGML DTD

=item I<IAFA>

IAFA templates suitable for loading into a ROADS server

=item I<GILS>

GILS GRS-1 records suitable for loading into a Z39.50 server

=item I<XML>

Using the Dublin Core basic XML schema

=item I<OLSTF>

OLIB OLSTF format for importing into the NewsAgent system

=item I<RDF>

Using the Dublin Core basic RDF schema

=item I<AbbreviatedRDF>

Abbreviated RDF

=item I<BIBLINK>

Write a plain text formatted version of the record suitable for
importing into the BIBLINK system.

=item I<IMS>

Using the IMS project's XML DTD

=item I<TEXT>

Synonym for BIBLINK

=item I<SPT>

Use the Scout Portal Toolkit bulk import/export format

=back

=head1 EXAMPLE

  $ dcbot.pl -f SOIF -u http://www.lboro.ac.uk/ 
  @FILE { http://www.lboro.ac.uk/
  Description{75}:        Loughborough University offers degree
    programmes and world class research. 
  Last-Modification-Time{10}:     1032908400
  Time-to-Live{7}:        2419200
  Refresh-Rate{6}:        604800
  Gatherer-Name{6}:       DC-dot
  Type{23}:       text/html || 8415 bytes
  File-Size{4}:   8415
  MD5{32}:        aef5f97ecfbd0a2018d105761e09cc52
  Keywords{106}:  University; England; postgraduate; undergraduate;
    degree programme; course; research; teaching; prospectus
  Title{23}:      Loughborough University
  Type{4}:        Text
  }

=head1 BUGS

The B<dcbot.pl> program has an external dependency on the Harvest
system's Summary Object Interchange Format (SOIF) parser code.

It is assumed that Harvest has been installed in
I</usr/local/harvest>.  If this is not the case, the environmental
variable B<HARVEST_HOME> should be set to the Harvest top level
directory, or the B<dcbot.pl> code edited to reflect its location.

=head1 COPYRIGHT

The original DC-dot program is Copyright (C) 1997 UKOLN, University of
Bath, UK, and was written by UKOLN's Andy Powell.  This derivative was
created by Martin Hamilton in 2002.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA

This work was partially supported by grants from the UK Electronic
Libraries Programme (eLib), the European Commission Telematics for
Research Programme, and the joint JISC/NSF Digital Libraries Programme
(IMesh Toolkit project).

UKOLN is funded by Resource: The Council for Museums, Archives &
Libraries (the organisation succeeding the Library and Information
Commission), the Joint Information Systems Committee (JISC) of the
Higher and Further Education Funding Councils, as well as by project
funding from the JISC and the European Union. UKOLN also receives
support from the University of Bath where it is based.

