#!/usr/bin/perl

# $Id: spt2cip.pl,v 1.2 2002/09/30 12:04:54 martin Exp $

use strict;
use warnings;

# Timestamp (seconds since the epoch) reported on the "thisupdate" line.
my $now = time;

# The first two command-line arguments are shifted off but never used
# below.  They are still consumed so that the arguments <> goes on to
# treat as input files are the same ones as before.
my $base = shift;
my $dsi  = shift;

# Record number, starting at 1.  Every token is stored as "record/word".
my $counter = 1;

# attribute name => { "record/word" => 1 }
my %centroid;

# Zero-based column of each tab-separated field that gets tokenised,
# paired with the attribute its words are filed under.  AlternateTitle
# is deliberately merged into Title.  Column 3 (Url) and columns 8
# onwards are read but not indexed.
my @indexed_fields = (
    [ 0, 'Title' ],          # Title
    [ 1, 'Title' ],          # AlternateTitle
    [ 2, 'Description' ],    # Description
    [ 4, 'Source' ],         # Source
    [ 5, 'Relation' ],       # Relation
    [ 6, 'Coverage' ],       # Coverage
    [ 7, 'Rights' ],         # Rights
);

while (my $line = <>) {
    # chomp, not chop: a final record without a trailing newline must
    # not lose the last character of its last field.
    chomp $line;

    my @fields = split /\t/, $line;

    for my $spec (@indexed_fields) {
        my ($column, $attr) = @{$spec};
        my $text = $fields[$column];

        # Short records simply have no value for the later columns.
        next unless defined $text;

        for my $word (split /\W/, $text) {
            # Adjacent non-word characters produce empty fields.
            next unless length $word;
            $centroid{$attr}{"$counter/$word"} = 1;
        }
    }

    $counter++;

    # XXX fields not yet mapped to an attribute:
    #   dcCreator: $EmailAddress
    #   dcDate: $DateIssued
    #   dc$ControlledNameTypeName: $ControlledName
    #   dcSubject: $ClassificationName
}

# Sorted once so the schema and the index list the attributes in the
# same, reproducible order.
my @attrs = sort keys %centroid;

print <<EOF;
version: x-tagged-index-1
updatetype: total
thisupdate: $now
BEGIN IO-schema
EOF

for my $attr (@attrs) {
    print "$attr: FULL\r\n";
}

print <<EOF;
END IO-Schema\r
BEGIN Index-Info
EOF

for my $attr (@attrs) {
    # Order tokens by record number, then by word.  Comparing the whole
    # "record/word" key numerically would only look at the record
    # number and leave ties in hash order.
    my @values = map  { $_->[2] }
                 sort { $a->[0] <=> $b->[0] or $a->[1] cmp $b->[1] }
                 map  { [ split(m{/}, $_, 2), $_ ] }
                 keys %{ $centroid{$attr} };

    # The first token follows the attribute name; every later token is
    # written on its own line with a leading "-".
    print "$attr: ", join("\r\n-", @values), "\r\n";
}

print "END Index-Info\r\n";


=head1 NAME

B<spt2cip.pl> - Munge an SPT export into a CIP Tagged Index Object

=head1 SYNOPSIS

  spt2cip.pl <spt.txt >cip.txt

=head1 DESCRIPTION

The B<spt2cip.pl> program takes a bulk export produced by the Scout
Portal Toolkit (SPT) software, and uses it to generate an RFC 2654
style Tagged Index Object, for use with the Common Indexing Protocol
(see RFCs 2651 and 2652).  TIOs are one of the forward knowledge
summary mechanisms supported by CIP, and are supported by default in
the ROADS software's cross-searching subsystem.

Note that the summaries produced by this program are based on mapping
the SPT attributes to unqualified Dublin Core elements - "Description",
"Title" and so on.

=head1 EXAMPLE

  $ cat foo.spt
  The Shockwave Rider             Nickie Haflinger, the only
  person to escape from Tarnover - where they raise hyper-
  intelligent children to maintain the  political dominance of
  the USA in the 21st century - is on the run, dodging from
  loophole to crevice to crack in the computerised data-net that
  binds the continent like chains.  After years of flight and
  constant changes of identity, at the strange small town called
  Precipice he discovers he is not alone in his quest.  But can
  his new allies save him when he falls again into the sinister
  grasp of Tarnover ?
  http://www.mammothmusic.com/~wolf/literature/brunner_john.html
         1975     1975    1975    1975    0

(SPT records use the tab character as a delimiter - but the above has
been word wrapped for readability)

  $ spt2cip.pl <foo.spt
  version: x-tagged-index-1
  updatetype: total
  thisupdate: 1033386548
  BEGIN IO-schema
  Description: FULL
  Title: FULL
  END IO-Schema
  BEGIN Index-Info
  Description: 1/identity
  -1/crevice
  -1/flight
  -1/into
  -1/constant
  -1/alone
  ...
  ...
  ...
  -1/not
  -1/small
  -1/discovers
  Title: 1/Rider
  -1/Shockwave
  -1/The
  END Index-Info

=head1 SEE ALSO

L<spt2centroid.pl>

=head1 COPYRIGHT

Copyright (c) 2002, Martin Hamilton E<lt>imeshtk-utils@martinh.netE<gt>
All rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

It was developed at the Department of Computer Science at Loughborough
University, as part of the joint JISC/NSF IMesh Toolkit project.

=head1 AUTHOR

Martin Hamilton E<lt>imeshtk-utils@martinh.netE<gt>

