#!/usr/bin/perl


## ACKNOWLEDGMENT: Based on code by Pete Cliff and Andy Powell of UKOLN 
## Modified by M.Duke of UKOLN to process oai records 
## and return values for RSS channel items

## Gets the documents from a list of URLs 
## The document URL should return a version of the record that
## conforms to the OAI dc schema  
## The item values about, title and description are returned
## Any errors in the URL or XML are reported
## deals also with version 1 oai (because of RDN)

sub  getItemValuesFromOai {
  use LWP::Simple;
  use XML::Simple;

  %errors; # key: URL giving error; value: error message
  @success; # each index consists of an array of item about, title, descripiton values
    #open (ERRS, ">/opt/rdn/imeshdev/tmp/errorlog");  for debugging or logging

  my ($oaiver, $URLs) = @_;
  if ($oaiver == 1) {
    $qnameoai = ""; # version 1 of OAI does not use namespace prefixes 
    $qnamedc = "";
  } elsif ($oaiver == 2) {
    $qnameoai = "oai_dc:";
    $qnamedc = "dc:";
  } else {
     # unrecognized OAI version - must be 1 or 2!
  }

  $dckey = $qnameoai."dc";
  $idkey = $qnamedc."identifier";
  $desckey = $qnamedc."description";
  $tikey = $qnamedc."title";

  #print ERRS "oaiver $oaiver DC prefix $qnamedc OAIprefix $qnameoai DC key $dckey ID key $idkey Description key $desckey Title key $tikey \n";

  foreach $url (@{$URLs}) {
    my ($title, $description, $about);
    ## get the web page content
    # print ERRS "Got $url \n";
    my $content = get($url);  ## if this fails, store url and error message in errors hash
    #my $content = eval{ get($url) };  ## if this fails, store url and error message in errors hash
    #if ($@) { 
    #  $errors{$url} = $@; 
     # next
    #} 
    unless (defined $content) {
      $errors{$url} = "Could not get this URL" unless ($url eq ""); 
      next
    }
   
    ### find the values by processing the XML 
    my $xmlp = eval { XMLin($content, noattr=>1) };  ## if this fails, store url and error message in errors hash
    #print ERRS "$content";
    if ($@) { 
      $errors{$url} = $@; 
      next 
    }
    # check for double values of id, select the one which has URL
    if ( ref($xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$idkey}) eq 'ARRAY' ) { # >1  identifiers
        @about = grep /http:\/\//,@{ $xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$idkey} };
        $about = trim($about[0]);
    } elsif ( ref($xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$idkey}) eq '') {   # single identifier
        $about = trim($xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$idkey});
    } else {
       # error!
    }
    # check for double values of description, concatenate
    if ( ref($xmlp->{GetRecord}->{record}->{'metadata'}->{$dckey}->{$desckey}) eq 'ARRAY' ) { # >1  description
      foreach $desc ( @{ $xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$desckey} } ) {
        $description .= $desc;
      }
    } elsif ( ref($xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$desckey}) eq '') {
        $description = $xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$desckey};
    } else {
       # error!
    }
    $title = xmlEncodeURL(trim($xmlp->{GetRecord}->{'record'}->{'metadata'}->{$dckey}->{$tikey}));
    #print ERRS "ABOUT $about \n TITLE: $title \n DESC: $description\n";
    my @item = ($about, $title, $description);
    push @success, \@item;
  }
  return (\@success, \%errors);
}

sub generate_RSS {
    use XML::RSS;
   
    # items are the channel items, others are the rest of the channel parameters e.g. title
    my ($items, $others)  = @_;
    
    # create instance of XML::RSS

    my $rss = new XML::RSS (version => '1.0');

   # set up encoding hash fir url encode subroutine
   for (0 .. 255) {
    $char2ref{chr($_)} = "&#".$_.";";
    $ref2char{$_} = chr($_);
  }

    # channel
    my $now = scalar localtime;
    $rss->channel(
                  'title'       => $others->{ctitle},
                  'link'        => &url_encode($others->{clink}),
                  'description' => &html_encode(&html_decode($others->{cdesc})),
                  'language'    => $others->{clang},
                  'copyright'    => $others->{ccopyright},
                  'webMaster'    => $others->{cwebmaster},
                  'lastBuildDate' => $now,
                  );

    # image
    if ($others->{iurl}) {
        $rss->image(
                    'title' => &html_encode(&html_decode($others->{ititle})),
                    'url'   => &url_encode($others->{iurl}),
                    'link'  => &url_encode($others->{ilink})
                    );
    }

    # textinput
    if ($others->{tilink}) {
        $rss->textinput(
                        'title'       => &html_encode(&html_decode($others->{tititle})),
                        'description' => &html_encode(&html_decode($others->{tidesc})),
                        'name'        => &html_encode(&html_decode($others->{tiname})),
                        'link'        => &url_encode($others->{tilink})
                        );
    }

    # items
    foreach $it (@{$items}) {   # $it is a ref to an array of about, title, description
        $rss->add_item(
                       'title' => &html_encode(&html_decode($it->[1])),
                       #'link'  => $it->[0],
                       'link'  => &urlUnencode(&url_encode($it->[0])),
                       'description'  => &html_encode(&html_decode($it->[2]))
                       );
    }

  # close ERRS;
  return $rss->as_string;
}

##################### secondary subroutines ######################

sub urlUnencode {
  my ($in) = @_;
  
  $in =~ s/%(..)/chr(hex($1))/egi;
  $in =~ s/\+*$//;
  return $in;
  #return $in."\n";
}

sub xmlEncodeURL {
  my ($url) = @_;
  
  $url =~ s/\&/\&amp;/g;
  $url =~ s/"/\&quot;/g;
  $url =~ s/</\&lt;/g;

  return $url;
}

sub trim {
  # removing preceeding and tailing spaces
  ($_) = @_;
  s/^\s*//;
  s/\s*$//;
  return $_;
}

sub url_encode {

  my $val = shift;
  $val =~ s/"/%25/g;
  $val =~ s/\(/%28/g;
  $val =~ s/\)/%29/g;
  $val =~ s/ /+/g;

  $val =~ s/([^\n\t !\#\$%\'-;=?-~])/$char2ref{$1}/g;
  return $val;
  # return "<![CDATA[".$val."]]>";

}

sub html_decode {

  my $val = shift;

  $val =~ s/\&\#(\d*)\;/$ref2char{$1}/g;
  return $val;

}

sub html_encode {

  my $val = shift;

  # Encode control chars, high bit chars and '<', '&', '>', '"'
  $val =~ s/([^\n\t !\#\$%\'-;=?-~])/$char2ref{$1}/g;
  return $val;
  # return "<!CDATA[".$val."]]>\n";

}

sub trim {
  ($_) = @_;
  s/^\s*//;
  s/\s*$//;
  return $_;
}

1;

