#!/usr/bin/perl
#
# transcode_email.pl
#
# Transcode e-mail from RFC822 to XML (see email.dtd).
# Requires MailTools package from CPAN.
#
# Created: August 3, 2000 <eekim@eekim.com>

use strict;
use Mail::Address;
use Mail::Header;
use Mail::Internet;
use Mail::Util qw(read_mbox);

# some initial error checking
if ($#ARGV < 0) {
  print qq{Usage:
    $0 mboxfilename

where mboxfilename is an RFC822-compliant mailbox.
};
  exit 1;
}
if (!(-e $ARGV[0])) {
  print "Error: File $ARGV[0] does not exist.\n";
  exit 1;
}

# declare some variables
my ($mail, $head); # Mail objects
my (@messages, $message, @body); # local variables

# parse mailbox file and print associated XML
print q{<?XML version="1.0"?>
<!DOCTYPE email SYSTEM "email.dtd">
};
@messages = read_mbox($ARGV[0]);
foreach $message (@messages) {
  print "<message>\n";
  $mail = new Mail::Internet $message;
  # clean up message body
  $mail->tidy_body;
  $mail->unescape_from;
  # parse and print headers
  $head = $mail->head;
  &transcode_headers($head->header_hashref);
  # parse and print body
  print "<body>\n";
  @body = &transcode_body(@{$mail->body});
  print join("", @body);
  print "</body>\n";
  # parse and print MIME attachments
  ### [parse and print MIME attachments here]
  print "</message>\n";
}

# fini
exit 0;

### functions

# transcode_headers
# pre:
#   $header is a hash reference returned by the Mail::Header
#   object.
# post:
#   Prints transcoded headers as XML.

sub transcode_headers {
  my ($header) = @_;
  my ($subject);

  print "<headers>\n";
  &print_addresses('from', @{$header->{From}});
  &print_addresses('to', @{$header->{To}});
  &print_addresses('cc', @{$header->{Cc}});
  ### [do date parsing here]
  $subject = join("", @{$header->{Subject}});
  $subject =~ s/\n//g;
  $subject =~ s/\r//g;
  print "  <subject>$subject</subject>\n";
  ### [include other headers here]
  print "</headers>\n";
}

# print_addresses
# pre:
#   Takes the mail header type (i.e. from, to, cc) and a list of
#   address strings as parameters.
# post:
#   Parses raw strings into name and email components, and prints
#   XML to stdout.

sub print_addresses {
  my ($header, @addresses) = @_;
  my ($address, @parsed_addresses, $parsed_address);
  my ($name, $email);

  foreach $address (@addresses) {
    @parsed_addresses = Mail::Address->parse($address);
    foreach $parsed_address (@parsed_addresses) {
      print "  <$header>\n";
      $name = $parsed_address->name;
      $email = $parsed_address->address;
      if (defined $name) {
        print "    <name>$name</name>\n";
      }
      if (defined $email) {
        print "    <email>$email</email>\n";
      }
      print "  </$header>\n";
    }
  }
}

# transcode_body
# pre:
#   Takes the body of an e-mail message as a parameter.  The body
#   should be represented as an array of lines.
# post:
#   Returns transcoded body as an array of lines.

sub transcode_body {
  my (@body) = @_;
  my ($line, $prevline, @newbody);
  my $sid = 0;
  my $S_newparagraph = 0;

  foreach $line (@body) {
    # replace reserved XML characters w/ entities
    $line =~ s/\&/\&amp\;/g;
    $line =~ s/</\&lt\;/g;
    $line =~ s/>/\&gt\;/g;
    # transcode paragraphs
    if ($line eq "\n") {
      if (!$S_newparagraph) {
        $prevline = pop @newbody;
        $prevline =~ s/$/<\/p>/;
        push @newbody, $prevline;
        $sid++;
      }
      $S_newparagraph = 1;
    }
    elsif ($S_newparagraph) {
      $line =~ s/^/<p sid="$sid">/;
      $S_newparagraph = 0;
    }
    push @newbody, $line;
  }
  # add paragraph tags at the beginning and end of the body
  $newbody[0] =~ s/^/<p sid="0">/;
  $newbody[$#newbody] =~ s/$/<\/p>/;
  # return transcoded body
  return @newbody;
}

