#!/usr/local/bin/perl

#-------------------------------------------
#
# This demonstrator program shows how the body of a message may be
# canonicalized by decoding any 'quoted-printable' or 'base64' portions
# before hashing (irrespective of whether they were present when the
# message was submitted or whether they were encoded en route because some
# non-8BITMIME or non-BINARYMIME MTA was encountered).
#
# This canonicalization is either an addition to the presently proposed
# 'relaxed' canonicalization for bodies (the ignoring of trailing
# whitespace and of trailing blank lines has been omitted from this
# demonstration for simplicity's sake), or else it could form the basis of
# some new '8bit-safe' canonicalization.
#
# The presence of the MIME media types 'multipart' and 'message' can bring
# about a tree of potentially unbounded depth, with the possibility of
# encountering encoded objects at any node. Hence a recursive descent of
# this tree is necessary.
# 
# Little attempt has been made to check for mal-formed MIME structures,
# since they are going to fail elsewhere in the mail system anyway.
# However, a few cases which would cause the canonicalization to fail will
# give rise to fatal errors, and there are a couple of warnings for some
# odd cases that are technically illegal, but will not cause signatures to
# fail.
# 
# Care has been taken to ensure that naked LFs do not occur outside of
# regions where the CTE is 'binary'. However, it is too much trouble to
# test for naked CRs; if present (whether in binary regions or not) they
# will pass through and get hashed like any other character.
# 
#-------------------------------------------

use English;
use MIME::Head;
use MIME::QuotedPrint;
use MIME::Base64;
use Getopt::Long;

my $LONGLINE = ' ' x 1000;
$_ = $LONGLINE;	# provide a buffer long enough for most lines;
		# if it needs more, it will malloc it

GetOptions (
	"unix|u" => \$unix,
	"test|t" => \$test,
	"noctedrop" => \$noctedrop,
	"head" => \$keep_heads,
	"quiet|q" => \$quiet,
	"help|h" => \&usage
) or &usage;

sub usage {
	print STDOUT
	"usage: uncode.pl [options] [(infile|-) [outfile]]
	-h --help   print this message
	-u --unix   ensure lines end with CRLF, except within CTE binary;
	            for testing, or for MUAs running on Unix
	-t --test   no decoding; output should be identical to input,
	            modulo added CRs
	--noctedrop do not replace Content-Transfer-Encodings;
	            for testing only
	--head      keep top-level headers
	-q --quiet  no warnings; recommended when verifying\n";
	exit;
}

#-------------------------------------------
#
# Set up $in and $out handlers; by default, STDIN and STDOUT
# (this program is designed to operate in a pipeline).
#
my ($in, $out);
if (defined $ARGV[0]) {open($in, "<$ARGV[0]")}
else {$in = STDIN}
if (defined $ARGV[1]) {open($out, ">$ARGV[1]")}
else {$out = STDOUT}

#-------------------------------------------
#
#   whine ()   for non-fatal problems
#
sub whine {warn(@_) unless $quiet};

#-------------------------------------------
#
#   print_line ($line, $encoding)
#
#   on $out, as per $encoding; the $line parameter is a Ref to the
#   actual line, to avoid copying
#
sub print_line {
	my ($line,$encoding) = @_;
	local $buf = $LONGLINE;
	if ($encoding eq 'quoted-printable') {
	    ### the following messiness corects the misfeature of
	    ### 'decode_qp' in Unix versions of Perl, which generates
	    ### lines ending in a single LF; this necessitates
	    ### detecting incoming lines ending in '='

		$buf = decode_qp($$line);
		if (substr($$line,-3) !~ m/=\r?\n$|^$/o)
			{substr($buf,-1,1) = "\r\n"}
		print $out $buf;
	} elsif ($encoding eq 'base64') {
		$buf = decode_base64($$line);
		print $out $buf;
	} else {
		print $out $$line;
	}
}

#-------------------------------------------
#
#   read_chunk ($bound, $encoding)
#
#   reads, and trascribes using the specified $encoding, up to the
#   given MIME boundary, or up to EOF
#
sub read_chunk {
	my ($bound,$encoding) = @_;

	my $last = "";
	my $eos_type = "";	# DELIM, CLOSE or EOF

	while (<$in>) {		# will read up to next '\n'
				# which might be a long way away in
				# genuine binary attachments, but should
				# otherwise fit in 1000 octets
		if ($encoding ne 'binary' && substr($_,-2,2) ne "\r\n") {
		    ### if we find a line ending in naked LF when not in
		    ### CTE 'binary' then in Unix mode we fix it,
		    ### otherwise it is a fatal error

			if ($unix) {substr($_,-1,1) = "\r\n"}
			else {die "naked LF in non-binary region\n"}
		}
		if (substr($_,0,2) eq '--'
		   && $_ =~ s/^(--$bound(--)?[ \t]*)\r?\n/\1\r\n/) {
		    ### it was the expected boundary, and we note whether
		    ### it is a CLOSEing boundary

			$eos_type = ($2 ? 'CLOSE':'DELIM');

		    ### we always print one line in arrear, since we do not
		    ### want to print the boundary, which is never encoded
			print_line(\$last, $encoding);
			$last = $_;

			return ($eos_type, $last);
		    ### we return the boundary for printing later
		} else {
			print_line(\$last, $encoding);
			$last = $_;
		}
	}
	$eos_type = 'EOF';
	print_line(\$last, $encoding);
	return ($eos_type, '');
}

#-------------------------------------------
#
#   A map from encoding to integer for checking that a larger encoding
#   is not contained within a smaller one.
#   '' is a dummy encoding representing the initial state.
#
my %encodings = (''=>0, 'binary'=>1, '8bit'=>2, '7bit'=>3,
                 'quoted-printable'=>3, 'base64'=>3); 

#-------------------------------------------
#
#   process_header ($deftype, $old_encoding)
#
#   Reads headers, both at the top level and at the start of multipart 
#   and message types. It reurns the Content-Type (defaulting to $deftype)
#   and Content-Transfer-Encoding, $old-encoding will be '' at the
#   top level.
#
sub process_header {
	my ($deftype, $old_encoding) = @_;
	my $encoding;

	my $head = new MIME::Head;
	$head->read($in);
	my $mime_type = $head->mime_type($deftype);
	$encoding = $head->mime_encoding;
	my $bound = $head->multipart_boundary;

	$head->replace('content-transfer-encoding', "binary\r")
		unless ($test or $noctedrop);
    ### we don't know the original CTE, but anyway that CTE is no longer
    ### of interest; so we change it to 'binary' to make sure the
    ### 'naked LF' test is not triggered during verification

	my $newstring = $head->as_string;
	if ($unix) {$newstring =~ s/\r?\n/\r\n/og}
	if ($old_encoding ne '' || $test || $keep_heads)
	    ### i.e. not a top-level header, which should not be output
	    ### because they are canonicalized separately
		{ print $out $newstring, "\r\n" }
	return ($mime_type, $encoding, $bound);
	}

#-------------------------------------------
#
#   process_part ($oldbound, $deftype, $old_encoding)
#
#   process_part is the heart of the whole system. A 'part' is a part of
#   a multipart, or a message type, or even the whole message. This
#   routine walks through the tree of parts, calling itself recursively
#   as needed.
#
sub process_part {
	my ($oldbound, $deftype, $old_encoding) = @_;

	my ($mime_type, $encoding, $bound) =
	    process_header($deftype, $old_encoding);
	if ($encodings{$encoding} < $encodings{$old_encoding})
	    ### you can't have 'binary' inside an '8bit' object, or
	    ### '8bit' inside a '7bit' one
		{whine "illegal $encoding within $old_encoding\n"}

	my ($type, $subtype) = split('/', $mime_type);
	my $eos_type;

	if ($type eq 'multipart') {
	### multipart
		# determine the default type or the parts
		my $retype = $subtype eq 'digest'?'message/rfc822':'text/plain';
		my $more_parts = 1;

	    ### parse preamble
		# the preamble is not supposed to be displayed, but it
		# still get hashed; it MUST have CTE '7bit', because
		#  there is no way to encode it
		($eos_type, $last) = read_chunk($bound, "7bit");
		die "boundary $bound not found\n" if ($eos_type eq 'EOF');
			# an expected CLOSEing boundary was never found;
			# a fatal error
		if ($eos_type eq 'CLOSE')
			{whine "zero parts in multipart\n"; $more_parts=0};
		# print the boundary; boundaries MUST be ASCII because
		# there is no way to encode them
		print_line(\$last, "7bit");

	    ### process parts
		while ($more_parts) {
			# the recursive call for each part within this
			# multipart
			($eos_type, $last) = process_part($bound, $retype, $encoding);
			die "boundary $bound not found\n" if ($eos_type eq 'EOF');
			$more_parts=0 if ($eos_type eq 'CLOSE');
			# print the boundary of this part; we have
			# already noted whether it is the CLOSEing
			# boundary of this multipart
			print_line(\$last, "7bit");
		}

	    ### process epilogue
		# the epilogue also is not for display, but still gets
		# hashed; we are now looking for the next boundary
		# ($oldbound) withing our parent
		($eos_type, $last) = read_chunk($oldbound, "7bit");

	} elsif ($type eq 'message') {
	### message
	    ### all message types have the same structure, though only
	    ### some of them, such as message/rfc822 and message/partial
	    ### possess bodies which might need encoding;
	    ### so, having already processed the header for the current
	    ### part, we now encounter the header of the message proper
	    ### - another recursive call; the mesage will be terminated
	    ### by the boundary of the parent, hence the $oldbound
	    ### parameter

		($eos_type, $last) = process_part($oldbound, 'text.plain', $encoding);

	} else {
	### simple part
	    ### this is a tip of the part tree, and the only place where
	    ### 'quoted-printable' or 'base64' can occur or may need to
	    ### be decoded (which we don't do if $test is set);
	    ### if no decoding is needed, read_chunk still needs to
	    ### distinguish between '7/8bit' and 'binary'

		if ($test && $encoding ne 'binary' &&
		    ($encoding eq 'quoted-printable' || $encoding eq 'base64')) {
		($eos_type, $last) = read_chunk($oldbound, '7bit');
		} else {
		($eos_type, $last) = read_chunk($oldbound, $encoding);
		}
	}
	return ($eos_type, $last);
}


#-------------------------------------------
#
#   main program
#
#   The three parameters represent:
#	no boundary for any parent
#	no default type (actually, it is text/plain, but process_header
#		will tell us that
#	dummy CTE representing initial state above top level
#
process_part('', '', '');