#!/usr/local/bin/perl #------------------------------------------- # # This demonstrator program shows how the body of a message may be # canonicalized by decoding any 'quoted-printable' or 'base64' portions # before hashing (irrespective of whether they were present when the # message was submitted or whether they were encoded en route because some # non-8BITMIME or non-BINARYMIME MTA was encountered). # # This canonicalization is either an addition to the presently proposed # 'relaxed' canonicalization for bodies (the ignoring of trailing # whitespace and of trailing blank lines has been omitted from this # demonstration for simplicity's sake), or else it could form the basis of # some new '8bit-safe' canonicalization. # # The presence of the MIME media types 'multipart' and 'message' can bring # about a tree of potentially unbounded depth, with the possibility of # encountering encoded objects at any node. Hence a recursive descent of # this tree is necessary. # # Little attempt has been made to check for mal-formed MIME structures, # since they are going to fail elsewhere in the mail system anyway. # However, a few cases which would cause the canonicalization to fail will # give rise to fatal errors, and there are a couple of warnings for some # odd cases that are technically illegal, but will not cause signatures to # fail. # # Care has been taken to ensure that naked LFs do not occur outside of # regions where the CTE is 'binary'. However, it is too much trouble to # test for naked CRs; if present (whether in binary regions or not) they # will pass through and get hashed like any other character. # #------------------------------------------- use English; use MIME::Head; use MIME::QuotedPrint; use MIME::Base64; use Getopt::Long; my $LONGLINE = ' ' x 1000; $_ = $LONGLINE; # provide a buffer long enough for most lines; # if it needs more, it will malloc it GetOptions ( "unix|u" => \$unix, "test|t" => \$test, "noctedrop" => \$noctedrop, "head" => \$keep_heads, "quiet|q" => \$quiet, "help|h" => \&usage ) or &usage; sub usage { print STDOUT "usage: uncode.pl [options] [(infile|-) [outfile]] -h --help print this message -u --unix ensure lines end with CRLF, except within CTE binary; for testing, or for MUAs running on Unix -t --test no decoding; output should be identical to input, modulo added CRs --noctedrop do not replace Content-Transfer-Encodings; for testing only --head keep top-level headers -q --quiet no warnings; recommended when verifying\n"; exit; } #------------------------------------------- # # Set up $in and $out handlers; by default, STDIN and STDOUT # (this program is designed to operate in a pipeline). # my ($in, $out); if (defined $ARGV[0]) {open($in, "<$ARGV[0]")} else {$in = STDIN} if (defined $ARGV[1]) {open($out, ">$ARGV[1]")} else {$out = STDOUT} #------------------------------------------- # # whine () for non-fatal problems # sub whine {warn(@_) unless $quiet}; #------------------------------------------- # # print_line ($line, $encoding) # # on $out, as per $encoding; the $line parameter is a Ref to the # actual line, to avoid copying # sub print_line { my ($line,$encoding) = @_; local $buf = $LONGLINE; if ($encoding eq 'quoted-printable') { ### the following messiness corects the misfeature of ### 'decode_qp' in Unix versions of Perl, which generates ### lines ending in a single LF; this necessitates ### detecting incoming lines ending in '=' $buf = decode_qp($$line); if (substr($$line,-3) !~ m/=\r?\n$|^$/o) {substr($buf,-1,1) = "\r\n"} print $out $buf; } elsif ($encoding eq 'base64') { $buf = decode_base64($$line); print $out $buf; } else { print $out $$line; } } #------------------------------------------- # # read_chunk ($bound, $encoding) # # reads, and trascribes using the specified $encoding, up to the # given MIME boundary, or up to EOF # sub read_chunk { my ($bound,$encoding) = @_; my $last = ""; my $eos_type = ""; # DELIM, CLOSE or EOF while (<$in>) { # will read up to next '\n' # which might be a long way away in # genuine binary attachments, but should # otherwise fit in 1000 octets if ($encoding ne 'binary' && substr($_,-2,2) ne "\r\n") { ### if we find a line ending in naked LF when not in ### CTE 'binary' then in Unix mode we fix it, ### otherwise it is a fatal error if ($unix) {substr($_,-1,1) = "\r\n"} else {die "naked LF in non-binary region\n"} } if (substr($_,0,2) eq '--' && $_ =~ s/^(--$bound(--)?[ \t]*)\r?\n/\1\r\n/) { ### it was the expected boundary, and we note whether ### it is a CLOSEing boundary $eos_type = ($2 ? 'CLOSE':'DELIM'); ### we always print one line in arrear, since we do not ### want to print the boundary, which is never encoded print_line(\$last, $encoding); $last = $_; return ($eos_type, $last); ### we return the boundary for printing later } else { print_line(\$last, $encoding); $last = $_; } } $eos_type = 'EOF'; print_line(\$last, $encoding); return ($eos_type, ''); } #------------------------------------------- # # A map from encoding to integer for checking that a larger encoding # is not contained within a smaller one. # '' is a dummy encoding representing the initial state. # my %encodings = (''=>0, 'binary'=>1, '8bit'=>2, '7bit'=>3, 'quoted-printable'=>3, 'base64'=>3); #------------------------------------------- # # process_header ($deftype, $old_encoding) # # Reads headers, both at the top level and at the start of multipart # and message types. It reurns the Content-Type (defaulting to $deftype) # and Content-Transfer-Encoding, $old-encoding will be '' at the # top level. # sub process_header { my ($deftype, $old_encoding) = @_; my $encoding; my $head = new MIME::Head; $head->read($in); my $mime_type = $head->mime_type($deftype); $encoding = $head->mime_encoding; my $bound = $head->multipart_boundary; $head->replace('content-transfer-encoding', "binary\r") unless ($test or $noctedrop); ### we don't know the original CTE, but anyway that CTE is no longer ### of interest; so we change it to 'binary' to make sure the ### 'naked LF' test is not triggered during verification my $newstring = $head->as_string; if ($unix) {$newstring =~ s/\r?\n/\r\n/og} if ($old_encoding ne '' || $test || $keep_heads) ### i.e. not a top-level header, which should not be output ### because they are canonicalized separately { print $out $newstring, "\r\n" } return ($mime_type, $encoding, $bound); } #------------------------------------------- # # process_part ($oldbound, $deftype, $old_encoding) # # process_part is the heart of the whole system. A 'part' is a part of # a multipart, or a message type, or even the whole message. This # routine walks through the tree of parts, calling itself recursively # as needed. # sub process_part { my ($oldbound, $deftype, $old_encoding) = @_; my ($mime_type, $encoding, $bound) = process_header($deftype, $old_encoding); if ($encodings{$encoding} < $encodings{$old_encoding}) ### you can't have 'binary' inside an '8bit' object, or ### '8bit' inside a '7bit' one {whine "illegal $encoding within $old_encoding\n"} my ($type, $subtype) = split('/', $mime_type); my $eos_type; if ($type eq 'multipart') { ### multipart # determine the default type or the parts my $retype = $subtype eq 'digest'?'message/rfc822':'text/plain'; my $more_parts = 1; ### parse preamble # the preamble is not supposed to be displayed, but it # still get hashed; it MUST have CTE '7bit', because # there is no way to encode it ($eos_type, $last) = read_chunk($bound, "7bit"); die "boundary $bound not found\n" if ($eos_type eq 'EOF'); # an expected CLOSEing boundary was never found; # a fatal error if ($eos_type eq 'CLOSE') {whine "zero parts in multipart\n"; $more_parts=0}; # print the boundary; boundaries MUST be ASCII because # there is no way to encode them print_line(\$last, "7bit"); ### process parts while ($more_parts) { # the recursive call for each part within this # multipart ($eos_type, $last) = process_part($bound, $retype, $encoding); die "boundary $bound not found\n" if ($eos_type eq 'EOF'); $more_parts=0 if ($eos_type eq 'CLOSE'); # print the boundary of this part; we have # already noted whether it is the CLOSEing # boundary of this multipart print_line(\$last, "7bit"); } ### process epilogue # the epilogue also is not for display, but still gets # hashed; we are now looking for the next boundary # ($oldbound) withing our parent ($eos_type, $last) = read_chunk($oldbound, "7bit"); } elsif ($type eq 'message') { ### message ### all message types have the same structure, though only ### some of them, such as message/rfc822 and message/partial ### possess bodies which might need encoding; ### so, having already processed the header for the current ### part, we now encounter the header of the message proper ### - another recursive call; the mesage will be terminated ### by the boundary of the parent, hence the $oldbound ### parameter ($eos_type, $last) = process_part($oldbound, 'text.plain', $encoding); } else { ### simple part ### this is a tip of the part tree, and the only place where ### 'quoted-printable' or 'base64' can occur or may need to ### be decoded (which we don't do if $test is set); ### if no decoding is needed, read_chunk still needs to ### distinguish between '7/8bit' and 'binary' if ($test && $encoding ne 'binary' && ($encoding eq 'quoted-printable' || $encoding eq 'base64')) { ($eos_type, $last) = read_chunk($oldbound, '7bit'); } else { ($eos_type, $last) = read_chunk($oldbound, $encoding); } } return ($eos_type, $last); } #------------------------------------------- # # main program # # The three parameters represent: # no boundary for any parent # no default type (actually, it is text/plain, but process_header # will tell us that # dummy CTE representing initial state above top level # process_part('', '', '');