#!/usr/bin/perl
#
my $revision = '$Id: FileTypes.pm,v 1.3 2006/01/03 13:16:35 bre Exp $';
my $version = 'Anomy 0.0.0 : Anomy::Sanitizer::FProt.pm';
#
##  Copyright (c) 2006 FRISK Software International. All rights reserved.
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU Lesser General Public License as 
##  published by the Free Software Foundation; either version 2.1 of the 
##  License, or (at your option) any later version.
#
##############################################################################
#
# NOTE:  Sanitizer development is for the most part sponsored by
#        FRISK Software International, http://www.f-prot.com/.  Please
#        consider buying their anti-virus products to show your 
#        appreciation.
#
##############################################################################
#
# This module implements a built in scanner which recognizes a few known file
# types based on headers.  It verifies that a file is what it says it is.
#
# Usage:
#
#  use Anomy::Sanitizer::FileTypes qw( check_file_type );
#
#  my ($is_evil, $is_suspicious, $risk_level, @filetypes) = 
#      check_file_type( snippet     => $first_few_bytes_of_data,
#                       file_names  => \@list_of_filenames,
#                       mime_type   => $mime_type,
#                       blacklisted => \@blacklisted_file_types );
#
##[ Package definition ]######################################################

package Anomy::Sanitizer::FileTypes;
use strict;

BEGIN {
    use Exporter ();
    use vars     qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

    $VERSION     = do { my @r = (q$Revision: 1.3 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
    @ISA         = qw(Exporter);
    @EXPORT      = qw( );
    @EXPORT_OK   = qw( &check_file_type &unknown_file_types &known_file_types );
}
use vars @EXPORT_OK;
my $serial = 0;


##[ Recognized file formats ]##################################################

# Risk levels
my ($unknown, $low, $medium, $high) = (0, 1, 2, 3);

# File type definitions
my $EXE = {
    id         => "exe",
    risk       => $high,
    name       => "MSDOS or MS Windows executable",
    extensions => [ "exe", "com", "scr", "pif", "lnk", "bat" ],
    mime_types => [ 'application/octet-stream', 
                    "application/x-ms-dos-executable",
                    "application/x-msdownload" ],
    magic      => [ "MZ" ],
};
my $WMF = {
    id         => "wmf",
    risk       => $medium,
    name       => "Windows MetaFile (WMF)",
    extensions => [ "emf", "wmf" ],
    mime_types => [ 'application/x-msMetafile' ],
    magic      => [ "\xD7\xCD\xC6\x9A", 
                    "\x01\x00\x09\x00", 
                    "\x02\x00\x09\x00" ],
};
my $JPEG = {
    id         => "jpeg",
    risk       => $low,
    name       => "JPEG Image",
    extensions => [ "jpg", "jpe", "jpeg", "jfif", "jfif-tbnl" ],
    mime_types => [ 'image/jpeg', 'image/pjpeg' ],
    magic      => [ "\xFF\xD8" ],
};
my $GIF = {
    id         => "gif",
    risk       => $low,
    name       => "GIF Image",
    extensions => [ "gif" ],
    mime_types => [ 'image/gif' ],
    magic      => [ "GIF8" ],
};
my $PNG = {
    id         => "png",
    risk       => $low,
    name       => "PNG Image",
    extensions => [ "png" ],
    mime_types => [ 'image/png' ],
    magic      => [ "\x89PNG" ],
};
my $TIFF = {
    id         => "tiff",
    risk       => $low,
    name       => "TIFF image data",
    extensions => [ "tiff", "tif" ],
    mime_types => [ 'image/tiff' ],
    magic      => [ "MM\x00\x2A", "II\x2A\x00" ],
};
my $JS = {
    id         => "js",
    risk       => $medium,
    name       => "JavaScript file",
    extensions => [ "js" ],
    mime_types => [ 'application/x-javascript' ],
    magic      => [ ],
};
my $HTML = {
    id         => "html",
    risk       => $low,
    name       => "HTML text file",
    extensions => [ "html", "htm", "shtml" ],
    mime_types => [ 'text/html' ],
    magic      => [ ],
    regexp     => '<html|<body|<p>|<b>|<i>|<br>|</a>',
};
my $TXT = {
    id         => "txt",
    risk       => $low,
    name       => "Plain text file",
    extensions => [ "txt" ],
    mime_types => [ 'text/plain' ],
    magic      => [ ],
};

# Set up useful ways to look up stuff in the above definitions.
my @all_types   = ($EXE, $WMF, $JPEG, $GIF, $PNG, $TIFF, $JS, $HTML, $TXT);
my $file_types  = { };
my $file_ext    = { };
my $file_regexp = { };
my $file_magic  = { };
my $magic_lengths = { };
foreach my $ft (@all_types)
{
    $file_types->{$ft->{id}} = $ft;
    foreach my $magic (@{ $ft->{magic} })
    {
        $file_magic->{$magic} = $ft;
        $magic_lengths->{length($magic)} = 1;
    }
    if (my $regexp = $ft->{regexp})
    {
        $file_regexp->{$regexp} = $ft;
    }
    foreach my $ext (@{ $ft->{extensions} })
    {
        $file_ext->{lc($ext)} = $ft;
    }
}
my @magic_lengths = (sort { $b <=> $a } keys(%$magic_lengths));


##[ Package implementation ]##################################################

# Check a list of filetypes to see if we recognize all of them, return
# undef or the first unrecognized file type.
sub unknown_file_types
{
    my (@list) = @_;
    @list = @{ $list[0] } if (ref($list[0]) =~ /array/i);
    foreach my $item (@list)
    {
        return $item unless (0 < (grep { $_->{id} eq lc($item) } @all_types));
    } 
    return undef;
}

# Return a list of known filetypes.
sub known_file_types
{
    return (map { $_->{id} } @all_types);
}

# Usage:
#
#  use Anomy::Sanitizer::FileTypes qw( check_file_type );
#
#  my ($is_evil, $is_suspicious, $risk_level, @filetypes) = 
#      check_file_type( snippet     => $first_few_bytes_of_data,
#                       file_names  => \@list_of_filenames,
#                       mime_type   => $mime_type,
#                       blacklisted => \@blacklisted_file_types );
#
# Return values:
#
#  The first two returned values are boolean, 0 or 1.
#
#  Risk levels range from 0-3 (unknown, low, medium, high).
#
#  @filetypes is a list of hashrefs describing what we think the file
#  is.  The matches are ordered such that the riskiest match is listed
#  first.
#
sub check_file_type
{
    my (%args) = @_;
    my $snippet     = $args{snippet};
    my $filenames   = $args{file_names} || [ $args{file_name} ];
    my $mimetype    = $args{mime_type};
    my $blacklisted = $args{blacklisted} || [ ];

    my ($matched_filename, $matched_magic, $matches) = 
        get_matches( snippet   => $snippet, 
                     filenames => $filenames );

    my ($is_evil, $is_suspicious, $risk_level) = 
        estimate_risk( matched_filename => $matched_filename, 
                       matched_magic    => $matched_magic,
                       matches          => $matches || { },
                       blacklisted      => $blacklisted );

    return ($is_evil, $is_suspicious, $risk_level,
            sort { ($b->{risk} <=> $a->{risk})
                || ($a->{id}   cmp $b->{id}) } values(%$matches));
}

sub get_matches
{
    my (%args) = @_;
    my $snippet = $args{snippet};
    my $filenames = $args{filenames};

    # This is what we'll be returning.
    my ($matched_filenames, $matched_magic, $matches) = (0, 0, { });

    # Guess filetype based on extension...
    foreach my $fn (@$filenames)
    {
        if ($fn =~ /\.([^\.]+)$/)
        {
            my $ext = $1;
            if (my $match = $file_ext->{lc($ext)})
            {
                print STDERR "Filename match: $match->{id}\n" if ($ENV{DEBUG_FILETYPES});
                $matches->{$match} = $match;
                $matched_filenames++;
            } 
        }
    }
    # Guess filetype based on binary snippet
    foreach my $length (@magic_lengths)
    {
        my $bytes = substr($snippet, 0, $length);
        if (my $match = $file_magic->{$bytes})
        {
            print STDERR "Magic match: $match->{id}\n" if ($ENV{DEBUG_FILETYPES});
            $matches->{$match} = $match;
            $matched_magic++;
        }
    }
    # Guess filetype using regexps, unless magic worked.
    if (!$matched_magic)
    {
        foreach my $re (keys(%$file_regexp))
        {
            next unless ($snippet =~ /$re/i);
            my $match = $file_regexp->{$re};

            print STDERR "Regexp match: $match->{id}\n" if ($ENV{DEBUG_FILETYPES});
            $matches->{$match} = $match;
            $matched_magic++;
        } 
    }
    
    # Some types have no magic, and all files match the "undefined" magic.
    if (!$matched_magic)
    {
        foreach my $m (values(%$matches))
        {
            if ((@{ $m->{magic} } < 1) && (!$m->{regexp}))
            {
                print STDERR "Undefined magic match.\n" if ($ENV{DEBUG_FILETYPES});
                $matched_magic++;
            }
        }
    }

    return ($matched_filenames, $matched_magic, $matches);
}
    
sub estimate_risk
{
    my (%args) = @_;
    my $matched_filename = $args{matched_filename};
    my $matched_magic    = $args{matched_magic};
    my $matches          = $args{matches};
    my %blacklisted = ( map { lc($_) => 1 } @{ $args{blacklisted} } );
    my %risks = ( map { $matches->{$_}->{risk} => 1 } keys(%$matches) );

    # This is what we'll return.
    my ($is_evil, $is_suspicious, $risk_level) = (0, 0, $unknown);

    # Do we have more than one possible match?  That's odd!
    if (keys(%$matches) > 1)
    {
        # If we also have multiple risk levels, that probably means
        # someone is trying to masquerade a high-risk file as a low-risk
        # one, which is very evil.  If they are all "low" risk, then we
        # just assume the sending mailer is dumb (sends GIFs as JPGs).
        $is_evil++ unless ($risks{$low} && (keys(%risks) == 1));
        print STDERR "Multiple matches: is_evil = $is_evil\n" if ($ENV{DEBUG_FILETYPES});
    }

    # Is this filetype blacklisted?
    foreach my $m (values(%$matches))
    {
        $is_evil++ if ($blacklisted{$m->{id}});
    }
   
    # Does this claim to be a low-risk file, without matching our magic?
    $is_suspicious++ if ($risks{$low} && (!$matched_magic));

    # Report highest risk level
    $risk_level = $low    if ($risks{$low});
    $risk_level = $medium if ($risks{$medium});
    $risk_level = $high   if ($risks{$high});

    return ($is_evil, $is_suspicious, $risk_level);
}


##[ Testing ]##################################################################

sub Test
{
    my @tests = (
        {
            name    => "Plain text file",
            args    => { snippet     => "Plain text file",
                         file_names  => [ "plain.txt" ] },
            results => "0,0,$low,txt",
        },
        {
            name    => "Incorrectly named HTML file",
            args    => { snippet     => "<html>foo</html>",
                         file_names  => [ "plain.txt" ] },
            results => "0,0,$low,html,txt",
        },
        {
            name    => "Incorrectly named TIFF image file.",
            args    => { snippet     => "MM\x00\x2A",
                         file_names  => [ "plain_data" ] },
            results => "0,0,$low,tiff",
        },
        {
            name    => "Bogus JPEG file.",
            args    => { snippet    => "bogosity",
                         file_names => [ "fake.jpg" ] },
            results => "0,1,$low,jpeg",
        },
        {
            name    => "GIF image which thinks it's a JPEG.",
            args    => { snippet    => "GIF89afoo",
                         file_names => [ "fake.jpg" ] },
            results => "0,0,$low,gif,jpeg",
        },
        {
            name    => "Windows executable with correct name.",
            args    => { snippet    => "MZfoo",
                         file_names => [ "fake.exe" ] },
            results => "0,0,$high,exe",
        },
        {
            name    => "Windows executable with wrong name.",
            args    => { snippet    => "MZfoo",
                         file_names => [ "fake.txt" ] },
            results => "1,0,$high,exe,txt",
        },
        {
            name    => "Blacklisted WMF file without magic.",
            args    => { snippet     => "bogosity",
                         blacklisted => [ "wmf" ],
                         file_names  => [ "blacklisted.wmf" ] },
            results => "1,0,$medium,wmf",
        },
        {
            name    => "Unblacklisted WMF file with valid magic.",
            args    => { snippet     => "\xD7\xCD\xC9\x9A",
                         file_names  => [ "blacklisted.wmf" ] },
            results => "0,0,$medium,wmf",
        },
        {
            name    => "WMF file pretending to be a JPEG.",
            args    => { snippet     => "\xD7\xCD\xC9\x9A",
                         file_names  => [ "evil.jpg" ] },
            results => "1,0,$medium,wmf,jpeg",
        },
    );
    foreach my $t (@tests)
    {
        my ($e, $s, $r, @info) = check_file_type(%{ $t->{args} });
        my $results = join(',', $e, $s, $r, map { $_->{id} } @info );
        if ($t->{results} eq $results)
        {
            print "OK  $t->{name} ($results)\n";
        }
        else
        {
            print "BAD $t->{name}\n*** $results != $t->{results} \n";
        }
    }
    
    # Test filetype validity checker
    if (my $unknown = unknown_file_types("exe", "txt", "html", "js"))
    {
        print "BAD Filetype validity checker doesn't recognize '$unknown'\n"; 
    }
    else
    {
        print "OK  Filetype validity checker is OK.\n"; 
    }
    if ("bogus" eq unknown_file_types("exe", "txt", "html", "js", "bogus"))
    {
        print "OK  Filetype validity checker doesn't recognize 'bogus'.\n"; 
    }
    else
    {
        print "BAD Filetype validity checker recognizes 'bogus'!\n"; 
    }
    if (9 == known_file_types())
    {
        print "OK  List of filetypes is ok.\n"; 
    }
    else
    {
        print "BAD List of filetypes is silly!\n"; 
    }


}

1;

# vi:ts=4
