#!/usr/bin/perl -w
#
# This code functions both on the networked NSRL environment
# and on standalone boxes. There is a lot of bloat
# to accomplish this at this time.
# --------------------------------------------------------------------
# The software provided here is released by the National
# Institute of Standards and Technology (NIST), an agency of
# the U.S. Department of Commerce, Gaithersburg MD 20899,
# USA. The software bears no warranty, either expressed or
# implied. NIST does not assume legal liability nor
# responsibility for a User's use of the software or the
# results of such use.
#
# Please note that within the United States, copyright
# protection, under Section 105 of the United States Code,
# Title 17, is not available for any work of the United
# States Government and/or for any works created by United
# States Government employees. User acknowledges that this
# software contains work which was created by NIST employees
# and is therefore in the public domain and not subject to
# copyright. The User may use, distribute, or incorporate
# this software provided the User acknowledges this via an
# explicit acknowledgment of NIST-related contributions to
# the User's work. User also agrees to acknowledge, via an
# explicit acknowledgment, that any modifications or
# alterations have been made to this software before
# redistribution.
#

use strict;

use NSRL::Config;
use NSRL::Hash;
use NSRL::Magic;
use NSRL::Unarc;
use NSRL::Log;
use Getopt::Std;

# Package-level state shared between the top-level script and its subs.
our (
    $ret,              # return value of the config import
    %root_cfgs,        # root-level configuration, overridden by the command line
    @OSconfigs,        # operating systems known to the config file
    %oshash,           # OS-level configuration for the running OS
    @OSarcs,           # archive types (by magic) supported on this OS
    @OSextns,          # archive types (by file extension) supported on this OS
    @use_hashes,       # names of the hashes to compute
    $appstring,        # support application string for the current archive
    $recurse_level,    # depth counter, used in the sandbox directory name
    $dispatcher,
);

# Set up logging
# For Knoppix use, logging is off by default as we cannot assume space exists
# for the log file to be written, or where that space may be.
nsrlLog( 'emergency', '/dev/null' );
#nsrlLog( 'debug', 'NSRLHashing.log' );

# Core modules used for the directory walk and for sandbox management.
# They are called fully qualified, so nothing is imported into this package.
use File::Find ();
use File::Path ();

# command line options override config file values
if ( reconcile_config_and_command_line() ) {
    die "\n$0 : configuration file or command line error.\n";
}

$recurse_level = -1;    # counter of depth, used in sandbox dir name

# get to work!
dirhash( $root_cfgs{DirToHash} );

exit;

# -------------------------------------
# reconcile_config_and_command_line()
#
# Parses the command line, loads the configuration file and merges both into
# the package-level %root_cfgs (command line values win).  Also fills
# @OSconfigs, %oshash, @OSarcs, @OSextns and @use_hashes.
#
# Returns 0 on success.  Every failure path terminates the program instead of
# returning: die() for configuration problems, exit(1) after a message on
# STDERR for missing arguments.  -h prints the usage text and exits with 0.
sub reconcile_config_and_command_line {
    use vars qw( $opt_a $opt_b $opt_B $opt_c $opt_C $opt_d $opt_D $opt_h
                 $opt_l $opt_L $opt_m $opt_M $opt_N $opt_o $opt_s $opt_U
                 $opt_X );

    # -------------------------------------
    # Read the command line arguments, there may be a config file given
    #    key                what
    #  ------              ------
    #   -a AppID_Int        int, AppID
    #   -c file             config file
    #   -C                  do NOT use CRC32, replace with "00000000"
    #   -d Debug_Level      int, Debug message level
    #   -D directory        directory tree to hash (default is .)
    #   -h                  help
    #   -m MediaID          string, MediaID
    #   -M magicfile        file with magic strings and NSRL codes
    #   -U                  toggle hashes to lowercase (default is uppercase)
    #   -X                  don't bother with magic file signature checks
    #  ------              ------
    # Historically documented but NOT accepted by the getopts() string below:
    #   -b int (bytes to chop file off at), -B int (offset for -b),
    #   -l logfile, -L (use local directories), -N (do NOT use ODBC)
    # NOTE(review): -U is accepted but $opt_U is never read in this file.

    # deal with command line options
    my $bad_usage = !getopts("a:c:Cd:D:hm:M:UX");

    if ( $bad_usage || $opt_h ) {
        print STDERR "\nUsage: $0 [-h] -a appid -m mediaid [-c configfile] [-C] [-D dir] [-U] [-X]";
        print STDERR "\n\t-h : show this help information";
        print STDERR "\n\t-a APPID\t must be an integer";
        print STDERR "\n\t-c CONFIG_FILE\t filename";
        print STDERR "\n\t-C : do NOT use CRC32, replace with \"00000000\"";
        print STDERR "\n\t-d LEVEL\t debug message level, an integer";
        print STDERR "\n\t-D directory\tlocation to perform the hashing";
        print STDERR "\n\t-m MEDIAID\t must be a string, quote if needed";
        print STDERR "\n\t-M MAGIC_FILE\t file with magic strings and NSRL codes";
        print STDERR "\n\t-U : toggle hashes to lowercase (default is uppercase)";
        print STDERR "\n\t-X : toggle use of Magic (default is use)";
        print STDERR "\nThis program hashes a directory tree and produces";
        print STDERR "\noutput in the NIST NSRL RDS 2.0 format for the file";
        print STDERR "\nnamed NSRLFile.txt.";
        print STDERR "\n";

        # a rejected command line is an error; an explicit -h is not
        exit( $bad_usage ? 1 : 0 );
    }

    # if the config file arg was used, see if the file exists
    # and use it, otherwise look for a default config.
    my $cfg_label = "config file";
    if ( !$opt_c ) {
        $opt_c     = "default.cfg";
        $cfg_label = "default config file";
    }
    if ( !-e $opt_c ) {
        die "\n$0 : $cfg_label $opt_c does not exist\n";
    }

    # load config data
    # NOTE(review): the return value is stored but never inspected here --
    # confirm whether import_nsrlcfg() can report a failure through it.
    $ret = import_nsrlcfg($opt_c);

    # create a hash of root level configuration data.
    %root_cfgs = root_config();
    $root_cfgs{ConfigFile} = $opt_c;

    # create an array of the operating systems known by the config file
    @OSconfigs = os_list();
    if ( !grep { $_ eq $^O } @OSconfigs ) {
        die "\n$0 : possible invalid OS\n\tYou appear to be running on $^O\n\tcheck the config file $opt_c for support\n";
    }

    # create a hash of OS level configuration data.
    %oshash = os_config($^O);

    # create an array of the supported archive types for a given OS
    # using file()/magic number value, or...
    @OSarcs = os_archives($^O);

    # ...using the filename extension
    @OSextns = os_extensions($^O);

    if ( !defined $root_cfgs{use_hashes} ) {
        print STDERR "$0 : cannot determine hashes to use\n";
        exit 1;
    }
    @use_hashes = split( /,/, $root_cfgs{use_hashes} );

    # -C switches CRC use off if the config file switched it on
    if (   $opt_C
        && exists $root_cfgs{use_CRC}
        && lc( $root_cfgs{use_CRC} ) eq "true" )
    {
        $root_cfgs{use_CRC} = "false";
    }

    # use magic or not
    if (   $opt_X
        && exists $root_cfgs{magic}
        && lc( $root_cfgs{magic} ) eq "true" )
    {
        $root_cfgs{magic} = "false";
    }

    # the config file may not mention magic at all, so test definedness first
    if ( defined $root_cfgs{magic} && $root_cfgs{magic} eq "true" ) {
        if ( $opt_M && -e $opt_M ) {
            $root_cfgs{magicfile} = $opt_M;
        }
        if ( !defined $root_cfgs{magicfile} || !-e $root_cfgs{magicfile} ) {
            die "\n$0 : cannot use magic without a valid magic file \n";
        }
    }

    # missing -a or -m is a fatal mistake
    if ( !defined $opt_a ) {
        print STDERR "\n$0 missing appid argument\n";
        exit 1;
    }

    # the appid becomes part of the sandbox path, so insist on an integer
    if ( $opt_a !~ /\A\s*-?\d+\s*\z/ ) {
        die "\n$0 : appid \"$opt_a\" is not an integer\n";
    }
    $root_cfgs{AppID_Int} = int($opt_a);

    if ( !defined $opt_m ) {
        print STDERR "\n$0 missing mediaid argument\n";
        exit 1;
    }
    $root_cfgs{Media_ID} = $opt_m;

    # if no directory given to be hashed, use .
    if ( !defined $opt_D ) {
        $opt_D = ".";
    }
    if ( !-d $opt_D ) {
        die "\n$0 : invalid dir \"$opt_D\" specified for hashing.\n";
    }
    $root_cfgs{DirToHash} = $opt_D;

    # fixed values; there are no command line switches for these
    $opt_o                   = "GEN";
    $root_cfgs{OS_code}      = $opt_o;
    $opt_s                   = "";
    $root_cfgs{Special_code} = $opt_s;

    if ( ( defined $opt_d ) && ( int($opt_d) > -1 ) ) {
        $root_cfgs{debug} = int($opt_d);
    }
    if ( ( !defined $root_cfgs{debug} ) || ( $root_cfgs{debug} < 0 ) ) {
        $root_cfgs{debug} = 0;
    }

    if ( $root_cfgs{debug} > 0 ) {
        print STDERR "\n\t\$root_cfgs keys and values\n";
        foreach my $k ( sort keys %root_cfgs ) {
            print STDERR "$k => $root_cfgs{$k}\n";
        }
    }

    return (0);
}

# -----------
# dirhash(directory)
#
# Hashes every plain file below the given directory and prints one row per
# file on STDOUT.  Files recognised as archives (by magic string or, failing
# that, by file name extension) are expanded into a sandbox directory and that
# sandbox is hashed recursively.
#
# Returns 0 in all cases; an undefined or non-directory argument is ignored.
sub dirhash {
    my $target = shift;

    if ( !defined $target ) {
        return (0);
    }

    # get rid of trailing slash, if any
    $target =~ s{/\z}{};

    if ( !-d $target ) {
        return (0);
    }

    # bump up the level, get ready to extract into new sandbox area.
    # local() restores the previous depth however this sub is left, so the
    # counter cannot drift when a nested call returns early.
    local $recurse_level = $recurse_level + 1;
    my $rec_lev = sprintf( "%05d", $recurse_level );

    # find the list of plain files in this dir.  The walk happens in-process:
    # no shell sees the directory name and file names may contain any byte.
    my @findlist;
    File::Find::find(
        {
            no_chdir => 1,
            wanted   => sub {

                # lstat, not stat: a symlink to a file is not a plain file
                lstat($File::Find::name) or return;
                push @findlist, $File::Find::name if -f _;
            },
        },
        $target
    );

    if ( $root_cfgs{debug} > 0 ) {
        print STDERR "$_\n" for @findlist;
        print STDERR "\n";
    }

    my $use_magic = defined $root_cfgs{magic}
        && $root_cfgs{magic} eq "true";
    my $use_extensions = defined $root_cfgs{extensions}
        && $root_cfgs{extensions} eq "true";

    # work down through the list, last found file first
    foreach my $fullfilename ( reverse @findlist ) {

        # byte size; -s yields a false (possibly empty) value for an empty
        # or vanished file, which must still be printed as a number
        my $bytes = -s $fullfilename;
        $bytes = 0 unless $bytes;

        # relative name per the target directory
        my $relfilename = substr( $fullfilename, length($target) + 1 );

        # hash the file
        my @got_hashes = getFileHashes( $fullfilename, @use_hashes );

        # output the results
        # NOTE(review): the first returned element is discarded -- presumably
        # it is not a hash value; confirm against NSRL::Hash.
        shift @got_hashes;
        push( @got_hashes, $relfilename );
        my $NSRLFileRow = "\""
            . join( "\",\"", @got_hashes ) . "\","
            . $bytes
            . ",0,\"UNK\",\"\"\n";
        print $NSRLFileRow;

        #
        # DRW 3/18/05 -- Here is where the magic starts...
        #
        my $IDedByMagic = 0;

        # the magic lookup is skipped entirely when magic is switched off (-X)
        if ($use_magic) {
            my $nsrlstring =
                getFileDesc( $fullfilename, "NSRL", $root_cfgs{magicfile} );

            # see if this in the NSRL format, and is a valid archive type
            # drw 05/03/15 added defined check
            if (   ( defined $nsrlstring )
                && ( index( $nsrlstring, "[NSRL|" ) > -1 )
                && ( index( $nsrlstring, "|NARC]" ) < 1 )
                && ( index( $nsrlstring, "|]" ) < 1 ) )
            {
                # derive the archive type from the magic string
                my @parts    = split( /[\[\|\]]/, $nsrlstring );
                my $arc_code = $parts[3];

                # check the archive type against the known unarchive apps
                # for our OS
                if ( defined $arc_code && grep { $_ eq $arc_code } @OSarcs ) {

                    # create the support app string for an OS and archive type
                    $appstring = archive_support( $^O, $arc_code );
                    $IDedByMagic =
                        _expand_and_recurse( $appstring, $fullfilename,
                        $relfilename, $rec_lev );
                }
                elsif ( $root_cfgs{debug} > 0 ) {

                    # we can't unarchive this type on this OS, log that fact
                    print STDERR "\nunopened archive on \[$^O\] ; check the config file ",
                        $root_cfgs{ConfigFile}, " for support\n";
                }
            }
        }

        # START of extension processing
        #
        # DRW - this if should check supported extension actions, too
        #
        # Runs only for files that magic did not already expand.  The
        # extension is taken from the file's base name, so dots in directory
        # names are not mistaken for an extension.
        if (   $use_extensions
            && !$IDedByMagic
            && $relfilename =~ m{\.([^./]+)\z} )
        {
            my $arc_code = "." . lc($1);

            # if we can't unarchive this extension on this OS, DONT log that
            if ( grep { $_ eq $arc_code } @OSextns ) {

                # create the support app string for an OS and archive type
                $appstring = extension_support( $^O, $arc_code );
                _expand_and_recurse( $appstring, $fullfilename, $relfilename,
                    $rec_lev );
            }
        }

        # END of extension processing
    }    # file list

    return (0);
}

# -----------
# _expand_and_recurse(appstring, fullfilename, relfilename, rec_lev)
#
# Expands one archive into its sandbox directory, hashes the sandbox with
# dirhash() and removes the sandbox again.
#
# Returns 1 when the archive was expanded and hashed, 0 otherwise.
sub _expand_and_recurse {
    my ( $app, $fullfilename, $relfilename, $rec_lev ) = @_;

    # one directory per recursion level, one sub directory per archive
    # should probably have a PID in the path for cleanup later
    my $level_dir =
        "/tmp/sandbox/$root_cfgs{AppID_Int}/$root_cfgs{Media_ID}/$rec_lev";
    my $sandbox = "$level_dir/$relfilename";

    # create the sandbox dir to extract the archive.  mkpath() croaks on
    # failure; an archive whose sandbox cannot be made is skipped.
    if ( !-d $sandbox ) {
        my $made = eval { File::Path::mkpath($sandbox); 1 };
        if ( !$made ) {
            if ( $root_cfgs{debug} > 0 ) {
                print STDERR "\ncannot create sandbox $sandbox : $@\n";
            }
            File::Path::rmtree($level_dir) if -d $level_dir;
            return 0;
        }
    }

    # destination file name for expansions that need one
    my $filename = _expanded_name($relfilename);

    # do the extract
    if ( $root_cfgs{debug} > 0 ) {
        print STDERR "expand( $app, $fullfilename , $sandbox, $filename )\n";
    }
    my $result = expand( $app, $fullfilename, $sandbox, $filename );

    if ( !defined $result ) {

        # nothing usable was extracted; do not leave the sandbox behind for
        # the next archive's recursion to pick up
        File::Path::rmtree($level_dir);
        return 0;
    }

    # debug text goes to STDERR so it cannot corrupt the rows on STDOUT
    if ( $root_cfgs{debug} > 0 ) {
        print STDERR "\n$result\n";
    }

    # ##############################################################
    # WARNING - this is the recursion call
    #
    dirhash($level_dir);
    File::Path::rmtree($level_dir);

    # ##############################################################

    return 1;
}

# -----------
# _expanded_name(relfilename)
#
# JT, 2005-09-26: For those expansions that require a destination file name,
# that file name should be the same as that of the archive, minus the archive
# extension.  So, e.g. foo.gz becomes foo .
#
# Returns the base name with its last extension removed, or the unchanged
# base name when removing the extension would leave nothing.
sub _expanded_name {
    my ($relfilename) = @_;

    # base name: everything after the last slash
    my ($base) = $relfilename =~ m{([^/]*)\z};

    # drop the last ".ext", if there is one
    ( my $stem = $base ) =~ s{\.[^.]*\z}{};

    return length($stem) ? $stem : $base;
}

__END__