Sophie

Sophie

distrib > Mandriva > 2010.0 > i586 > media > contrib-release > by-pkgid > 43bb1dd9140ac2a3090c0de3bad20e81 > files > 13

spamprobe-1.4d-7mdv2010.0.i586.rpm

#!/usr/bin/ruby
#
# A script to split a mbox file into two files with a randomly
# chosen set of emails in each file.  Used to create test cases
# for trying out different sets of filter parameters.
#

require 'digest/md5'

if ARGV.length < 2
  STDERR.printf("usage: splitmail percentage outfileprefix [filename...]\n")
  exit 1
end

# Upper bound on the number of messages written per input file.
max_messages = 8000
# Subject line of the placeholder message some mail clients keep in a folder;
# a message carrying it is never written out.
ignore_exp = /^Subject:\s+DON.T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/
# Header lines that contribute to a message's digest.
header_exp = /(^To:)|(^From:)|(^Date:)|(^Subject:)|(^Cc:)/
# mbox "From " separator line that starts a new message.
from_exp = /^From\s+\S+\s+\w\w\w\s+\w\w\w\s+\d\d?\s+\d\d\s*:\s*\d\d\s*:\s*\d\d\s+\d\d\d\d/
# Percentage (5..14, chosen once per run) of messages dropped at random.
skip_pct = 5 + rand(10)

pct = ARGV.shift.to_i
prefix = ARGV.shift
# Binary mode so message bytes are copied through untouched.
files = [ File.open(prefix + ".1", "wb"), File.open(prefix + ".2", "wb") ]
first = [ true, true ]
# Digests of every message written so far, shared across all input files.
digests = Hash.new

# Decides the fate of one complete message and writes it to one of the two
# output files when it is kept.  A message is kept only if its digest has not
# been seen before, it is not flagged as ignored, and it survives the random
# skip.  Returns true when the message was written, false otherwise.
write_message = lambda do |text, digest, ignored|
  keep = !digests.has_key?(digest) && !ignored && (rand(100) >= skip_pct)
  if keep
    # rand(100) yields 0..99, so "< pct" sends pct percent to the first file.
    filenum = (rand(100) < pct) ? 0 : 1
    if first[filenum]
      first[filenum] = false
    else
      # Separate consecutive messages in the same output file.
      files[filenum].print("\n")
    end
    files[filenum].print(text)
    digests.store(digest, 1)
  end
  keep
end

begin
  ARGV.each do |filename|
    msg_count = 0
    # Counts every message that was not written: duplicates, ignored
    # messages and randomly skipped ones.
    dup_count = 0
    digester = Digest::MD5.new
    message = String.new
    # Starts true so the very first "From " line is recognized; it is set
    # again at each blank line and cleared only when a new message starts.
    on_blank = true
    in_body = false
    ignore_it = false

    # Binary mode keeps regex matching from raising on bytes that are not
    # valid in the default external encoding.
    File.open(filename, "rb") do |input|
      input.each_line do |line|
        if on_blank && line =~ from_exp
          # A new message begins: settle the one accumulated so far.
          if message.length > 0
            if write_message.call(message, digester.hexdigest, ignore_it)
              msg_count += 1
            else
              dup_count += 1
            end
          end

          message = String.new
          on_blank = false
          in_body = false
          ignore_it = false
          digester = Digest::MD5.new
          break if msg_count >= max_messages
        elsif line.chomp.empty?
          # First blank line ends the headers; chomp also accepts CRLF.
          on_blank = true
          in_body = true
        end

        if in_body
          # Only the leading part of the message feeds the digest.
          digester.update(line) if message.length < 4096
        elsif header_exp =~ line
          digester.update(line)
          ignore_it = true if ignore_exp =~ line
        end
        message.concat(line)
      end
    end

    # Settle the last message of the file (empty after an early break).
    if message.length > 0
      if write_message.call(message, digester.hexdigest, ignore_it)
        msg_count += 1
      else
        dup_count += 1
      end
    end

    printf("split file %s with %d unique messages and %d dups\n", filename, msg_count, dup_count)
  end
ensure
  files.each { |f| f.close }
end