Sophie

Sophie

distrib > Mandriva > 2010.0 > i586 > media > contrib-release > by-pkgid > 097dd4e9e72d844830e823e5746f6c3b > files > 37

swish-e-2.4.7-1mdv2010.0.i586.rpm

# !/usr/bin/perl -w
use strict;

## See documentation below.  Script may require customization
## read documentation with "perldoc index_hypermail.pl"

use File::Find;
use Date::Parse;
use HTML::TreeBuilder;
use Data::Dumper;

## This is the string that is removed while indexing from email addresses
## as defined in the hypermailrc file.
#
#---------------------- config -----------------------------------------------------

my $dumb_spamblock = '(at)not-real.';

#------------------------------------------------------------------------------------


my $dir = shift || die "must specfy directory to search";

debug(@ARGV) if $dir eq 'debug';

# Do all the work

find( { wanted => \&wanted }, $dir );




sub wanted {
    return if -d;  # don't need to process directories
    return unless /^\d+\.html$/;

    # If you want it to parse using HTML::Parser use the first line
    # and comment out the second.  But it's a LOT slower

    #output_file( $File::Find::name,  parse_file($_) );
    output_file( $File::Find::name,  fast_parse($_) );
}


sub output_file {
    my ( $file, $data ) = @_;

    local $SIG{__WARN__} = sub { "$file: @_" };

    # Get last_mod date
    my $date = str2time( $data->{comments}{received} );
    unless ( $data ) {
        warn "Failed to parse received date in $file\n";
        $date = str2time( $data->{comments}{send} );
        unless ( $date ) {
            warn "Failed to parse any dates: skipping $file\n";
            return;
        }
    }

    $data->{received} = $date;

    my $comments = $data->{comments};

    $comments->{email} =~ s/\Q$dumb_spamblock/-blabla-/;

    my $metas = join "\n", map { qq[<meta name="$_" content="$comments->{$_}">] }
        sort keys %{$data->{comments}};

    my $title = $comments->{subject} || '';

    my $html = <<EOF;
<html>
<head>
<title>$title</title>
$metas
</head>
<body>
    $data->{body}
</body>
</html>
EOF

    my $bytecount = length pack 'C0a*', $html;

    print <<EOF;
Path-Name: $file
Content-Length: $bytecount
Last-Mtime: $date
Document-Type: HTML*

EOF

    print $html;

}


sub parse_file {
    my ( $file ) = @_;

    my %data;  # Return hash of data

    my $tree = HTML::TreeBuilder->new;
    $tree->store_comments(1);  # meta data is in the comments
    $tree->warn(1);

    $tree->parse_file( $file );

    my %comments;
    # Extract out metadata
    for ( $tree->look_down( '_tag', '~comment' )) {
        my $comment = $_->attr("text");
        $comments{$1} = $2 if $comment =~ /(\w+)="([^"]+)/;
    }

    $data{comments} = \%comments if %comments;  # should die here if not.

    # Extract out the searchable content
    my $body = $tree->look_down('_tag', 'div', 'class', 'mail');
    unless ( $body ) {
        warn "$file: failed to find <div class='mail'>\n";
        return;
    }

    # Remove some sub-nodes we don't care about
    $body->look_down('_tag', 'address', 'class', 'headers')->delete;
    $body->look_down('_tag', 'span', 'id', 'received')->delete;

    $data{body} = $body->as_HTML;


    $tree->delete;

    return \%data;

}

sub fast_parse {
    my $file = shift;
    local $_;

    unless ( open FH, "<$file" ) {
        warn "Failed to open '$file'. Error: $!";
        return;
    }

    my %data;
    my %comments;


    # First parse out the comments 
    while (<FH>) {

        if ( my( $tag, $content) = /<!-- ([^=]+)="(.+)" -->$/ ) {

            unless ( $content ) {
                warn "File '$file' tag '$tag' empty content\n";
                next;
            }
            last if $tag eq 'body';  # no more comments in this section
            $comments{$tag} = $content;
        }
    }

    $data{comments} = \%comments;


    # Now grab the content


    my $end_str;  # for skipping sections
    my $body = '';

    while ( <FH> ) {
        # loo for ending tag, or maybe even the signature

        last if /<!-- body="end" -->/ || /^-- $/ || /^--$/ || /^(_|-){40,}\s*$/;

        # Look for ending tag for a skipped tag set

        if ( $end_str ) {
            $end_str = '' if /\Q$end_str/;
            next;
        }


        # These are sections to skip 
        if ( /\Q<address class="headers"/ ) {
            $end_str = "</address>";
            next;
        }

        if (/\Q<span id="received"/ ) {
            $end_str = '</div>';
            next;
        }

        $body .= $_;
    }

    $data{body} = $body;
    return \%data;
}



sub debug {
    for ( @_ ) {
        print STDERR "Debugging [$_]\n";
        my $result = parse_file( $_, 1 );
        print STDERR Dumper $result;
        output_file( $_, $result );
    }
    exit;
}
__END__

=head1 NAME

index_hypermail.pl - Parse Hypermail archive for indexing with Swish-e

=head1 SYNOPSIS

Using an example data structure like this:

    hypermail/
        archive/
        search/

Create the hypermail archive:

    $ cd hypermail
    $ hypermail -i -d archive < messages.mbox

Create a swish-e config file:

    $ cd search
    $ cat swish.conf

    # config for indexing hypermail v2.1.8 archives

    IndexDir ./index_hypermail.pl
    SwishProgParameters ../archive

    MetaNames swishtitle name email
    PropertyNames name email
    IndexContents HTML* .html
    StoreDescription HTML* <body> 100000
    UndefinedMetaTags  ignore

Copy index_hypermail.pl to the current directory.  Swish-e installs
index_hypermail.pl in the $prefix/share/doc/swish-e/examples/prog-bin directory, where
$prefix is typically "/usr/local" or simply "/usr" on some distributions.

    $ cp /usr/local/share/doc/swish-e/example/prog-bin/index_hypermail .

Then 


Index the documents:

    $ swish-e -c swish.conf -S prog

Now create the search interface:

    $ cp /usr/local/lib/swish-e/swish.cgi .
    $ cat .swishcgi.conf

    $ENV{TZ} = 'UTC'; # display dates in UTC format

    return {
        title           => "Search the Foo List Archive",
        display_props   => [qw/ name email swishlastmodified /],
        sorts           => [qw/swishrank swishtitle email swishlastmodified/],
        metanames       => [qw/swishdefault swishtitle name email/],
        name_labels     => {
            swishrank           =>  'Rank',
            swishtitle          =>  'Subject Only',
            name                =>  "Poster's Name",
            email               =>  "Poster's Email",
            swishlastmodified   =>  'Message Date',
            swishdefault        =>  'Subject & Body',
        },

        highlight       => {
            package         => 'SWISH::PhraseHighlight',

            xhighlight_on    => '<font style="background:#FFFF99">',
            xhighlight_off   => '</font>',

            meta_to_prop_map => {   # this maps search metatags to display properties
                swishdefault    => [ qw/swishtitle swishdescription/ ],
                swishtitle      => [ qw/swishtitle/ ],
                email           => [ qw/email/ ],
                name            => [ qw/name/ ],
                swishdocpath    => [ qw/swishdocpath/ ],
            },
        },
    };

Setup web server (OS/web server dependent):

    /var/www # ln -s /path/to/hypermail/search
    /var/www # ln -s /path/to/hypermail/archive

and maybe tell apache to run the script:

    $ cat .htaccess 
    Deny from all
    <files swish.cgi>
        Allow from all
        SetHandler cgi-script
        Options +ExecCGI
    </files>


=head1 DESCRIPTION

This script is used to parse files produced by hypermail.
Last tested with hypermail pre-2.1.9.

It scans the directory passed as the first parameter for files matching \d+\.html
and then extracts out the content, email, name and subject.  This is then passed to
swish-e for indexing.

The swish.cgi script is used for searching the resulting index.  Configuration settings
are stored in the .swish.cgi file located in the current directory.  By default, swish.cgi
expects the current working directory to be the location of the cgi script.  On other web
servers this may not be the case and you will need to edit swish.cgi to use absolute path
names for .swishcgi.conf and the index files.


=head1 USAGE

See the SYNOPSIS above.

If you do not use the directory structure above you may need to use ReplaceRules in
the swish-e config file to adjust the paths stored in the swish-e index file.


=head1 COPYRIGHT

This library is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

Hypermail can be downloaded from:

   http://hypermail.org

=head1 AUTHOR

Bill Moseley moseley@hank.org. 2004

=head1 SUPPORT

Please contact the Swish-e discussion email list for support with this module
or with Swish-e.  Please do not contact the developers directly.