Sophie

Sophie

distrib > Mandriva > 2010.0 > i586 > media > contrib-release > by-pkgid > 097dd4e9e72d844830e823e5746f6c3b > files > 18

swish-e-2.4.7-1mdv2010.0.i586.rpm

#!/usr/bin/perl -w
use strict;

# This is a short example that basically does the same
# thing as the default file system access method by
# recursing directories, but also shows how to process different
# file types -- in this example pdf is converted to xml for indexing.

# in this example, only .pdf and .config files are indexed.

# the pdf2xml module is in the prog-bin directory of the swish-e distribution
use lib '../prog-bin';

use File::Find;  # for recursing a directory tree
use pdf2xml;     # example module for pdf to xml conversion
                 # Note that you need IndexContents XML .pdf in the
                 # swish-e config file

# See perldoc File::Find for information on following symbolic links

use constant DEBUG => 0;

# See if a directory was passed in via the SwishProgParameters swish
# directive

# Starting directory: the first command-line argument when it is a true
# value, otherwise the current directory.
my $dir = shift @ARGV;
$dir = '.' unless $dir;

# Walk the tree below $dir, calling wanted() for every entry.  With
# no_chdir set, File::Find does not chdir into each directory as it goes.
my %find_options = (
    wanted   => \&wanted,
    no_chdir => 1,
);

find( \%find_options, $dir );

# File::Find callback, invoked once per entry found under the start directory.
#
# Directories are ignored.  Entries whose name ends in .pdf are passed to
# pdf2xml(), entries ending in .config are passed to get_content(); in both
# cases the result is dereferenced as a scalar reference and printed to
# STDOUT.  Anything else is skipped.  When DEBUG is true, a line describing
# the decision is written to STDERR.
sub wanted {
    my $found = $File::Find::name;

    # Nothing to emit for a directory entry itself.
    return if -d $_;

    if ( $_ =~ /\.pdf$/ ) {
        print STDERR "Indexing pdf $found\n" if DEBUG;
        my $doc_ref = pdf2xml( $found );
        print ${$doc_ref};
        return;
    }

    if ( $_ =~ /\.config$/ ) {
        print STDERR "Indexing $found\n" if DEBUG;
        my $doc_ref = get_content( $found );
        print ${$doc_ref};
        return;
    }

    # Neither of the handled extensions.
    print STDERR "Skipping $found\n" if DEBUG;
    return;
}


# Read a file and return it prefixed with a small header block.
#
# Argument:
#   $path - name of the file to read.
#
# Returns a reference to a string made of three header lines
# (Content-Length, Last-Mtime, Path-Name), an empty line, and then the
# complete contents of the file.
#
# Dies with "$path: <system error>" if the file cannot be opened.
sub get_content {
    my $path = shift;

    # Three-argument open with a lexical handle: the path is used literally
    # (no whitespace stripping, no mode characters interpreted from the
    # name), and no global FH handle is shared between calls.
    open my $fh, '<', $path or die "$path: $!";

    # stat the open handle rather than the path, so the size and mtime in
    # the header describe the very file whose contents are read below.
    my ( $size, $mtime ) = ( stat $fh )[7,9];

    my $content = <<EOF;
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $path

EOF

    {
        # Slurp mode, limited to this block: read the rest of the file
        # in one go.
        local $/ = undef;
        $content .= <$fh>;
    }

    close $fh;

    return \$content;
}