Sophie

Sophie

distrib > Mandriva > 2010.0 > i586 > media > contrib-release > by-pkgid > 097dd4e9e72d844830e823e5746f6c3b > files > 35

swish-e-2.4.7-1mdv2010.0.i586.rpm

package doc2txt;
use strict;

=pod

=head1 NAME

doc2txt - swish-e sample module to convert MS Word docs to text

=head1 SYNOPSIS

    use doc2txt;
    my $doc_record_ref = doc2txt( $doc_file_name );

    # or by passing content in a scalar reference
    my $doc_text_ref = doc2txt( \$doc_content );


    

=head1 DESCRIPTION

Sample module for use with other swish-e 'prog' document source programs.

Pass either a file name, or a scalar reference.

The difference is that when you pass a reference to a scalar,
only the converted content is returned.  When you pass a file name,
an entire record is returned, ready to be fed to swish -- this
includes the headers required by swish for indexing.


=head1 REQUIREMENTS

Uses the catdoc program.  http://www.fe.msk.ru/~vitus/catdoc/

You may need to adjust the parameters used to call catdoc.

You will also need the module File::Temp available from CPAN if passing content
to this module (instead of a file name).  I'm not thrilled about how that
currently works...


=head1 AUTHOR

Bill Moseley

=cut

use Symbol;


use vars qw(
    @ISA
    @EXPORT
    $VERSION
);

# $Id: doc2txt.pm 1279 2003-06-12 04:00:45Z whmoseley $
#
# Derive $VERSION from the revision keyword.  The keyword may expand to a
# dotted revision ("1.5" -> "1.05") or to a plain integer ("1279" ->
# "1279.00"), so the minor part is optional.  Requiring the dot, as this
# code used to, never matches a plain revision and left $VERSION stuck at
# "0.00".  An unexpanded keyword still yields "0.00".
$VERSION = do {
    my ( $major, $minor ) = q$Revision: 1279 $ =~ /: (\d+)(?:\.(\d+))?/;
    sprintf '%d.%02d', $major || 0, $minor || 0;
};

# Export doc2txt() into the caller's namespace by default.
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw(doc2txt);

# NOTE(review): @InfoTags is not referenced by any code visible in this
# file -- presumably left over from a sibling converter module; confirm
# before removing.
my @InfoTags = qw/Title Subject Author CreationDate Creator Producer ModDate Keywords/;

# Command (program name plus options, whitespace separated) used to run
# catdoc.
# NOTE(review): no trailing "1;" is visible in this file, so this
# assignment appears to be what makes "require doc2txt" return true --
# keep it as the last top-level statement unless a "1;" is added.
my $catdoc = 'catdoc -a';  # how cat doc is called. Rainer uses catdoc -s8859-1 -d8859-1


# doc2txt( $file_name | \$content )
#
# Convert an MS Word document to text by running catdoc on it.
#
# Argument: either the name of a Word file, or a reference to a scalar
#           holding the document's content (which is first written to a
#           temporary file by create_temp_file()).
# Returns:  a scalar reference.  When content was passed in, it refers to
#           the extracted text only.  When a file name was passed in, it
#           refers to a complete record: Content-Length, Last-Mtime and
#           Path-Name headers, a blank line, then the extracted text.
#
# If catdoc cannot be started, or exits with a failure status, a warning
# is issued and whatever text was produced (possibly none) is returned.
sub doc2txt {
    my $file_or_content = shift;

    my $is_content = ref $file_or_content;

    my $file = $is_content
        ? create_temp_file( $file_or_content )
        : $file_or_content;

    # Run catdoc through a list-form pipe open so that no shell ever sees
    # the file name.  Wrapping the name in double quotes and escaping only
    # '"' still let the shell interpret '$', backticks and backslashes.
    # $catdoc is split on whitespace, so its options must not contain
    # embedded spaces.  (List-form pipe open needs Perl 5.8, or 5.22 on
    # Windows.)
    my @command = ( split( ' ', $catdoc ), $file );

    my $content = '';
    if ( open my $pipe, '-|', @command ) {
        local $/;    # slurp all of catdoc's output at once
        my $output = <$pipe>;
        $content = $output if defined $output;

        # close() on a pipe is false when the child exits non-zero; in
        # that case $! is 0 and the exit status is in $?.
        unless ( close $pipe ) {
            my $why = $! ? "$!" : "exit status $?";
            warn "doc2txt: '@command' failed: $why\n";
        }
    }
    else {
        warn "doc2txt: cannot run '$command[0]': $!\n";
    }

    if ($is_content) {
        # The temporary file has served its purpose.  Remove it now rather
        # than at program exit, so long indexing runs do not accumulate
        # one temp file per document.
        unlink $file;
        return \$content;
    }

    # otherwise build the headers

    my $mtime = ( stat $file )[9];

    my $size = length $content;

    my $ret = <<EOF;
Content-Length: $size
Last-Mtime: $mtime
Path-Name: $file

EOF

    $ret .= $content;

    return \$ret;
}


# This is the portable way to do this, I suppose.
# Otherwise, just create a file in the local directory.

# create_temp_file( \$content )
#
# Write the referenced content to a freshly created temporary file.
#
# Argument: a reference to a scalar holding the raw document content.
# Returns:  the name of the temporary file that now holds that content.
# Dies if the content cannot be written or the handle cannot be closed.
#
# File::Temp is loaded lazily, so it is only required when content (rather
# than a file name) is handed to this module.  UNLINK => 1 asks File::Temp
# to remove the file when the program exits.
sub create_temp_file {
    my ($content_ref) = @_;

    require File::Temp;

    my ( $tmp_fh, $file_name ) = File::Temp::tempfile( UNLINK => 1 );

    unless ( print {$tmp_fh} ${$content_ref} ) {
        die $!;
    }

    # Buffered write errors only surface at close, so check it too.
    unless ( close $tmp_fh ) {
        die "Failed to close '$file_name' $!";
    }

    return $file_name;
}