kinosearch1::docs::tutorial(3pm) debian man page

KinoSearch1::Docs::Tutorial(3pm)			User Contributed Perl Documentation			  KinoSearch1::Docs::Tutorial(3pm)

NAME
       KinoSearch1::Docs::Tutorial - sample indexing and search applications

DESCRIPTION
       The following sample code for invindexer.plx and search.cgi can be used to create a simple search engine. It requires the html presentation
       of the US Constitution included in the distribution for KinoSearch1, under "t/us_constitution".

       Note that a proper indexer for HTML documents would not rely on quick-and-dirty regular expressions for stripping tags, as this one does
       for the sake of brevity -- it would use a dedicated parsing module such as HTML::Parser.

   invindexer.plx
	   #!/usr/bin/perl
	   use strict;
	   use warnings;

	   use File::Spec;
	   use KinoSearch1::InvIndexer;
	   use KinoSearch1::Analysis::PolyAnalyzer;

	   ### In order for invindexer.plx to work correctly, you must modify
	   ### $source_dir, $path_to_invindex, and possibly $base_url.
	   ###
	   ### $source_dir must lead to the directory containing the US
	   ### Constitution html files.
	   ###
	   ### $path_to_invindex is the future location of the invindex.
	   ###
	   ### $base_url should reflect the location of the us_constitution directory
	   ### when accessed via a web browser.
	   my $source_dir       = '';
	   my $path_to_invindex = '';
	   my $base_url         = '/us_constitution';

	   ### Collect the names of the html files found in $source_dir.  The dot
	   ### is escaped and the pattern is anchored at the end of the name, so
	   ### only entries ending in ".html" are kept -- an unescaped dot would
	   ### also accept any name that merely contains "html" after some other
	   ### character.
	   opendir( my $source_dh, $source_dir )
	       or die "Couldn't opendir '$source_dir': $!";
	   my @filenames = grep { /\.html\z/ } readdir $source_dh;
	   closedir $source_dh or die "Couldn't closedir '$source_dir': $!";

	   ### STEP 1: Choose an Analyzer.
	   my $analyzer
	       = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );

	   ### STEP 2: Create an InvIndexer object.
	   my @invindexer_args = (
	       analyzer => $analyzer,
	       invindex => $path_to_invindex,
	       create   => 1,
	   );
	   my $invindexer = KinoSearch1::InvIndexer->new(@invindexer_args);

	   ### STEP 3: Define fields.
	   ### Each entry is the argument list for one spec_field call, in the
	   ### order the fields are declared.
	   my @field_specs = (
	       [ name => 'title' ],
	       [ name => 'bodytext', vectorized => 1 ],
	       [ name => 'url',      indexed    => 0 ],
	   );
	   for my $field_spec (@field_specs) {
	       $invindexer->spec_field( @{$field_spec} );
	   }

	   for my $filename (@filenames) {
	       # index.html is skipped rather than added to the invindex.
	       if ( $filename eq 'index.html' ) {
		   next;
	       }

	       # Read the whole file into $content in a single slurp.
	       my $filepath = File::Spec->catfile( $source_dir, $filename );
	       open( my $fh, '<', $filepath )
		   or die "couldn't open file '$filepath': $!";
	       my $content = do { local $/; <$fh> };

	       ### STEP 4: Start a new document.
	       my $doc = $invindexer->new_doc;

	       # A match in list context hands back the captured text; when the
	       # pattern fails the list is empty, the assignment is false, and
	       # the script dies.
	       my ($title) = $content =~ m#<title>(.*?)</title>#s
		   or die "couldn't isolate title in '$filepath'";
	       my ($bodytext)
		   = $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
		   or die "couldn't isolate bodytext in '$filepath'";
	       $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

	       ### STEP 5: Set the value for each field.
	       my @field_values = (
		   [ url      => "$base_url/$filename" ],
		   [ title    => $title ],
		   [ bodytext => $bodytext ],
	       );
	       for my $field_value (@field_values) {
		   $doc->set_value( @{$field_value} );
	       }

	       ### STEP 6: Add the document to the invindex.
	       $invindexer->add_doc($doc);

	       ### STEP 7: Repeat steps 4-6 for each document in the collection.
	   }

	   ### STEP 8: Finalize the invindex.
	   $invindexer->finish;

   search.cgi
	   #!/usr/bin/perl -T
	   use strict;
	   use warnings;

	   use CGI;
	   use List::Util qw( max min );
	   use POSIX qw( ceil );
	   use KinoSearch1::Searcher;
	   use KinoSearch1::Analysis::PolyAnalyzer;
	   use KinoSearch1::Highlight::Highlighter;

	   ### Read the request parameters.  Both come straight from the user, so
	   ### each one gets a safe default.  $offset is used in arithmetic and is
	   ### passed to $hits->seek further down, so it is only accepted when it
	   ### is a plain non-negative integer; anything else (missing, negative,
	   ### non-numeric) falls back to 0.  Reassigning from the capture also
	   ### untaints the value under -T.
	   my $cgi           = CGI->new;
	   my $q             = $cgi->param('q');
	   my $offset        = $cgi->param('offset');
	   my $hits_per_page = 10;
	   $q = '' unless defined $q;
	   $offset
	       = ( defined $offset && $offset =~ /\A([0-9]+)\z/ ) ? $1 : 0;

	   ### Two settings need attention before search.cgi can run:
	   ### $path_to_invindex has to point at the invindex that invindexer.plx
	   ### created, and $base_url may need adjusting so that it matches where a
	   ### web browser finds the us_constitution directory.
	   my $path_to_invindex = '';
	   my $base_url         = '/us_constitution';

	   ### STEP 1: Specify the same Analyzer used to create the invindex.
	   my $analyzer
	       = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );

	   ### STEP 2: Create a Searcher object.
	   my @searcher_args = (
	       invindex => $path_to_invindex,
	       analyzer => $analyzer,
	   );
	   my $searcher = KinoSearch1::Searcher->new(@searcher_args);

	   ### STEP 3: Feed a query to the Searcher object.
	   my $hits = $searcher->search($q);

	   ### STEP 4: Arrange for highlighted excerpts to be created.
	   my $highlighter = KinoSearch1::Highlight::Highlighter->new(
	       excerpt_field => 'bodytext',
	   );
	   $hits->create_excerpts( highlighter => $highlighter );

	   ### STEP 5: Process the search.
	   $hits->seek( $offset, $hits_per_page );

	   ### STEP 6: Format the results however you like.

	   # Build one HTML fragment per hit, then glue the fragments together
	   # into $report.
	   my @result_entries;
	   while ( my $hit = $hits->fetch_hit_hashref ) {
	       # scores are shown with three decimal places
	       my $score = sprintf( "%0.3f", $hit->{score} );
	       push @result_entries, qq|
		   <p>
		       <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
		       <em>$score</em>
		       <br>
		       $hit->{excerpt}
		       <br>
		       <span class="excerptURL">$hit->{url}</span>
		   </p>
		   |;
	   }
	   my $report = join '', @result_entries;

	   # From here on $q is only used for display, so escape it for HTML.
	   $q = CGI::escapeHTML($q);

	   # display info about the number of hits, paging links
	   my $total_hits = $hits->total_hits;
	   my $num_hits_info;
	   if ( !length $q ) {
	       # no query, no display
	       $num_hits_info = '';
	   }
	   elsif ( $total_hits == 0 ) {
	       # alert the user that their search failed
	       $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
	   }
	   else {
	       # calculate the nums for the first and last hit to display
	       my $last_result = min( ( $offset + $hits_per_page ), $total_hits );
	       my $first_result = min( ( $offset + 1 ), $last_result );

	       # display the result nums, start paging info
	       $num_hits_info = qq|
		   <p>
		       Results <strong>$first_result-$last_result</strong>
		       of <strong>$total_hits</strong> for <strong>$q</strong>.
		   </p>
		   <p>
		       Results Page:
		   |;

	       # calculate first and last hits pages to display / link to
	       # (at most 9 pages before and 10 pages after the current one)
	       my $current_page = int( $first_result / $hits_per_page ) + 1;
	       my $last_page    = ceil( $total_hits / $hits_per_page );
	       my $first_page   = max( 1, ( $current_page - 9 ) );
	       $last_page = min( $last_page, ( $current_page + 10 ) );

	       # create a url for use in paging links; make sure it carries an
	       # offset parameter so the substitutions below have digits to
	       # replace
	       my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
	       $href .= ";offset=0" unless $href =~ /offset=/;

	       # Each link is produced by rewriting the digits that follow
	       # "offset=" in $href.  The pattern must be \d+ (digits); a bare
	       # "d+" would look for the letter "d" and leave the offset
	       # untouched, making every paging link point at the same page.

	       # generate the "Prev" link
	       if ( $current_page > 1 ) {
		   my $new_offset = ( $current_page - 2 ) * $hits_per_page;
		   $href =~ s/(?<=offset=)\d+/$new_offset/;
		   $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
	       }

	       # generate paging links
	       for my $page_num ( $first_page .. $last_page ) {
		   if ( $page_num == $current_page ) {
		       # the current page is shown as plain text, not a link
		       $num_hits_info .= qq|$page_num \n|;
		   }
		   else {
		       my $new_offset = ( $page_num - 1 ) * $hits_per_page;
		       $href =~ s/(?<=offset=)\d+/$new_offset/;
		       $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
		   }
	       }

	       # generate the "Next" link
	       if ( $current_page != $last_page ) {
		   my $new_offset = $current_page * $hits_per_page;
		   $href =~ s/(?<=offset=)\d+/$new_offset/;
		   $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
	       }

	       # finish paging links
	       $num_hits_info .= "</p>\n";
	   }

	   # blast it all out
	   # First the HTTP header: the Content-type line followed by the blank
	   # line that ends the header section.
	   print "Content-type: text/html

";
	   # Then the page itself.  $base_url, $q, $report and $num_hits_info are
	   # interpolated into the heredoc; $q has already been passed through
	   # CGI::escapeHTML earlier in the script.
	   print <<END_HTML;
	   <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
	       "http://www.w3.org/TR/html4/loose.dtd">
	   <html>
	   <head>
	       <meta http-equiv="Content-type"
		   content="text/html;charset=ISO-8859-1">
	       <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
	       <title>KinoSearch: $q</title>
	   </head>

	   <body>

	       <div id="navigation">
		   <form id="usconSearch" action="">
		       <strong>
		       Search the <a href="$base_url/index.html">US Constitution</a>:
		       </strong>
		       <input type="text" name="q" id="q" value="$q">
		       <input type="submit" value="=&gt;">
		       <input type="hidden" name="offset" value="0">
		   </form>
	       </div><!--navigation-->

	       <div id="bodytext">

	       $report

	       $num_hits_info

	       <p style="font-size: smaller; color: #666">
		   <em>Powered by
		       <a href="http://www.rectangular.com/kinosearch/">
			   KinoSearch
		       </a>
		   </em>
	       </p>
	       </div><!--bodytext-->

	   </body>

	   </html>
	   END_HTML

COPYRIGHT
       Copyright 2005-2010 Marvin Humphrey

LICENSE, DISCLAIMER, BUGS, etc.
       See KinoSearch1 version 1.00.

perl v5.14.2							    2011-11-15					  KinoSearch1::Docs::Tutorial(3pm)
kinosearch1::docs::tutorial(3pm) debian man page | unix.com