Speed up find-german-comments: speed up text_cat -s
follow-up commit to https://gerrit.libreoffice.org/#/c/9226/ . text_cat -s was documented as "Not very efficient yet, because language models are re-loaded after each line." So if we want to use text_cat -s for thousands of lines, better read the language models only once and store them. When tested on svl/source, the speed-up was a further factor of 1.5 (reduced from 6 s to 4 s). Change-Id: I654a250b0e369e01c5eac5970b64df1390f0ef35 Reviewed-on: https://gerrit.libreoffice.org/9227 Reviewed-by: Michael Meeks <michael.meeks@collabora.com> Tested-by: Michael Meeks <michael.meeks@collabora.com>
This commit is contained in:
committed by
Michael Meeks
parent
6efd972591
commit
cda4ee0c50
@@ -8,6 +8,8 @@ use Getopt::Std;
|
|||||||
use Benchmark;
|
use Benchmark;
|
||||||
|
|
||||||
my $non_word_characters='0-9\s';
|
my $non_word_characters='0-9\s';
|
||||||
|
my @languages; # languages (sorted by name)
|
||||||
|
my %ngram_for; # map language x ngram => rang
|
||||||
|
|
||||||
# OPTIONS
|
# OPTIONS
|
||||||
getopts('a:d:f:hi:lnst:u:v');
|
getopts('a:d:f:hi:lnst:u:v');
|
||||||
@@ -94,31 +96,11 @@ if ($opt_n) {
|
|||||||
classify(input());
|
classify(input());
|
||||||
}
|
}
|
||||||
|
|
||||||
# CLASSIFICATION
|
sub read_model {
|
||||||
sub classify {
|
my ($file) = @_;
|
||||||
my ($input)=@_;
|
open(LM,"$file") or die "cannot open $file: $!\n";
|
||||||
my %results=();
|
my %ngram;
|
||||||
my $maxp = $opt_t;
|
|
||||||
# open directory to find which languages are supported
|
|
||||||
opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
|
|
||||||
my @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
|
|
||||||
closedir DIR;
|
|
||||||
@languages or die "sorry, can't read any language models from $opt_d\n" .
|
|
||||||
"language models must reside in files with .lm ending\n";
|
|
||||||
|
|
||||||
|
|
||||||
# create ngrams for input. Note that hash %unknown is not used;
|
|
||||||
# it contains the actual counts which are only used under -n: creating
|
|
||||||
# new language model (and even then they are not really required).
|
|
||||||
my @unknown=create_lm($input);
|
|
||||||
# load model and count for each language.
|
|
||||||
my $language;
|
|
||||||
my $t1 = new Benchmark;
|
|
||||||
foreach $language (@languages) {
|
|
||||||
# loads the language model into hash %$language.
|
|
||||||
my %ngram=();
|
|
||||||
my $rang = 1;
|
my $rang = 1;
|
||||||
open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
|
|
||||||
while (<LM>) {
|
while (<LM>) {
|
||||||
chomp;
|
chomp;
|
||||||
# only use lines starting with appropriate character. Others are
|
# only use lines starting with appropriate character. Others are
|
||||||
@@ -127,14 +109,43 @@ sub classify {
|
|||||||
$ngram{$&} = $rang++;
|
$ngram{$&} = $rang++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
close(LM);
|
return \%ngram;
|
||||||
#print STDERR "loaded language model $language\n" if $opt_v;
|
}
|
||||||
|
|
||||||
|
sub read_models {
|
||||||
|
# open directory to find which languages are supported
|
||||||
|
opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
|
||||||
|
@languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
|
||||||
|
closedir DIR;
|
||||||
|
@languages or die "sorry, can't read any language models from $opt_d\n" .
|
||||||
|
"language models must reside in files with .lm ending\n";
|
||||||
|
|
||||||
|
foreach my $language (@languages) {
|
||||||
|
$ngram_for{$language} = read_model("$opt_d/$language.lm");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# CLASSIFICATION
|
||||||
|
sub classify {
|
||||||
|
my ($input)=@_;
|
||||||
|
my %results=();
|
||||||
|
my $maxp = $opt_t;
|
||||||
|
read_models() if !@languages;
|
||||||
|
|
||||||
|
# create ngrams for input. Note that hash %unknown is not used;
|
||||||
|
# it contains the actual counts which are only used under -n: creating
|
||||||
|
# new language model (and even then they are not really required).
|
||||||
|
my @unknown=create_lm($input);
|
||||||
|
|
||||||
|
my $t1 = new Benchmark;
|
||||||
|
foreach my $language (@languages) {
|
||||||
# compares the language model with input ngrams list
|
# compares the language model with input ngrams list
|
||||||
|
my $ngram = $ngram_for{$language} or die "no ngrams for $language";
|
||||||
|
|
||||||
my ($i,$p)=(0,0);
|
my ($i,$p)=(0,0);
|
||||||
while ($i < @unknown) {
|
while ($i < @unknown) {
|
||||||
if ($ngram{$unknown[$i]}) {
|
if ($ngram->{$unknown[$i]}) {
|
||||||
$p=$p+abs($ngram{$unknown[$i]}-$i);
|
$p=$p+abs($ngram->{$unknown[$i]}-$i);
|
||||||
} else {
|
} else {
|
||||||
$p=$p+$maxp;
|
$p=$p+$maxp;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user