Speed up find-german-comments: speed up text_cat -s
follow-up commit to https://gerrit.libreoffice.org/#/c/9226/ . text_cat -s was documented as "Not very efficient yet, because language models are re-loaded after each line." So if we want to use text_cat -s for thousands of lines, better read the language models only once and store them. When tested on svl/source, the speed-up was a further factor of 1.5 (reduced from 6 s to 4 s). Change-Id: I654a250b0e369e01c5eac5970b64df1390f0ef35 Reviewed-on: https://gerrit.libreoffice.org/9227 Reviewed-by: Michael Meeks <michael.meeks@collabora.com> Tested-by: Michael Meeks <michael.meeks@collabora.com>
This commit is contained in:
committed by
Michael Meeks
parent
6efd972591
commit
cda4ee0c50
@@ -8,6 +8,8 @@ use Getopt::Std;
|
|||||||
use Benchmark;
|
use Benchmark;
|
||||||
|
|
||||||
my $non_word_characters='0-9\s';
|
my $non_word_characters='0-9\s';
|
||||||
|
my @languages; # languages (sorted by name)
|
||||||
|
my %ngram_for; # map language x ngram => rang
|
||||||
|
|
||||||
# OPTIONS
|
# OPTIONS
|
||||||
getopts('a:d:f:hi:lnst:u:v');
|
getopts('a:d:f:hi:lnst:u:v');
|
||||||
@@ -94,31 +96,11 @@ if ($opt_n) {
|
|||||||
classify(input());
|
classify(input());
|
||||||
}
|
}
|
||||||
|
|
||||||
# CLASSIFICATION
|
sub read_model {
|
||||||
sub classify {
|
my ($file) = @_;
|
||||||
my ($input)=@_;
|
open(LM,"$file") or die "cannot open $file: $!\n";
|
||||||
my %results=();
|
my %ngram;
|
||||||
my $maxp = $opt_t;
|
|
||||||
# open directory to find which languages are supported
|
|
||||||
opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
|
|
||||||
my @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
|
|
||||||
closedir DIR;
|
|
||||||
@languages or die "sorry, can't read any language models from $opt_d\n" .
|
|
||||||
"language models must reside in files with .lm ending\n";
|
|
||||||
|
|
||||||
|
|
||||||
# create ngrams for input. Note that hash %unknown is not used;
|
|
||||||
# it contains the actual counts which are only used under -n: creating
|
|
||||||
# new language model (and even then they are not really required).
|
|
||||||
my @unknown=create_lm($input);
|
|
||||||
# load model and count for each language.
|
|
||||||
my $language;
|
|
||||||
my $t1 = new Benchmark;
|
|
||||||
foreach $language (@languages) {
|
|
||||||
# loads the language model into hash %$language.
|
|
||||||
my %ngram=();
|
|
||||||
my $rang = 1;
|
my $rang = 1;
|
||||||
open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
|
|
||||||
while (<LM>) {
|
while (<LM>) {
|
||||||
chomp;
|
chomp;
|
||||||
# only use lines starting with appropriate character. Others are
|
# only use lines starting with appropriate character. Others are
|
||||||
@@ -127,14 +109,43 @@ sub classify {
|
|||||||
$ngram{$&} = $rang++;
|
$ngram{$&} = $rang++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
close(LM);
|
return \%ngram;
|
||||||
#print STDERR "loaded language model $language\n" if $opt_v;
|
}
|
||||||
|
|
||||||
|
sub read_models {
|
||||||
|
# open directory to find which languages are supported
|
||||||
|
opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
|
||||||
|
@languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
|
||||||
|
closedir DIR;
|
||||||
|
@languages or die "sorry, can't read any language models from $opt_d\n" .
|
||||||
|
"language models must reside in files with .lm ending\n";
|
||||||
|
|
||||||
|
foreach my $language (@languages) {
|
||||||
|
$ngram_for{$language} = read_model("$opt_d/$language.lm");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# CLASSIFICATION
|
||||||
|
sub classify {
|
||||||
|
my ($input)=@_;
|
||||||
|
my %results=();
|
||||||
|
my $maxp = $opt_t;
|
||||||
|
read_models() if !@languages;
|
||||||
|
|
||||||
|
# create ngrams for input. Note that hash %unknown is not used;
|
||||||
|
# it contains the actual counts which are only used under -n: creating
|
||||||
|
# new language model (and even then they are not really required).
|
||||||
|
my @unknown=create_lm($input);
|
||||||
|
|
||||||
|
my $t1 = new Benchmark;
|
||||||
|
foreach my $language (@languages) {
|
||||||
# compares the language model with input ngrams list
|
# compares the language model with input ngrams list
|
||||||
|
my $ngram = $ngram_for{$language} or die "no ngrams for $language";
|
||||||
|
|
||||||
my ($i,$p)=(0,0);
|
my ($i,$p)=(0,0);
|
||||||
while ($i < @unknown) {
|
while ($i < @unknown) {
|
||||||
if ($ngram{$unknown[$i]}) {
|
if ($ngram->{$unknown[$i]}) {
|
||||||
$p=$p+abs($ngram{$unknown[$i]}-$i);
|
$p=$p+abs($ngram->{$unknown[$i]}-$i);
|
||||||
} else {
|
} else {
|
||||||
$p=$p+$maxp;
|
$p=$p+$maxp;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user