-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextpressoGeneralTasks.pm
executable file
·99 lines (77 loc) · 1.93 KB
/
TextpressoGeneralTasks.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package TextpressoGeneralTasks;
# Package provide class and methods for
# tasks related to processing and maintaining
# the Textpresso system. These are routines
# that are used throughout the system.
#
# (c) 2005 Hans-Michael Muller, Caltech, Pasadena.
use strict;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(ReadLexica FindRelevantEntries GetLines GetStopWords ascending descending);
sub GetLines {
my $plainfile = shift;
my $chomped = shift;
my @lines = ();
open (PLAIN, "$plainfile") || return @lines;
while (my $line = <PLAIN>) {
if ($chomped) {
chomp($line);
}
push @lines, $line;
}
close (PLAIN);
return @lines;
}
sub GetStopWords {
my $stopwordfile = shift;
my %stopwords = ();
open (IN, "<$stopwordfile") || return %stopwords;
while (my $line = <IN>) {
chomp ($line);
$line =~ s/\s+//g;
$stopwords{$line} = 1;
}
close (IN);
return %stopwords;
}
sub ReadLexica {
use File::Basename;
my $dirin = shift;
my $del = shift;
my %lexicon = ();
my @lexfiles = <$dirin/*>;
foreach my $file (@lexfiles) {
(my $fname, my $fdir, my $fsuf) = fileparse($file, qr{\.\d+-gram});
$fsuf =~ s/^\.(\d+)-gram/$1/;
open (IN, "<$file");
my $inline = '';
while (my $line = <IN>) {
$inline .= $line;
}
my @entries = split (/$del\n/, $inline);
foreach my $entry (@entries) {
my @items = split (/\n/, $entry);
my $ukey = shift(@items);
@{$lexicon{$ukey}{$fname}} = @items;
}
close (IN);
}
return %lexicon;
}
sub FindRelevantEntries {
my $line = shift;
my $pLexicon = shift;
my %list = ();
foreach my $phrase (keys % { $pLexicon }) {
foreach my $category (keys % { $$pLexicon{$phrase} }) {
if ($line =~ m/$phrase/) {
$list{$phrase}{$category} = 1;
}
}
}
return %list;
}
sub ascending { $a <=> $b }
sub descending { $b <=> $a }
1;