-
Notifications
You must be signed in to change notification settings - Fork 0
/
abbreviations.py
executable file
·290 lines (246 loc) · 13.2 KB
/
abbreviations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# abbreviations.py
# by Hugo Labrande
# Version: 8-Jul-2021
# This script finds very good abbreviations for your Inform game, which helps when space is tight.
# It finds up to 20% more savings than Inform's "-u" switch ; this is about 4k on a 128k story file.
# As a bonus, it's just as fast as that switch when you look at abbreviations of length between 2 and 9.
# Use the Inform 6.36 version of the compiler for even more savings!
# Other programs doing this exist: https://intfiction.org/t/highly-optimized-abbreviations-computed-efficiently/48753
# Thank you to Henrik Asman and Matthew Russotto for the discussion and their programs. You'll find them at https://github.com/heasm66/ZAbbrevMaker and https://gitlab.com/russotto/zilabbrs
# Abbreviation strings still aren't an exact science!
# Due to the fact that the number of 5-bit units has to be a multiple of 3 (and as such gets padded), we can only give estimates
# Input: gametext.txt file containing all the text in your game
# If the flag -f is specified, the input should be the one of Inform 6.35's -r flag with $TRANSCRIPT_FORMAT=1
# If no flag is specified, it should just be a game text, for instance Inform with -r (default format, $TRANSCRIPT_FORMAT=0), or
# cat *.zap | grep -o '".*"' | sed 's/"\(.*\)"/\1/g' >gametext.txt
# (don't forget to exclude the dictionary words though)
# You then have to tweak how many lines at the beginning and the end you want to skip (see line 70ish of this program)
# Output: If no flag is specified, the abbreviations in Inform's format (use Inform 6.35 with MAX_ABBREVS = 96 and MAX_DYNAMIC_STRINGS = 0)
# If you specify the -z flag, a file "mygame_freq.zap" will be created in the correct format, ready to use with ZILF/ZILCH.
# Please tweak these constants by hand; I should at some point get around to making these command-line arguments...
#
# File name
GAMETEXT_FILE_NAME = "gametext.txt"
# Verbosity level:
# 0 = nothing is printed except the abbreviations;
# 1 = the script updates you on its progress;
# 2 = the script tells you in excruciating detail what steps it's doing (how many candidates, what the score is, if they change after re-evaluating, etc)
VERBOSITY_LEVEL = 2
# This is used to discard abbreviations that (due to length and frequency) won't save many units
# A high value will speed up this algorithm but you might run out of candidates
MIN_SCORE = 15
#
# This indicates how many abbreviations should be conmputed
# Anything bigger than 64 but smaller than 96 is possible with Inform 6.35, using "MAX_ABBREVS=96; MAX_DYNAMIC_STRINGS=0;"
NUMBER_ABBR = 96
#
# Set a minimum and maximum length (in number of ascii/unicode characters) for abbreviations
# One-char strings can be 4 units long (for instance ";"), so you could in theory save 2 units per occurence; however at the date of writing, Inform refuses to abbreviate strings made of one ASCII characters.
MIN_LEN = 2
MAX_LEN = 60
# Deal with command-line arguments
# -z = ZILF/ZILCH output
# -f = new gametext format (Inform 6.35 and up with flag -r and $TRANSCRIPT_FORMAT=1)
ZAP_OUTPUT = 0
NEW_GAMETEXT_FORMAT = 0
import sys, getopt
# TODO allow the specification of the name of the file, the lines, etc - I'm being lazy for now
argv = sys.argv[1:]
try:
opts, args = getopt.getopt(argv,"zf")
except getopt.GetoptError:
print ('Usage: python3 abbreviations.py [-z] [-f]')
sys.exit(2)
for opt, arg in opts:
if opt == '-z':
ZAP_OUTPUT = 1
if opt == '-f':
NEW_GAMETEXT_FORMAT = 1
# If you opted for the old gametext format, you might want to cut the first few lines (header and abbreviations) and the last few lines (dictionary words) for best results
FIRST_FEW_LINES = 0
LAST_FEW_LINES = 0
if (NEW_GAMETEXT_FORMAT == 0):
# With I6, skip "6+n" lines, where n is the number of abbreviations you declare
FIRST_FEW_LINES = 70
LAST_FEW_LINES = 259
## Processing starts here
# Helper function (weight of a zchar)
def zchar_weight(c):
if (ord(c) == 32):
return 1 ## space = char 0
elif (ord(c) >= 97 and ord(c) <= 122):
return 1 ## A0 alphabet
elif (ord(c) >= 65 and ord(c) <= 90):
return 2 ## A1 alphabet
elif (c in "^0123456789.,!?_#'~/\-:()"):
return 2 ## A2 alphabet
else:
return 4
f = open(GAMETEXT_FILE_NAME, "r", encoding='ISO-8859-1')
lines = f.readlines()
# filter out some stuff
if (NEW_GAMETEXT_FORMAT == 1):
lines2 = []
for i in range(0, len(lines)):
cha = lines[i][0]
# take only if not a comment, an abbreviation, a dictionary word, an attribute name, or a trace message
if (cha not in "IADSX"):
# remove the first three characters, the "tag"
lines2 = lines2 + [lines[i][3:len(lines[i])]]
lines = lines2
else:
lines = lines[FIRST_FEW_LINES:len(lines)-LAST_FEW_LINES]
# keep an updated version of the abbreviated text
wholetext = ""
for i in range(0,len(lines)):
wholetext += lines[i] + "\n"
L = len(wholetext)
wholetext_weight = 0
for i in range(0, len(wholetext)):
wholetext_weight += zchar_weight(wholetext[i])
l = []
for n in range(MIN_LEN,MAX_LEN+1):
dic = {}
# each step takes around 1 second on my computer
if (VERBOSITY_LEVEL > 0):
print(" Counting frequencies... ("+str(n)+"/"+str(MAX_LEN)+")")
for li in lines:
for i in range(0, len(li)-n):
s = li[i:i+n]
if (s in dic.keys()):
dic[s] = dic[s] + 1
else:
dic[s] = 1
# First score: naive savings
## If you want to use the same formula as inform :
## 2*((abbrev_freqs[i]-1)*abbrev_quality[i])/3 with abbrev_quality = len -2
## A better one is actually counting the units.
for p in dic.items():
i = 0
units = 0
wd = p[0]
while (i < len(wd)):
letter = wd[i]
if (ord(letter) == 32):
units += 1 ## space = char 0
elif (ord(letter) >= 97 and ord(letter) <= 122):
units += 1 ## A0 alphabet
elif (ord(letter) >= 65 and ord(letter) <= 90):
units += 2 ## A1 alphabet
elif (letter in "^0123456789.,!?_#'~/\-:()"):
units += 2 ## A2 alphabet
else:
if (letter == '@'):
## most likely an accented character like @:e : skip the next 2 letters
i+=2
units += 4
i += 1
## number of occurences (-1 since you have to write the abbr somewhere) * units saved (units/2) = total units saved
## we compute the number of units saved
## score = int ((p[1]-1)* (units-2)/3 * 2)
score = (p[1]-1)* (units-2)
## Only add things that save enough units (to speed up the search)
## Add something to say when we last updated the score
if (score > MIN_SCORE):
l += [[p[0], score, units, 0 ]]
# find the first abbreviation
abbr = []
if (VERBOSITY_LEVEL > 0):
print("Determining abbreviation "+str(len(abbr)+1)+"...")
l = sorted(l, key=lambda x: x[1])
if ZAP_OUTPUT == 1:
g = open("mygame_freq.zap", "w")
# given the abbreviation table, what is the optimal way to apply abbreviations for the text?
# this is given in R.A. Wagner , “Common phrases and minimum-space text storage”, Commun. ACM, 16 (3) (1973), pp. 148-152
# this gives us better estimates than just replacing in a text and seeing, because of overlap
def wagner_optimal_parse(text, abblist):
# dynamic programming algorithm
# f(n) = "least space needed to store the abbreviated form of the characters n...len(text)"
N = len(text)
f = [0 for i in range(0, N+1)]
# you can compute f(n) from all the f(n+k); your goal is f(1)
f[N] = 0
I=N-1
while(I >= 0):
# compute f(I) = min( f(I+len(ab))+2, f(I+1)+1) for any matching ab
val = f[I+1]+zchar_weight(text[I])
for j in range(0, len(abblist)):
myabb = abblist[j]
if (I+len(myabb) <= len(text) and text[I:I+len(myabb)] == myabb):
# the abbreviation j could be used here, but is it a good idea
val2 = f[I+len(myabb)]+2
if (val2 < val):
val = val2
f[I] = val
I = I-1
#return the minimal text size with optimal parsing
return f[0]
steps = 0
old_savings = 0
old_textsize = wholetext_weight
while (len(abbr) < NUMBER_ABBR and len(l) > 0):
steps += 1
# determine the leaders
leading_score = l[len(l)-1][1]
n=1
while( l[len(l)-1-n][1] == leading_score ):
n += 1
if (VERBOSITY_LEVEL == 2):
print("Found "+str(n)+" leaders with score "+str(leading_score))
# see if they all have updated score
flag = 1
for i in range(1, n+1):
if (l[len(l)-i][3] != len(abbr)):
l[len(l)-i][3] = len(abbr)
# compute how small the text representation would be with that abbreviation added to the set
# store the difference "without this abbreviation minus with this abbreviation", both computed with optimal parse
l[len(l)-i][1] = old_textsize-wagner_optimal_parse(wholetext, abbr+[l[len(l)-i][0]])
flag = 0
if ( flag == 0 ):
l = sorted(l, key=lambda x: x[1])
if (VERBOSITY_LEVEL == 2):
print("The leaders weren't what we thought; let's sort again...")
else:
# at this point, we have a few candidates with equal actual score
# Matthew Russotto's idea for a tiebreaker: take the high frequency
# one (meaning the one with the most weight)
max = 1
for i in range(2, n+1):
if (l[len(l)-max][2] < l[len(l)-i][2]):
max = i
# we found our abbreviation
winner = l[len(l)-max]
if (VERBOSITY_LEVEL > 0):
print("Found string : '"+str(winner[0])+"' (units saved: "+str(winner[1])+" units)")
# the abbreviated text size is decreased by the savings
old_textsize = old_textsize - winner[1]
if ZAP_OUTPUT == 1:
#g.write(" .FSTR FSTR?"+str(len(abbr)+1)+",\""+str(winner[0])+"\" ; "+str(actual_freq)+"x, saved "+str((actual_freq-1)*(winner[2]-2))+"\n")
g.write(" .FSTR FSTR?"+str(len(abbr)+1)+",\""+str(winner[0])+"\" ; "+"\n")
# the revised score is still better than the next in line's
abbr = abbr + [str(winner[0])]
if (VERBOSITY_LEVEL > 0):
print("Determining abbreviation "+str(len(abbr)+1)+"...")
# update the array
lcopy = []
for i in range(0, len(l)):
# only add to the copy the strings not containing the abbreviated string
if (str(winner[0]) not in str(l[i][0])):
lcopy += [l[i]]
l = lcopy
if (VERBOSITY_LEVEL == 2):
print ("Array updated; now has "+str(len(l))+" entries")
# no need to sort; the order of the score hasn't changed
if ZAP_OUTPUT == 1:
endoffile="WORDS::\n FSTR?1\n FSTR?2\n FSTR?3\n FSTR?4\n FSTR?5\n FSTR?6\n FSTR?7\n FSTR?8\n FSTR?9\n FSTR?10\n FSTR?11\n FSTR?12\n FSTR?13\n FSTR?14\n FSTR?15\n FSTR?16\n FSTR?17\n FSTR?18\n FSTR?19\n FSTR?20\n FSTR?21\n FSTR?22\n FSTR?23\n FSTR?24\n FSTR?25\n FSTR?26\n FSTR?27\n FSTR?28\n FSTR?29\n FSTR?30\n FSTR?31\n FSTR?32\n FSTR?33\n FSTR?34\n FSTR?35\n FSTR?36\n FSTR?37\n FSTR?38\n FSTR?39\n FSTR?40\n FSTR?41\n FSTR?42\n FSTR?43\n FSTR?44\n FSTR?45\n FSTR?46\n FSTR?47\n FSTR?48\n FSTR?49\n FSTR?50\n FSTR?51\n FSTR?52\n FSTR?53\n FSTR?54\n FSTR?55\n FSTR?56\n FSTR?57\n FSTR?58\n FSTR?59\n FSTR?60\n FSTR?61\n FSTR?62\n FSTR?63\n FSTR?64\n FSTR?65\n FSTR?66\n FSTR?67\n FSTR?68\n FSTR?69\n FSTR?70\n FSTR?71\n FSTR?72\n FSTR?73\n FSTR?74\n FSTR?75\n FSTR?76\n FSTR?77\n FSTR?78\n FSTR?79\n FSTR?80\n FSTR?81\n FSTR?82\n FSTR?83\n FSTR?84\n FSTR?85\n FSTR?86\n FSTR?87\n FSTR?88\n FSTR?89\n FSTR?90\n FSTR?91\n FSTR?92\n FSTR?93\n FSTR?94\n FSTR?95\n FSTR?96\n\n .ENDI"
g.write(endoffile)
g.close()
final_savings = wholetext_weight - old_textsize
if (VERBOSITY_LEVEL > 0):
print("Found "+str(NUMBER_ABBR)+" abbreviations in "+str(steps)+" steps; they saved "+str(final_savings)+" units (~"+str(2*int(final_savings/3))+" bytes)")
s = "Abbreviate "
for i in range(0,NUMBER_ABBR):
s = s + '"' + abbr[i] +'" '
s += ";"
print(s.encode('ISO-8859-1').decode('UTF-8'))
f.close()