-
Notifications
You must be signed in to change notification settings - Fork 6
/
marky.py
executable file
·352 lines (324 loc) · 17.1 KB
/
marky.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
#!/usr/bin/env python3
# converts books defined in a json file (created by clippy.py) into markdown and csv files
import os
import sys
import argparse
import json
import csv
import copy
import re
from datetime import datetime
from prettytable import PrettyTable
import ClippyKindle
def main():
    """
    command line entry point: reads a json collection created by clippy.py and writes
    markdown and/or csv files for each book according to the (per group) settings.
    side effects:
        creates the output folder if needed, writes one .md/.csv per book plus any
        combined per-group files, may prompt the user on stdin, and (optionally)
        writes the settings back to a json file.
    """
    # parse args:
    parser = argparse.ArgumentParser(description='Parses a json file created by clippy.py and creates markdown and csv files for each book as desired.')
    parser.add_argument('json_file', type=str, help='(string) path to json file created by clippy.py (e.g. "./collection.json")')
    parser.add_argument('out_folder', type=str, help='(string) path of folder to output markdown and csv files (e.g. "./output")')
    parser.add_argument('--settings', type=str, help='(string) path to json file containing settings for parsing books (optional). If no settings is provided then the program will offer to create one.')
    # https://docs.python.org/dev/library/argparse.html#action
    parser.add_argument('--latest-csv', action="store_true", help='Causes only the newly added items (since the last output using --update-outdate) to be outputted to csv files.')
    parser.add_argument('--update-outdate', action="store_true", help='Stores the date of the latest item outputted for each book in the settings file.')
    parser.add_argument('--omit-notes', action="store_true", help="Omits the user's typed notes for each book in markdown output.")
    # (args starting with '--' are made optional)
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)  # sys.exit (the builtin exit() is only provided by the site module)
    args = parser.parse_args()

    outPath = args.out_folder + ("" if args.out_folder.endswith("/") else "/")
    # makedirs also handles nested output folders whose parents don't exist yet
    os.makedirs(outPath, exist_ok=True)

    bookList = ClippyKindle.ClippyKindle.parseJsonFile(args.json_file)
    bookMap = {}  # map book titles to its respective Book object
    for bookObj in bookList:
        bookMap[bookObj.getName()] = {"obj": bookObj, "used": False}

    # read json settings from file:
    saveSettings = True  # whether to write settings to file (updating existing if provided)
    if args.settings is not None:
        with open(args.settings) as f:
            settings = json.load(f)
        settings = updateSettings(bookList, settings, useDefaults=False)
    else:
        # settings file not provided, so make settings here:
        print("No settings file provided, using defaults (creating both a .md and .csv file for every book)...")
        useDefaults = not answerYesNo("Or define custom settings now instead (y/n)? ")
        settings = updateSettings(bookList, settings=None, useDefaults=useDefaults)
        if answerYesNo("Save settings to file for later use (y/n)? "):
            args.settings = getAvailableFname("settings", ".json")
        else:
            saveSettings = False

    print("\nOutputting files based on selected settings...")
    for groupName in settings:
        group = settings[groupName]
        outputMD = (group["outputMD"] == True)    # whether to output md file for books in group
        outputCSV = (group["outputCSV"] == True)  # whether to output csv file for books in group

        # filenames for combined output
        # (create additional file for everything in group if provided path != "")
        combinedMD = group["combinedMD"].strip()
        combinedCSV = group["combinedCSV"].strip()

        # remove files we will be appending to
        # (built with os.path.join so it is exactly the path that gets appended to below)
        for path in [combinedMD, combinedCSV]:
            stalePath = os.path.join(args.out_folder, path)
            if path != "" and os.path.exists(stalePath):
                os.remove(stalePath)

        # TODO: add settings option for each group "separateFolder": True, (create folder for each group if needed)
        #       use os.join() to append folder path to the filenames above (not below)

        # loop over books in this group
        for bookSettings in group["books"]:
            bookName = bookSettings["name"]
            if bookName not in bookMap:
                # settings can mention books that are absent from this collection
                print("WARNING: skipping book found in settings but not in '{}': '{}'".format(args.json_file, bookName))
                continue
            chapters = bookSettings.get("chapters", [])
            bookMap[bookName]["used"] = True
            bookObj = bookMap[bookName]["obj"]    # Book object from collection
            lastDate = bookObj.getDateRange()[1]  # datetime object of latest item added to book

            fname = bookObj.getName().replace("/", "|")     # sanitize for output filename
            outPathMD = "{}{}.md".format(outPath, fname)    # output markdown filename
            outPathCSV = "{}{}.csv".format(outPath, fname)  # output csv filename

            mdStr = jsonToMarkdown(bookObj.toDict(), chapters, args.omit_notes)
            csvStr = bookObj.toCSV()
            if args.latest_csv:
                # ensure csv only contains new data since the last time it was outputted
                tmp = copy.deepcopy(bookObj)
                oldEpoch = bookSettings.get("lastOutputDate", 0)  # default 0
                oldEpoch = 0 if oldEpoch == 0 else ClippyKindle.strToDate(oldEpoch).timestamp()
                tmp.cutBefore(datetime.fromtimestamp(oldEpoch))
                csvStr = tmp.toCSV()

            # write markdown file:
            if outputMD:
                with open(outPathMD, 'w') as f:
                    f.write(mdStr)
                print("created: '{}'".format(outPathMD))
            if combinedMD != "":
                combinePath = os.path.join(args.out_folder, combinedMD)
                existed = os.path.exists(combinePath)
                with open(combinePath, 'a+') as f:  # append or create file
                    f.write(mdStr)
                if not existed:
                    print("created: '{}'".format(combinePath))  # print the first time only

            # write csv file:
            # (the csv module expects files opened with newline='' so that it
            #  controls the line endings itself; otherwise rows gain blank lines on Windows)
            if outputCSV:
                with open(outPathCSV, 'w', newline='') as f:
                    csv.writer(f).writerows(csvStr)
                print("created: '{}'".format(outPathCSV))
            if combinedCSV != "":
                combinePath = os.path.join(args.out_folder, combinedCSV)
                existed = os.path.exists(combinePath)
                with open(combinePath, 'a+', newline='') as f:  # append or create file
                    csv.writer(f).writerows(csvStr if not existed else csvStr[1:])  # remove header if file already existed
                if not existed:
                    print("created: '{}'".format(combinePath))  # print the first time only

            # update last outputted timestamp (only when some csv output was written for this book)
            if args.update_outdate and (outputCSV or combinedCSV != ""):
                bookSettings["lastOutputDate"] = ClippyKindle.dateToStr(lastDate)

    # update settings file:
    if saveSettings:
        with open(args.settings, 'w') as f:
            json.dump(settings, f, indent=2)  # write indented json to file
        print("\nSettings stored in '{}'".format(args.settings))
#########################################
def jsonToMarkdown(data, chapters=None, omitNotes=False):
    """
    creates a markdown representation of a book's highlights/notes/bookmarks
    (the provided data and chapters are only read, never modified)
    parameters:
        data (dict): dict holding data about a book (created with Book.toDict())
            keys read here: "title", "author", "items" (and "dateStart"/"dateEnd" when there are items)
        chapters (array of dicts): (optional) array storing list of book chapters
            e.g. [{"loc": 248, "title": "CHAPTER 1: The cult of the Head Start"}, ...]
            (None is treated the same as an empty list)
        omitNotes (bool): (optional) when True, items of type "note" are left out
    return:
        (str) markdown representation of provided book data
    """
    #DATE_FMT = ClippyKindle.DATE_FMT_OUT # includes time
    DATE_FMT = "%B %d, %Y"
    # choice control chars that make xelatex fail (when converting .md -> .pdf later)
    BAD_CHARS = {0x0b: '?', 0x07: ' ', 0x08: ' '}

    if chapters is None:
        chapters = []
    items = data["items"]

    titleStr = data["title"]
    if data["author"] is not None:
        titleStr += " by {}".format(data["author"])

    if len(items) == 0:
        dateInfo = "* (No notes taken for this book)"
    else:
        # the first item decides the location label used for the whole book
        locType = "loc" if items[0]["locType"] == "location" else items[0]["locType"]
        # simplify formatting of date strings
        dateStart = ClippyKindle.strToDate(data["dateStart"]).strftime(DATE_FMT)
        dateEnd = ClippyKindle.strToDate(data["dateEnd"]).strftime(DATE_FMT)
        dateInfo = "* Notes from: {} - {}".format(dateStart, dateEnd)

    # pieces of the document (joined once at the end)
    parts = ["# {}\n{}\n---\n\n".format(titleStr, dateInfo)]

    # current index into chapters (some chapters have subchapters, rightmost value is deepest nested subchapter index)
    cIndex = [0] if len(chapters) > 0 else None
    for item in items:
        # handle any chapters appearing before this item (that haven't yet been outputted)
        while cIndex is not None:
            chap = getChapterAt(cIndex, chapters)
            if chap is None:
                cIndex = None
                break
            if chap["loc"] > item["loc"]:
                break
            # print number of '#' based on current chapter level
            parts.append("#{} {}\n".format("#" * len(cIndex), chap["title"]))
            cIndex = cIndexAdvance(cIndex, chapters, verbose=True)

        if "content" in item:
            # escape all '*' as '\*' on a copy, so that the caller's item is left
            # untouched (otherwise a second call would escape the text twice)
            item = dict(item, content=item["content"].replace('*', '\\*'))

        if item["type"] == "highlight":
            parts.append("* {} -- [{} {}]\n\n".format(item["content"], locType, item["loc"]))
        if item["type"] == "note" and not omitNotes:
            # two spaces at the end of a line creates a line break after
            # https://meta.stackexchange.com/a/186647
            tmp = item["content"].replace("\n", "  \n> ")
            parts.append("> {} -- [{} {}]\n\n".format(tmp, locType, item["loc"]))
        if item["type"] == "bookmark":
            parts.append("* [Bookmark -- {} {}]\n\n".format(locType, item["loc"]))

    # print any chapters not yet reached:
    while cIndex is not None:
        chap = getChapterAt(cIndex, chapters)
        if chap is None:
            break
        parts.append("#{} {}\n".format("#" * len(cIndex), chap["title"]))
        cIndex = cIndexAdvance(cIndex, chapters, verbose=True)

    md = "".join(parts)
    # characters that can't be encoded as utf-8 are replaced (with '?')
    md = md.encode('utf-8', errors='replace').decode('utf-8', errors="replace")
    return md.translate(BAD_CHARS)
def cIndexAdvance(cIndex, chapters, verbose=False, _tryDeeper=True):
    """
    helper function for iterating through a nested list of chapter dicts
    (the provided cIndex is not modified; a new list is returned)
    params:
        cIndex: array of indices into chapters
        chapters: array of chapter dict objects (which individually may or may not have nested chapters)
        verbose: accepted for compatibility with existing callers (not used by this function)
        _tryDeeper: (internal) when False, skip the attempt to descend into the
            subchapters of the chapter at cIndex
    return (array or None): an updated cIndex that refers to the next chapter after the provided cIndex
        (in depth first descent, BUT counting parent chapters as visited on the descent down)
        returns None if we finish traversing chapters
    """
    index = list(cIndex)  # work on a copy so the caller's list stays as it was

    # try to descend one level deeper:
    if _tryDeeper and getChapterAt(index + [0], chapters) is not None:
        return index + [0]

    while True:
        # try advancing within current level:
        index[-1] += 1
        if getChapterAt(index, chapters) is not None:
            return index
        # try backing out one level (so we can advance within that level):
        if len(index) == 1:
            return None  # reached end of chapters
        index.pop()
def getChapterAt(cur_cIndex, cur_chapters):
    """
    helper function for getting a chapter object at a desired location within a nested list of chapter dicts
    params:
        cur_cIndex: array of indices into cur_chapters
        cur_chapters: array of chapter dict objects (which individually may or may not have nested chapters
            (i.e. store their own array of chapter dict objects (which are subchapters)) under the key "chapters")
    return: the chapter dict object found at provided cIndex (or None if not found, which includes
        an empty cIndex and an index past the end of the list at any level)
    """
    if not cur_cIndex:
        return None  # nothing to look up

    level = cur_chapters
    lastDepth = len(cur_cIndex) - 1
    for depth, pos in enumerate(cur_cIndex):
        if pos >= len(level):
            return None  # index runs past the end of this level
        chapter = level[pos]
        if depth == lastDepth:
            # finally returns a chapter dict object (not an array)
            return chapter
        subchapters = chapter.get("chapters") if isinstance(chapter, dict) else None
        if not isinstance(subchapters, list):
            return None  # unable to descend further as expected
        level = subchapters
def updateSettings(bookList, settings=None, useDefaults=False):
    """
    makes sure each book of the provided list is listed in some group of the settings
    (works on the provided settings, or on freshly created default settings if none are provided)
    params:
        bookList: list of ClippyKindle.Book objects that need an entry in the settings
        settings (dict): (optional) existing settings to extend. When omitted, the default
            groups ("csvOnly", "both", "mdOnly", "skip") are created and extended instead.
        useDefaults (bool): True puts every missing book in group "both" (md + csv output) without asking,
            False asks the user to pick a group for each missing book.
            (Ignored if settings != None)
    return (dict): settings to use for these books
    """
    if settings is not None:
        useDefaults = False  # (group "both" isn't guaranteed to exist in this case)
    else:
        # default settings groups:
        defaultGroups = (
            ("csvOnly", False, True),
            ("both", True, True),
            ("mdOnly", True, False),
            ("skip", False, False),
        )
        settings = {
            name: {"outputMD": wantMD, "outputCSV": wantCSV, "combinedMD": "", "combinedCSV": "", "books": []}
            for name, wantMD, wantCSV in defaultGroups
        }

    # count how often each book name appears across all groups:
    seenCount = {}
    for group in settings.values():
        for entry in group["books"]:
            seenCount[entry["name"]] = seenCount.get(entry["name"], 0) + 1

    # books that aren't in the settings yet:
    newBooks = [book for book in bookList if book.getName() not in seenCount]

    # print warning for books appearing in settings multiple times:
    for name, count in seenCount.items():
        if count > 1:
            print("NOTE: book appears {} times in settings: '{}'".format(count, name))

    total = len(newBooks)
    if total > 0 and not useDefaults:
        print("{} book(s) must have their output settings defined...".format(total))

    # place each new book under desired group (default is "both"):
    for position, bookObj in enumerate(newBooks, start=1):
        selectedGroup = "both"
        if not useDefaults:
            prompt = "\nSelect a settings group for book {} of {}: '{}'\n"\
                    .format(position, total, bookObj.getName())
            table = PrettyTable()  # http://zetcode.com/python/prettytable/
            table.field_names = ["Group #", "Group", "md file?", "csv file?", "Combined md for group?", "Combined csv for group?"]
            for number, (groupName, group) in enumerate(settings.items(), start=1):
                table.add_row([number, groupName, group["outputMD"], group["outputCSV"],
                    group["combinedMD"] != "", group["combinedCSV"] != ""])
            prompt += str(table) + "\nEnter group number ({}-{}): ".format(1, len(settings))
            groupNames = list(settings)
            selectedGroup = groupNames[answerMenu(prompt, len(settings)) - 1]
            print()
        settings[selectedGroup]["books"].append({
            "name": bookObj.getName(),
            "chapters": []
        })
    return settings
def answerMenu(prompt, numOptions):
    """
    returns the response to a prompt that expects the user to choose a number
    between 1 and numOptions (reprompts until user provides a valid response).
    params:
        prompt (string): prompt to show user before awaiting input
        numOptions (int): number of options user is being asked to choose from (must be >= 1)
    return (int): number chosen by the user (1 <= number <= numOptions)
    raises:
        ValueError: if numOptions < 1 (no answer could ever be valid, so
            prompting would otherwise repeat forever)
    """
    if numOptions < 1:
        raise ValueError("numOptions must be at least 1 (got {})".format(numOptions))
    while True:
        try:
            val = int(input(prompt))
        except ValueError:
            continue  # response wasn't a whole number, so ask again
        if 1 <= val <= numOptions:
            return val
def answerYesNo(prompt):
    """
    asks a y/n question and keeps asking until the user gives a valid response
    (responses are compared ignoring case and surrounding whitespace)
    params:
        prompt (string): prompt to show user before awaiting input
    return (bool): True if user responds "y"/"yes", False if user responds "n"/"no"
    """
    yesAnswers = ("y", "yes")
    noAnswers = ("n", "no")
    while True:
        reply = input(prompt).strip().lower()
        if reply in yesAnswers:
            return True
        if reply in noAnswers:
            return False
def getAvailableFname(prefix, ext):
    """
    returns a valid filename (to a file not already existing)
    that starts with prefix and ends with the provided extension
    (adds a number in between if needed: prefix1.ext, prefix2.ext, ...)
    parameters:
        prefix (str): file prefix (e.g. "./out")
        ext (str): file extension including '.' (e.g. ".json")
    return (str): file path to use
    """
    candidate = prefix + ext
    number = 0
    # increment number after the prefix until file doesn't already exist
    # (checked with the provided extension, whatever it is)
    while os.path.exists(candidate):
        number += 1
        candidate = "{}{}{}".format(prefix, number, ext)
    return candidate
# run the command line program only when this file is executed as a script
# (nothing is run when the file is imported as a module)
if __name__ == "__main__":
    main()