-
Notifications
You must be signed in to change notification settings - Fork 79
/
hocr-split
executable file
·34 lines (26 loc) · 1004 Bytes
/
hocr-split
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env python
# split an hOCR file into individual pages
import argparse
import re
from lxml import etree, html
################################################################
# main program
################################################################
parser = argparse.ArgumentParser(
description="split a multipage hOCR file into single pages")
parser.add_argument("file", help="hOCR file", type=argparse.FileType('r'))
parser.add_argument("pattern", help="naming pattern, e.g. 'base-%%03d.html'")
args = parser.parse_args()
assert re.search('%[0-9]*d', args.pattern)
doc = etree.parse(args.file, html.XHTMLParser())
pages = doc.xpath("//*[@class='ocr_page']")
assert pages != []
container = pages[0].getparent()
index = 1
for new_page in pages:
container_pages = container.xpath("//*[@class='ocr_page']")
for page in container_pages:
container.remove(page)
container.append(new_page)
doc.write((args.pattern % index), pretty_print=True)
index += 1