-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path080-ocr.sh
More file actions
executable file
·61 lines (47 loc) · 1.55 KB
/
Copy path080-ocr.sh
File metadata and controls
executable file
·61 lines (47 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env bash
# OCR stage of the scan pipeline: the loop further down runs tesseract on the
# page images found in $src and writes its output into a directory named
# after this script.

set -eu

# all relative paths below are resolved from the directory of this script
cd "$(dirname "$0")"

src=070-deskew
# the output directory is the script name without the .sh extension
dst=$(basename "$0" .sh)
# quoted, so that a script name containing whitespace yields one directory
mkdir -p "$dst"

# manual toggle: "true" reads the scan parameters from the file below,
# "false" uses the hard-coded values of the else branch.
# NOTE(review): the file is presumably expected to set scan_resolution and
# scan_format, like the else branch does -- confirm against the earlier stage
if true; then
  source 030-measure-page-size.txt
else
  # 030-measure-page-size.txt
  scan_resolution=300
  scan_format=tiff
fi

ocr_lang=deu+eng
img_format=jpg # must be compatible with the hocr-editor
img_quality=20%

# split "deu+eng" at the plus signs and pass every language code as its own
# argument, without relying on unquoted word splitting (which would also glob)
IFS=+ read -r -a ocr_lang_list <<<"$ocr_lang"
./tessdata_best.sh "${ocr_lang_list[@]}"

# the page image path is relative to the workdir
# <div class='ocr_page' id='page_1' title='image "../070-deskew/005.tiff"; ...'>
# patch paths:
# sed -i -E "s|(<div class='ocr_page' id='page_[0-9]+' title='image \")[^/]+/([0-9]+\.tiff\";)|\1../070-deskew/\2|" 080-ocr/*.hocr

# run the rest of the script inside the output directory, so that the image
# paths recorded by tesseract have the ../$src/ form shown above
cd "$dst"
# Run tesseract on every page image of the previous stage and store the
# result as <page>.hocr in the current directory. Pages that already have a
# .hocr file are skipped, so an interrupted run can be resumed.
# Reads $src, $scan_format, $scan_resolution and $ocr_lang, which are set
# earlier in this script.
for inp in ../"$src"/*."$scan_format"; do
  # an unmatched glob stays a literal pattern: skip it instead of passing a
  # non-existent file to tesseract
  [[ -e "$inp" ]] || continue

  # FIXME use $num_pages
  # file name of the page without directory and extension, for example "005".
  # The extension is taken from $scan_format, the same variable the glob uses,
  # so formats other than tiff produce "005.hocr" and not "005.png.hocr"
  out=${inp##*/}
  out=${out%."$scan_format"}

  # page number without up to two leading zeros, for example "005" -> "5".
  # Only needed for the commented-out debug break at the end of the loop
  page_number=${out#0}
  page_number=${page_number#0}

  # out=$dst/$out
  out1=$out.hocr
  if ! [[ -e "$out1" ]]; then
    # TODO? use OCRopus https://github.com/ocropus-archive/DUP-ocropy
    # one argument list for both the log line and the real call, so the two
    # cannot drift apart
    tesseract_args=(
      "$inp" -
      -c tessedit_create_hocr=1
      --dpi "$scan_resolution"
      -l "$ocr_lang"
      --oem 1
      --psm 6
      --tessdata-dir ../tessdata_best
    )
    echo + tesseract "${tesseract_args[@]}"

    # The redirection creates its target before tesseract has produced
    # anything. Write to a temporary name and rename on success, otherwise a
    # failed run leaves an empty .hocr file that the check above would treat
    # as finished on every later run
    tmp=$out1.tmp
    rc=0
    tesseract "${tesseract_args[@]}" >"$tmp" || rc=$?
    if [[ "$rc" -ne 0 ]]; then
      rm -f -- "$tmp"
      exit "$rc"
    fi
    mv -- "$tmp" "$out1"
  fi

  # disabled: conversion of the page image to $img_format with $img_quality
  if false; then
    out2=$out."$img_format"
    if ! [[ -e "$out2" ]]; then
      echo + magick "$inp" -quality "$img_quality" "$out2"
      magick "$inp" -quality "$img_quality" "$out2"
    fi
  fi

  # [ "$page_number" = 10 ] && break # debug
done