root/hodgestar/Scripts/ocr-book

Revision 363, 0.8 kB (checked in by simon, 4 years ago)

Useful scripts (book OCRing script is the first).

Line 
# scan pages
#
# Interactive acquisition of pages 001..150. For every page the operator is
# prompted, the script blocks until Enter is pressed, and one greyscale scan
# at 300 dpi is written to scan-NNN.pnm in the current directory.
for ((page_no = 1; page_no <= 150; page_no++)); do
  # Zero-pad to three digits so the file names sort in page order.
  printf -v page '%03d' "$page_no"
  printf '%s\n' "Prepare page ${page} and press Enter"
  # The typed line is discarded; this is only a pause.
  read
  scanimage \
    --device 'brother2:bus1;dev1' \
    --format=pnm \
    --mode 'True Gray' \
    --resolution 300 \
    -l 90 -t 0 -x 210 -y 200 \
    --brightness -20 \
    --contrast 15 \
    >"scan-${page}.pnm"
done

# unpaper scans
#
# Clean up every raw scan (scan-NNN.pnm) with unpaper and convert the result
# to prepared-NNN.tif. The intermediate PNM is removed only after the TIFF
# has been written successfully.
for page in $(seq --format='%03.0f' 1 150); do
  echo "preparing page ${page}"
  # Name the intermediate file once, extension included, so that unpaper
  # writes the very file that convert reads and rm deletes.
  cleaned="unpapered-${page}.pnm"
  # Chain the steps: running convert after a failed unpaper would only
  # produce a second, misleading error about a missing input file.
  if unpaper "scan-${page}.pnm" "${cleaned}" \
    && convert "${cleaned}" "prepared-${page}.tif"; then
    rm -- "${cleaned}"
  else
    # Keep going with the remaining pages, but make the failure visible.
    printf 'warning: could not prepare page %s\n' "${page}" >&2
  fi
done

# tesseract
#
# Run OCR on every prepared page with the English language data. The
# recognised text for page NNN ends up under the output base tesseract-NNN.
# (tesseract was picked because GOCR and Ocrad gave poor results.)
page_no=1
while [ "${page_no}" -le 150 ]; do
  # Zero-pad to three digits to match the prepared-NNN.tif file names.
  printf -v page '%03d' "${page_no}"
  printf '%s\n' "doing OCR on page ${page}"
  tesseract "prepared-${page}.tif" "tesseract-${page}" -l eng
  page_no=$((page_no + 1))
done

# remove page numbers
#
# Copy each OCR result to text-NNN.txt without its first line, which is
# where the page number sits.
for page in $(seq -w 1 150); do
  ocr_text="tesseract-${page}.txt"
  # "tail -n +2" prints from the second line onwards.
  tail -n +2 "${ocr_text}" >"text-${page}.txt"
done

# newlines to spaces, and only one space character at a time
#
# collapse_whitespace: filter from stdin to stdout.
#   1. Every newline becomes a space, so the text is joined into one line.
#   2. Every run of two or more blanks (spaces or tabs) becomes one space.
# The two stages must be piped together: run as separate commands, sed would
# never see tr's output. sed needs -E for the {2,} interval to be a repetition
# count rather than literal braces, and the g flag because after step 1 the
# whole text is a single line containing many runs.
collapse_whitespace() {
  tr '\n' ' ' | sed -E 's/[[:blank:]]{2,}/ /g'
}

# As before, the text to normalise is taken from standard input.
collapse_whitespace

Note: See TracBrowser for help on using the browser.