|
Revision 363, 0.8 kB
(checked in by simon, 4 years ago)
|
|
Useful scripts (the book OCR script comes first).
|
| Line | |
|---|
| 1 | # scan pages |
|---|
| 2 | for i in $(seq --format=%003.f 1 150); do |
|---|
| 3 | echo Prepare page $i and press Enter |
|---|
| 4 | read |
|---|
| 5 | scanimage --device 'brother2:bus1;dev1' --format=pnm --mode 'True Gray' --resolution 300 -l 90 -t 0 -x 210 -y 200 --brightness -20 --contrast 15 >scan-$i.pnm |
|---|
| 6 | done |
|---|
| 7 | |
|---|
| 8 | # unpaper scans |
|---|
| 9 | for i in $(seq --format=%003.f 1 150); do |
|---|
| 10 | echo preparing page $i |
|---|
| 11 | unpaper scan-$i.pnm unpapered-$i |
|---|
| 12 | convert unpapered-$i.pnm prepared-$i.tif && rm unpapered-$i.pnm |
|---|
| 13 | done |
|---|
| 14 | |
|---|
| 15 | # tesseract |
|---|
| 16 | # (GOCR and Ocrad are crap) |
|---|
| 17 | for i in $(seq --format=%003.f 1 150); do |
|---|
| 18 | echo doing OCR on page $i |
|---|
| 19 | tesseract prepared-$i.tif tesseract-$i -l eng |
|---|
| 20 | done |
|---|
| 21 | |
|---|
| 22 | # remove page numbers |
|---|
| 23 | for i in $(seq --format=%003.f 1 150); do |
|---|
| 24 | tail -n +2 tesseract-$i.txt >text-$i.txt |
|---|
| 25 | done |
|---|
| 26 | |
|---|
| 27 | # newlines to spaces |
|---|
| 28 | tr '\n' ' ' |
|---|
| 29 | # only one space character at a time |
|---|
| 30 | sed 's/[[:blank:]]\{2,\}/ /g' |
|---|
| 31 | |
|---|