sudo apt-get install tesseract-ocr tesseract-ocr-ita tesseract-ocr-data tesseract-ocr-dev



#!/bin/bash
#
#
##############################################################################
#
# xsane2tess 1.0
#
# *** tesseract made simple ***
#
#
##############################################################################
#
# xsane2tess is a TesseractOCR wrapper to be able to use tesseract with xsane
#
# Flow: parse -i/-o/-l, convert the input image to an uncompressed TIFF with
# ImageMagick's convert, run tesseract on that TIFF, then write the recognised
# text to the output file. Output of both tools is sent to the log file.
#
TEMP_DIR=~/tmp/ # folder for temporary files (TIFF & tesseract data); this script never creates it
ERRORLOG="xsane2tess.log" # file where STDERR goes (name is appended to TEMP_DIR)
# No first argument (or an empty one): print the usage text and stop.
if [[ -z "$1" ]]
then
echo "Usage: $0 [OPTIONS]
xsane2tess converts files to TIF, scans them with TesseractOCR
and outputs the text in a file.
OPTIONS:
-i <file1> define input file (any image-format supported)
-o <file2> define output-file (*.txt)
-l <lang> define language-data tesseract should use
Progress- & error-messages will be stored in this logfile:
$TEMP_DIR$ERRORLOG
xsane2tess depends on
- ImageMagick http://www.imagemagick.org/
- TesseractOCR http://code.google.com/p/tesseract-ocr/
Some coding was stolen from 'ocube'
http://www.geocities.com/thierryguy/ocube.html
"
# bare exit: the status is that of the echo above
exit
fi
# get options...
# The leading ':' in the optstring puts getopts in silent mode; unknown options
# and missing option arguments match no arm of the case below and are ignored.
while getopts ":i:o:l:" OPTION
do
case $OPTION in
i) # input filename (with path)
FILE_PATH="$OPTARG"
;;
o ) # output filename
FILE_OUT="$OPTARG"
;;
l ) # Language-selection
TES_LANG="$OPTARG"
;;
esac
done
# redirect STDOUT to FILE_OUT (append mode)
# NOTE(review): $FILE_OUT is unquoted, so an empty value or a path containing
# whitespace is not a usable redirection target.
exec 1>>$FILE_OUT
# redirect STDERR to ERRORLOG (append mode, inside TEMP_DIR)
exec 2>>$TEMP_DIR$ERRORLOG
# strip path from FILE_PATH, use filename only
IN_FILE=${FILE_PATH##*/}
# temporary names: input name minus its last ".suffix", placed in TEMP_DIR
TIF_FILE="$TEMP_DIR""${IN_FILE%.*}".tif
TXT_FILE="$TEMP_DIR""${IN_FILE%.*}"
# converting image into TIFF (ImageMagick); 1>&2 sends convert's STDOUT to the log
convert "$FILE_PATH" -compress none "$TIF_FILE" 1>&2
# start OCR (tesseract expands output with *.txt); its STDOUT goes to the log too
tesseract "$TIF_FILE" "$TXT_FILE" -l "$TES_LANG" 1>&2
# STDOUT scanned text => FILE_OUT
cat "$TXT_FILE".txt
# delete graphic file after use
rm "$TIF_FILE"
# delete tesseract output
rm "$TXT_FILE".txt
chmod a+x xsane2tess
sudo cp xsane2tess /usr/bin/
mkdir tmp
Guiodic ha scritto:1. copia e incolla quanto segue su gedit:
- Codice: Seleziona tutto
#!/bin/bash
#
#
##############################################################################
#
# xsane2tess 1.0
#
# *** tesseract made simple ***
#
#
##############################################################################
#
# xsane2tess is a TesseractOCR wrapper to be able to use tesseract with xsane
#
# Flow: parse -i/-o/-l, convert the input image to an uncompressed TIFF with
# ImageMagick's convert, run tesseract on that TIFF, then append the
# recognised text to the output file. Output of both tools is appended to the
# log file inside TEMP_DIR.
#
# Exit status: 0 on success (and when the usage text is shown), 2 on bad
# arguments, 1 when a processing step fails.
#
TEMP_DIR=~/tmp/ # folder for temporary files (TIFF & tesseract data)
ERRORLOG="xsane2tess.log" # file where STDERR goes

# Print a message on STDERR and leave with the given exit status.
#   $1 - exit status, remaining arguments - message
die() {
  local status=$1
  shift
  printf 'xsane2tess: %s\n' "$*" >&2
  exit "$status"
}

# No first argument (or an empty one): print the usage text and stop.
if [[ -z "$1" ]]; then
  echo "Usage: $0 [OPTIONS]
xsane2tess converts files to TIF, scans them with TesseractOCR
and outputs the text in a file.
OPTIONS:
-i <file1> define input file (any image-format supported)
-o <file2> define output-file (*.txt)
-l <lang> define language-data tesseract should use
Progress- & error-messages will be stored in this logfile:
$TEMP_DIR$ERRORLOG
xsane2tess depends on
- ImageMagick http://www.imagemagick.org/
- TesseractOCR http://code.google.com/p/tesseract-ocr/
Some coding was stolen from 'ocube'
http://www.geocities.com/thierryguy/ocube.html
"
  exit
fi

# get options...
FILE_PATH=""
FILE_OUT=""
TES_LANG=""
while getopts ":i:o:l:" OPTION; do
  case "$OPTION" in
    i) # input filename (with path)
      FILE_PATH="$OPTARG"
      ;;
    o) # output filename
      FILE_OUT="$OPTARG"
      ;;
    l) # Language-selection
      TES_LANG="$OPTARG"
      ;;
    :) # a known option was given without its argument
      die 2 "option -$OPTARG requires an argument"
      ;;
    *) # unknown options stay ignored, as before, so that extra flags handed
      # over by the calling program do not abort the run
      ;;
  esac
done

# Without these two the redirections and the conversion below cannot work.
[[ -n "$FILE_PATH" ]] || die 2 "no input file given (-i <file>)"
[[ -n "$FILE_OUT" ]] || die 2 "no output file given (-o <file>)"

# The log and the temporary files live here; create it on first use.
mkdir -p -- "$TEMP_DIR" || die 1 "cannot create $TEMP_DIR"

# redirect STDOUT to FILE_OUT (append mode); quoted so that paths containing
# blanks are accepted
exec 1>>"$FILE_OUT" || die 1 "cannot write to $FILE_OUT"
# redirect STDERR to ERRORLOG (append mode)
exec 2>>"${TEMP_DIR%/}/$ERRORLOG" || exit 1

# Private working directory: parallel runs or inputs sharing a base name can
# no longer overwrite each other's temporary files. Removed on every exit path.
WORK_DIR=$(mktemp -d "${TEMP_DIR%/}/xsane2tess.XXXXXX") \
  || die 1 "cannot create a working directory in $TEMP_DIR"
trap 'rm -rf -- "${WORK_DIR:?}"' EXIT

TIF_FILE="$WORK_DIR/page.tif"
TXT_FILE="$WORK_DIR/page" # tesseract appends .txt itself

# converting image into TIFF (ImageMagick); its STDOUT goes to the log
convert "$FILE_PATH" -compress none "$TIF_FILE" 1>&2 \
  || die 1 "convert failed for $FILE_PATH"

# start OCR (tesseract expands output with *.txt); -l is only passed when a
# language was requested, otherwise tesseract uses its own default
TES_ARGS=("$TIF_FILE" "$TXT_FILE")
if [[ -n "$TES_LANG" ]]; then
  TES_ARGS+=(-l "$TES_LANG")
fi
tesseract "${TES_ARGS[@]}" 1>&2 || die 1 "tesseract failed for $FILE_PATH"

# STDOUT scanned text => FILE_OUT
cat -- "$TXT_FILE".txt || die 1 "tesseract produced no text file"
2. salva il file nella tua home e chiamalo xsane2tess
3. da terminale dai:
- Codice: Seleziona tutto
chmod a+x xsane2tess
sudo cp xsane2tess /usr/bin/
mkdir tmp
vip@vip:~$ gedit
#!/bin/bash
#
#
##############################################################################
#
# xsane2tess 1.0
#
# *** tesseract made simple ***
#
#
##############################################################################
#
# xsane2tess is a TesseractOCR wrapper to be able to use tesseract with xsane
#
# Flow: parse -i/-o/-l, convert the input image to an uncompressed TIFF with
# ImageMagick's convert, run tesseract on that TIFF, then write the recognised
# text to the output file. Output of both tools is sent to the log file.
#
TEMP_DIR=~/tmp/ # folder for temporary files (TIFF & tesseract data); this script never creates it
ERRORLOG="xsane2tess.log" # file where STDERR goes (name is appended to TEMP_DIR)
# No first argument (or an empty one): print the usage text and stop.
if [[ -z "$1" ]]
then
echo "Usage: $0 [OPTIONS]
xsane2tess converts files to TIF, scans them with TesseractOCR
and outputs the text in a file.
OPTIONS:
-i <file1> define input file (any image-format supported)
-o <file2> define output-file (*.txt)
-l <lang> define language-data tesseract should use
Progress- & error-messages will be stored in this logfile:
$TEMP_DIR$ERRORLOG
xsane2tess depends on
- ImageMagick http://www.imagemagick.org/
- TesseractOCR http://code.google.com/p/tesseract-ocr/
Some coding was stolen from 'ocube'
http://www.geocities.com/thierryguy/ocube.html
"
# bare exit: the status is that of the echo above
exit
fi
# get options...
# The leading ':' in the optstring puts getopts in silent mode; unknown options
# and missing option arguments match no arm of the case below and are ignored.
while getopts ":i:o:l:" OPTION
do
case $OPTION in
i) # input filename (with path)
FILE_PATH="$OPTARG"
;;
o ) # output filename
FILE_OUT="$OPTARG"
;;
l ) # Language-selection
TES_LANG="$OPTARG"
;;
esac
done
# redirect STDOUT to FILE_OUT (append mode)
# NOTE(review): $FILE_OUT is unquoted, so an empty value or a path containing
# whitespace is not a usable redirection target.
exec 1>>$FILE_OUT
# redirect STDERR to ERRORLOG (append mode, inside TEMP_DIR)
exec 2>>$TEMP_DIR$ERRORLOG
# strip path from FILE_PATH, use filename only
IN_FILE=${FILE_PATH##*/}
# temporary names: input name minus its last ".suffix", placed in TEMP_DIR
TIF_FILE="$TEMP_DIR""${IN_FILE%.*}".tif
TXT_FILE="$TEMP_DIR""${IN_FILE%.*}"
# converting image into TIFF (ImageMagick); 1>&2 sends convert's STDOUT to the log
convert "$FILE_PATH" -compress none "$TIF_FILE" 1>&2
# start OCR (tesseract expands output with *.txt); its STDOUT goes to the log too
tesseract "$TIF_FILE" "$TXT_FILE" -l "$TES_LANG" 1>&2
# STDOUT scanned text => FILE_OUT
cat "$TXT_FILE".txt
# delete graphic file after use
rm "$TIF_FILE"
# delete tesseract output
rm "$TXT_FILE".txt
vip@vip:~$ chmox a+x xsane2tess
bash: chmox: comando non trovato
vip@vip:~$ sudo chmox a+x xsane2tess
sudo: chmox: command not found
vip@vip:~$ sudo cp xsane2tess /usr/bin/
vip@vip:~$ mkdir tmp
neolinux ha scritto:....
Apro xsane, scansione binaria, file salva OCR come testo e picche perché manca un programma ocr gocr, confesso che ho dato ordine prima della scansione che nome mettere al file e l'ho messo con estensione .tif (con una f sola proprio come piace a tesseract in linea)

neolinux ha scritto:neolinux ha scritto:....
Apro xsane, scansione binaria, file salva OCR come testo e picche perché manca un programma ocr gocr, confesso che ho dato ordine prima della scansione che nome mettere al file e l'ho messo con estensione .tif (con una f sola proprio come piace a tesseract in linea)
Ma anche spegnendo e riaccendendo il PC non succede niente

neolinux ha scritto:neolinux ha scritto:neolinux ha scritto:....
Apro xsane, scansione binaria, file salva OCR come testo e picche perché manca un programma ocr gocr, confesso che ho dato ordine prima della scansione che nome mettere al file e l'ho messo con estensione .tif (con una f sola proprio come piace a tesseract in linea)
Ma anche spegnendo e riaccendendo il PC non succede niente
Ma in xsane andando su preferenze/ocr/si sostituisce gocr con l'ocr che si vuole (es.xsane2tess), adesso mi crea dei file txt vuoti, sarà un problema di risoluzione.

ORayan ha scritto:Ciao Neolinux,
Ti sei scaricato anche imagemagick?
Prova a leggere la guida che ho segnalato qualche post indietro, cioè questa:
http://www.mepisitalia.org/modules/newb ... 0&start=10


ORayan ha scritto:http://www.mepisitalia.org/modules/newb ... 0&start=10


sudo rm /usr/bin/xsane2tess
chmod a+rx xsane2tess
sudo cp xsane2tess /usr/bin
mkdir /home/NOMEUTENTE/tmp
Controllare e contare che gli spazi siano esatti con i pulsanti a freccette della tastiera.
Settate il campo Comando OCR cancellando gocr e scrivendo
- Codice: Seleziona tutto
xsane2tess -l ita
nel campo Opzione file di ingresso lasciate
- Codice: Seleziona tutto
-i
nel campo Opzione file di uscita
- Codice: Seleziona tutto
-o
anche il campo Opzione della GUI non viene interessato, quindi rimarrà come da default:
- Codice: Seleziona tutto
-x


