02-17-2009 07:50 AM
10-11-2009 05:26 PM
06-26-2012 08:40 AM
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
<beans>
<bean id="transformer.worker.ocr.tiff" class="org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerWorker">
<property name="mimetypeService">
<ref bean="mimetypeService" />
</property>
<property name="checkCommand">
<bean class="org.alfresco.util.exec.RuntimeExec">
<property name="commandsAndArguments">
<map>
<entry key=".*">
<list>
<!– <value>tesseract</value> –>
<value>/opt/alfresco-4.0.d/ocr</value>
</list>
</entry>
</map>
</property>
<property name="errorCodes">
<value>2</value>
</property>
</bean>
</property>
<property name="transformCommand">
<bean class="org.alfresco.util.exec.RuntimeExec">
<property name="commandsAndArguments">
<map>
<entry key=".*">
<list>
<!– <value>tesseract</value>
<value>${source}</value>
<value>${target}</value>
<value>-l</value>
<value>deu</value> –>
<value>/opt/alfresco-4.0.d/ocr</value>
<value>${source}</value>
<value>${target}</value>
</list>
</entry>
</map>
</property>
<property name="errorCodes">
<value>1,2</value>
</property>
</bean>
</property>
<property name="explicitTransformations">
<list>
<bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails">
<property name="sourceMimetype"><value>image/tiff</value></property>
<property name="targetMimetype"><value>text/plain</value></property>
</bean>
</list>
</property>
</bean>
<bean id="transformer.ocr.tiff" class="org.alfresco.repo.content.transform.ProxyContentTransformer" parent="baseContentTransformer">
<property name="worker">
<ref bean="transformer.worker.ocr.tiff" />
</property>
</bean>
</beans>
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
<beans>
<bean id="transformer.worker.ocr.pdf" class="org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerWorker">
<property name="mimetypeService">
<ref bean="mimetypeService" />
</property>
<property name="checkCommand">
<bean class="org.alfresco.util.exec.RuntimeExec">
<property name="commandsAndArguments">
<map>
<entry key=".*">
<list>
<!– <value>tesseract</value> –>
<value>/opt/alfresco-4.0.d/ocrPDF</value>
</list>
</entry>
</map>
</property>
<property name="errorCodes">
<value>2</value>
</property>
</bean>
</property>
<property name="transformCommand">
<bean class="org.alfresco.util.exec.RuntimeExec">
<property name="commandsAndArguments">
<map>
<entry key=".*">
<list>
<!– <value>tesseract</value>
<value>${source}</value>
<value>${target}</value>
<value>-l</value>
<value>deu</value> –>
<value>/opt/alfresco-4.0.d/ocrPDF</value>
<value>${source}</value>
<value>${target}</value>
</list>
</entry>
</map>
</property>
<property name="errorCodes">
<value>1,2</value>
</property>
</bean>
</property>
<property name="explicitTransformations">
<list>
<bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails">
<property name="sourceMimetype"><value>application/pdf</value></property>
<property name="targetMimetype"><value>text/plain</value></property>
</bean>
</list>
</property>
</bean>
<bean id="transformer.ocr.pdf" class="org.alfresco.repo.content.transform.ProxyContentTransformer" parent="baseContentTransformer">
<property name="worker">
<ref bean="transformer.worker.ocr.pdf" />
</property>
</bean>
</beans>
#!/bin/bash
# Run OCR on a multi-page PDF file and create a new pdf with the
# extracted text in hidden layer. Requires cuneiform, hocr2pdf, gs.
# Usage: ./dwim.sh input.pdf output.pdf
set -e
input="$1"
output="$2"
echo "$(date)" >>/tmp/ocrtransform.log
echo "ocrPDFfrom $input to $output" >>/tmp/ocrtransform.log
tmpdir="$(mktemp -d)"
# extract images of the pages (note: resolution hard-coded)
gs -SDEVICE=tiffg4 -r300x300 -sOutputFile="$tmpdir/page-%04d.tiff" -dNOPAUSE -dBATCH – "$input"
# OCR each page individually and convert into PDF
for page in "$tmpdir"/page-*.tiff
do
base="${page%.tiff}"
# cuneiform -f hocr -o "$base.html" "$page"
tesseract "$page" "$base" -l deu hocr
hocr2pdf -i "$page" -o "$base.pdf" < "$base.html"
echo "hocr2pdf $page to $base" >>/tmp/ocrtransform.log
done
# combine the pages into one PDF
gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile="$output" "$tmpdir"/page-*.pdf
rm -rf – "$tmpdir"
08-01-2012 10:37 AM
Tags
Find what you came for
We want to make your experience in Hyland Connect as valuable as possible, so we put together some helpful links.