03-28-2016 05:56 AM
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE beans PUBLIC "-//SPRING//DTD BEAN//EN" "http://www.springframework.org/dtd/spring-beans.dtd">
<beans>

  <!--
    Worker bean that delegates to the external script
    /opt/alfresco-community/pdf.sh.
    The ".*" entry keys are presumably regular expressions matched against
    the operating system name (TODO confirm against RuntimeExec).
  -->
  <bean id="transformer.worker.pdfimg2ocrtxt"
        class="org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerWorker">

    <property name="mimetypeService">
      <ref bean="mimetypeService"/>
    </property>

    <!-- Check command: lists the script path, so it fails when the script is missing. -->
    <property name="checkCommand">
      <bean class="org.alfresco.util.exec.RuntimeExec">
        <property name="commandsAndArguments">
          <map>
            <entry key=".*">
              <list>
                <value>ls</value>
                <value>/opt/alfresco-community/pdf.sh</value>
              </list>
            </entry>
          </map>
        </property>
      </bean>
    </property>

    <!-- Transform command: pdf.sh receives the source path and the target path, in that order. -->
    <property name="transformCommand">
      <bean class="org.alfresco.util.exec.RuntimeExec">
        <property name="commandsAndArguments">
          <map>
            <entry key=".*">
              <list>
                <value>/opt/alfresco-community/pdf.sh</value>
                <value>${source}</value>
                <value>${target}</value>
              </list>
            </entry>
          </map>
        </property>
        <!-- Presumably the exit codes that are treated as a failed run (TODO confirm). -->
        <property name="errorCodes">
          <value>1,2,3</value>
        </property>
      </bean>
    </property>
  </bean>

  <!-- Transformer bean that wraps the worker defined above. -->
  <bean id="transformer.pdfimg2ocrtxt"
        class="org.alfresco.repo.content.transform.ProxyContentTransformer"
        parent="baseContentTransformer">
    <property name="worker">
      <ref bean="transformer.worker.pdfimg2ocrtxt"/>
    </property>
  </bean>

</beans>
#!/bin/bash
# Extract plain text from a PDF.
#
# Usage: pdf.sh SOURCE_PDF TARGET_TXT
#
# Strategy:
#   1. Run pdftotext; if it yields a non-empty file, that text is the result.
#   2. Otherwise render every page to a 300 dpi JPEG with Ghostscript, OCR each
#      page with tesseract (English) and concatenate the page texts in order.
#
# Environment (optional overrides, defaults match the original hard-coded paths):
#   PDF_OCR_TMPDIR   base directory for scratch files  (default /home/yosri/tmp)
#   PDF_OCR_LOGFILE  log file that is appended to      (default /home/yosri/logfile.txt)
#
# Exit status:
#   0  text written to TARGET_TXT
#   1  bad usage, scratch directory could not be created, or TARGET_TXT not writable
#   2  Ghostscript failed to render the pages
#   3  OCR produced no output at all

if [ "$#" -lt 2 ]; then
    echo "usage: $0 SOURCE_PDF TARGET_TXT" >&2
    exit 1
fi

SOURCE=$1
TARGET=$2
# Do not reuse the name TMPDIR: it is a standard environment variable that
# mktemp and other tools consult.
WORK_BASE=${PDF_OCR_TMPDIR:-/home/yosri/tmp}
LOGFILE=${PDF_OCR_LOGFILE:-/home/yosri/logfile.txt}

# Each run gets a private scratch directory so concurrent transformations
# cannot read or clobber each other's files, and nothing is left over from a
# previous run. The trap removes it on every exit path.
mkdir -p "$WORK_BASE" || exit 1
WORKDIR=$(mktemp -d "$WORK_BASE/pdfocr.XXXXXX") || exit 1
trap 'rm -rf "$WORKDIR"' EXIT

TEMP_PDFTXT_FILE=$WORKDIR/pdftext.txt
OCR_RESULT_FILE=$WORKDIR/res.txt

echo running command "pdftotext -nopgbrk $SOURCE $TEMP_PDFTXT_FILE"
# A pdftotext failure is not fatal: the OCR path below is the fallback.
pdftotext -nopgbrk "$SOURCE" "$TEMP_PDFTXT_FILE"

# pdftotext may not have created the file at all; report size 0 in that case.
if [ -f "$TEMP_PDFTXT_FILE" ]; then
    FILESIZE=$(stat -c%s "$TEMP_PDFTXT_FILE")
else
    FILESIZE=0
fi
echo "Size of $TEMP_PDFTXT_FILE = $FILESIZE bytes." >> "$LOGFILE"

# if file exists and has a size bigger than 0 then set wordlist as result of transformation and exit.
if [ -s "$TEMP_PDFTXT_FILE" ]; then
    echo "Found wordlist from in $TEMP_PDFTXT_FILE" >> "$LOGFILE"
    cat "$TEMP_PDFTXT_FILE" > "$TARGET" || exit 1
    exit 0
fi

# splitting to individual pages (written into the scratch directory, not the
# current working directory)
gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
    -o "$WORKDIR/out_%04d.jpg" -f "$SOURCE" || exit 2

# process each page; the zero-padded page numbers make the glob expand in page order
for page in "$WORKDIR"/out_*.jpg; do
    # An unmatched glob stays literal; skip it instead of feeding it to tesseract.
    [ -e "$page" ] || continue
    page_base=${page%.*}
    # extract text (tesseract appends ".txt" to the output base name)
    if tesseract "$page" "$page_base" -l eng; then
        cat "$page_base.txt" >> "$OCR_RESULT_FILE"
    else
        # Best effort: one unreadable page should not discard the other pages.
        echo "tesseract failed for $page" >> "$LOGFILE"
    fi
    rm -f "$page_base.txt" "$page"
done

# No page produced any OCR output file: report failure instead of an empty result.
if [ ! -f "$OCR_RESULT_FILE" ]; then
    exit 3
fi

#combine all pages back to a ${TARGET}
cat "$OCR_RESULT_FILE" > "$TARGET" || exit 1
exit 0
# Settings for the transformer named "pdfimg2ocrtxt".
# Transformer-wide priority value.
content.transformer.pdfimg2ocrtxt.priority=30
# Settings for the pdf -> txt extension pair: mark it supported and give it a priority.
content.transformer.pdfimg2ocrtxt.extensions.pdf.txt.supported=true
content.transformer.pdfimg2ocrtxt.extensions.pdf.txt.priority=30
# NOTE(review): presumably a source-size limit in KB that applies when the transformation is used for indexing - confirm against the Alfresco documentation.
content.transformer.pdfimg2ocrtxt.extensions.pdf.txt.maxSourceSizeKBytes.use.index=9999
03-28-2016 09:36 AM
01-16-2017 12:07 PM
Hi,
How can I do this on Windows and
Alfresco Community (Build: 201612)
===============================
Contains:
- Alfresco Platform: 5.2.d
- Alfresco Share: 5.2.c
Can you help me?
Tags
Find what you came for
We want to make your experience in Hyland Connect as valuable as possible, so we put together some helpful links.