06-02-2019 11:29 AM
Hi,
I have successfully configured OCR with my Alfresco (Windows installation). However, it only works for PNG, TIFF, JPG and JPEG files. I also need it for PDF files, because most of our scanned files are in PDF format.
My tesseract-ocr-transform-context.xml is,
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
<beans default-lazy-init="false" default-autowire="no" default-dependency-check="none">
<!-- Worker that runs the external ocr.bat script to convert the listed image types to plain text. -->
<bean id="transformer.worker.ocr.tiff"
      class="org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerWorker"
      lazy-init="default"
      autowire="default"
      dependency-check="default">
  <property name="mimetypeService" ref="mimetypeService"/>

  <!-- Check command: runs "dir" on the batch file; errorCodes lists exit code 1. -->
  <!-- Only a Windows.* entry is defined for this command. -->
  <property name="checkCommand">
    <bean class="org.alfresco.util.exec.RuntimeExec"
          lazy-init="default"
          autowire="default"
          dependency-check="default">
      <property name="commandsAndArguments">
        <map>
          <entry key="Windows.*">
            <list>
              <value>C:\Windows\System32\cmd.exe</value>
              <value>/C</value>
              <value>dir C:\alfresco-community\ocr.bat</value>
            </list>
          </entry>
        </map>
      </property>
      <property name="errorCodes" value="1"/>
    </bean>
  </property>

  <!-- Transform command: cmd.exe /C ocr.bat "${source}" "${target}"; errorCodes lists 1 and 2. -->
  <property name="transformCommand">
    <bean class="org.alfresco.util.exec.RuntimeExec"
          lazy-init="default"
          autowire="default"
          dependency-check="default">
      <property name="commandsAndArguments">
        <map>
          <entry key="Windows.*">
            <list>
              <value>C:\Windows\System32\cmd.exe</value>
              <value>/C</value>
              <value>C:\alfresco-community\ocr.bat</value>
              <value>"${source}"</value>
              <value>"${target}"</value>
            </list>
          </entry>
        </map>
      </property>
      <property name="errorCodes" value="1,2"/>
    </bean>
  </property>

  <!-- Source/target MIME type pairs handled by this worker; every target is text/plain. -->
  <property name="explicitTransformations">
    <list>
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/tiff"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/png"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/jpeg"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <!-- NOTE(review): image/jpg is not an IANA-registered type (JPEG is image/jpeg); confirm this entry is needed. -->
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/jpg"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
    </list>
  </property>
</bean>
<!-- Content transformer that delegates to the OCR worker bean; its parent definition is baseContentTransformer. -->
<bean id="transformer.ocr.tiff"
      class="org.alfresco.repo.content.transform.ProxyContentTransformer"
      parent="baseContentTransformer"
      lazy-init="default"
      autowire="default"
      dependency-check="default">
  <property name="worker" ref="transformer.worker.ocr.tiff"/>
</bean>
</beans>
I changed it to the following, adding one more bean for PDF (it was highlighted in red in the original post):
<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE beans PUBLIC '-//SPRING//DTD BEAN//EN' 'http://www.springframework.org/dtd/spring-beans.dtd'>
<beans default-lazy-init="false" default-autowire="no" default-dependency-check="none">
<!-- Worker that runs the external ocr.bat script to convert the listed source types to plain text. -->
<bean id="transformer.worker.ocr.tiff"
      class="org.alfresco.repo.content.transform.RuntimeExecutableContentTransformerWorker"
      lazy-init="default"
      autowire="default"
      dependency-check="default">
  <property name="mimetypeService" ref="mimetypeService"/>

  <!-- Check command: runs "dir" on the batch file; errorCodes lists exit code 1. -->
  <!-- Only a Windows.* entry is defined for this command. -->
  <property name="checkCommand">
    <bean class="org.alfresco.util.exec.RuntimeExec"
          lazy-init="default"
          autowire="default"
          dependency-check="default">
      <property name="commandsAndArguments">
        <map>
          <entry key="Windows.*">
            <list>
              <value>C:\Windows\System32\cmd.exe</value>
              <value>/C</value>
              <value>dir C:\alfresco-community\ocr.bat</value>
            </list>
          </entry>
        </map>
      </property>
      <property name="errorCodes" value="1"/>
    </bean>
  </property>

  <!-- Transform command: cmd.exe /C ocr.bat "${source}" "${target}"; errorCodes lists 1 and 2. -->
  <!-- NOTE(review): ocr.bat is not shown here; confirm it can process a PDF source, -->
  <!-- for example by rasterising the pages before handing them to the OCR engine. -->
  <property name="transformCommand">
    <bean class="org.alfresco.util.exec.RuntimeExec"
          lazy-init="default"
          autowire="default"
          dependency-check="default">
      <property name="commandsAndArguments">
        <map>
          <entry key="Windows.*">
            <list>
              <value>C:\Windows\System32\cmd.exe</value>
              <value>/C</value>
              <value>C:\alfresco-community\ocr.bat</value>
              <value>"${source}"</value>
              <value>"${target}"</value>
            </list>
          </entry>
        </map>
      </property>
      <property name="errorCodes" value="1,2"/>
    </bean>
  </property>

  <!-- Source/target MIME type pairs handled by this worker; every target is text/plain. -->
  <property name="explicitTransformations">
    <list>
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/tiff"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/png"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <!-- PDF entry. The MIME type for PDF is application/pdf; this entry originally said image/pdf. -->
      <!-- Presumably the unrecognised image/pdf resolved to the same fallback "bin" extension as -->
      <!-- image/jpg below, which would explain the startup failure "extensions.bin.txt.priority -->
      <!-- has been specified more than once". TODO confirm against the Alfresco version in use. -->
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="application/pdf"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/jpeg"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
      <!-- NOTE(review): image/jpg is not an IANA-registered type (JPEG is image/jpeg); confirm this entry is needed. -->
      <bean class="org.alfresco.repo.content.transform.ExplictTransformationDetails"
            lazy-init="default"
            autowire="default"
            dependency-check="default">
        <property name="sourceMimetype" value="image/jpg"/>
        <property name="targetMimetype" value="text/plain"/>
      </bean>
    </list>
  </property>
</bean>
<!-- Content transformer that delegates to the OCR worker bean; its parent definition is baseContentTransformer. -->
<bean id="transformer.ocr.tiff"
      class="org.alfresco.repo.content.transform.ProxyContentTransformer"
      parent="baseContentTransformer"
      lazy-init="default"
      autowire="default"
      dependency-check="default">
  <property name="worker" ref="transformer.worker.ocr.tiff"/>
</bean>
</beans>
But it gives me the error below.
org.springframework.beans.factory.BeanCreationException: Error creating bean with name 'transformer.ocr.tiff' defined in file [C:\alfresco-community\tomcat\shared\classes\alfresco\extension\tesseract-ocr-transform-context.xml]: Invocation of init method failed; nested exception is java.lang.IllegalArgumentException: content.transformer.ocr.tiff.extensions.bin.txt.priority has been specified more than once
2019-06-02 20:00:46,019 INFO [org.alfresco.repo.management.subsystems.ChildApplicationContextFactory] [localhost-startStop-1] Stopping 'Transformers' subsystem, ID: [Transformers, default]
2019-06-02 20:00:46,020 INFO [org.alfresco.repo.management.subsystems.ChildApplicationContextFactory] [localhost-startStop-1] Stopped 'Transformers' subsystem, ID: [Transformers, default]
2019-06-02 20:00:46,021 INFO [org.alfresco.repo.management.subsystems.ChildApplicationContextFactory] [localhost-startStop-1] Stopping 'Authentication' subsystem, ID: [Authentication, managed, alfrescoNtlm1]
2019-06-02 20:00:46,021 INFO [org.alfresco.repo.management.subsystems.ChildApplicationContextFactory] [localhost-startStop-1] Stopped 'Authentication' subsystem, ID: [Authentication, managed, alfrescoNtlm1]
2019-06-02 20:00:46,030 ERROR [org.springframework.web.context.ContextLoader] [localhost-startStop-1] Context initialization failed
org.springframework.beans.factory.BeanCreationException: Error creating bean with name 'transformer.ocr.tiff' defined in file [C:\alfresco-community\tomcat\shared\classes\alfresco\extension\tesseract-ocr-transform-context.xml]: Invocation of init method failed; nested exception is java.lang.IllegalArgumentException: content.transformer.ocr.tiff.extensions.bin.txt.priority has been specified more than once
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.initializeBean(AbstractAutowireCapableBeanFactory.java:1514)
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.doCreateBean(AbstractAutowireCapableBeanFactory.java:521)
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.createBean(AbstractAutowireCapableBeanFactory.java:458)
at org.springframework.beans.factory.support.AbstractBeanFactory$1.getObject(AbstractBeanFactory.java:293)
at org.springframework.beans.factory.support.DefaultSingletonBeanRegistry.getSingleton(DefaultSingletonBeanRegistry.java:223)
at org.springframework.beans.factory.support.AbstractBeanFactory.doGetBean(AbstractBeanFactory.java:290)
at org.springframework.beans.factory.support.AbstractBeanFactory.getBean(AbstractBeanFactory.java:191)
at org.springframework.beans.factory.support.DefaultListableBeanFactory.preInstantiateSingletons(DefaultListableBeanFactory.java:636)
at org.springframework.context.support.AbstractApplicationContext.finishBeanFactoryInitialization(AbstractApplicationContext.java:938)
at org.springframework.context.support.AbstractApplicationContext.refresh(AbstractApplicationContext.java:479)
at org.springframework.web.context.ContextLoader.configureAndRefreshWebApplicationContext(ContextLoader.java:410)
at org.springframework.web.context.ContextLoader.initWebApplicationContext(ContextLoader.java:306)
at org.springframework.web.context.ContextLoaderListener.contextInitialized(ContextLoaderListener.java:112)
at org.alfresco.web.app.ContextLoaderListener.contextInitialized(ContextLoaderListener.java:70)
at org.apache.catalina.core.StandardContext.listenerStart(StandardContext.java:5118)
at org.apache.catalina.core.StandardContext.startInternal(StandardContext.java:5634)
at org.apache.catalina.util.LifecycleBase.start(LifecycleBase.java:145)
at org.apache.catalina.core.ContainerBase.addChildInternal(ContainerBase.java:899)
at org.apache.catalina.core.ContainerBase.addChild(ContainerBase.java:875)
at org.apache.catalina.core.StandardHost.addChild(StandardHost.java:652)
at org.apache.catalina.startup.HostConfig.deployDescriptor(HostConfig.java:679)
at org.apache.catalina.startup.HostConfig$DeployDescriptor.run(HostConfig.java:1966)
at java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.IllegalArgumentException: content.transformer.ocr.tiff.extensions.bin.txt.priority has been specified more than once
at org.alfresco.repo.content.transform.TransformerPropertySetter.setProperties(TransformerPropertySetter.java:119)
at org.alfresco.repo.content.transform.TransformerConfigImpl.setProperties(TransformerConfigImpl.java:239)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.alfresco.repo.management.subsystems.SubsystemProxyFactory$1.invoke(SubsystemProxyFactory.java:79)
at org.springframework.aop.framework.ReflectiveMethodInvocation.proceed(ReflectiveMethodInvocation.java:172)
at org.springframework.aop.framework.JdkDynamicAopProxy.invoke(JdkDynamicAopProxy.java:204)
at com.sun.proxy.$Proxy25.setProperties(Unknown Source)
at org.alfresco.repo.content.transform.ContentTransformerHelper.logDeprecatedSetter(ContentTransformerHelper.java:251)
at org.alfresco.repo.content.transform.ProxyContentTransformer.register(ProxyContentTransformer.java:74)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.invokeCustomInitMethod(AbstractAutowireCapableBeanFactory.java:1640)
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.invokeInitMethods(AbstractAutowireCapableBeanFactory.java:1581)
at org.springframework.beans.factory.support.AbstractAutowireCapableBeanFactory.initializeBean(AbstractAutowireCapableBeanFactory.java:1511)
... 26 more
Did I miss anything, or is there another way to do this?
please help me.
Sorry for my English.
Thank you.
06-04-2019 12:44 AM
04-24-2025 03:44 AM
Hi,
I am currently working on integrating OCR functionality into Alfresco 7.2, running on a Windows Server. I have successfully installed the following dependencies:
Tesseract
Ghostscript
OCRmyPDF
I have placed the required JAR files:
simple-ocr-repo-2.3.1.jar
simple-ocr-share-2.3.1.jar
into the appropriate platform and share directories of the Alfresco installation.
The following properties have been added to the alfresco-global.properties file:
I would appreciate your assistance in identifying the cause and guiding me toward a resolution. Please let me know if you require any further logs, configuration files, or additional details.
Thank you in advance for your support.
Explore our Alfresco products with the links below. Use labels to filter content by product module.