<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Not able to index content of large pdfs in Alfresco Forum</title>
    <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55719#M20322</link>
    <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Hi Martin,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Yes, the pdf are readable not the scanned ones.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Hiten Rastogi&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
    <pubDate>Fri, 06 Jul 2018 08:44:40 GMT</pubDate>
    <dc:creator>hiten_rastogi1</dc:creator>
    <dc:date>2018-07-06T08:44:40Z</dc:date>
    <item>
      <title>Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55717#M20320</link>
      <description>Hi All, We are uploading pdf files up to 200MB in our DMS but the content is not getting indexed.&amp;nbsp;After searching we came to know that the maximum limit of pdf files that can be indexed is by default 10MB so we decided to override this prop to 1 GB&amp;nbsp;content.metadataExtracter.pdf.maxDocumentSizeMB=100</description>
      <pubDate>Fri, 06 Jul 2018 07:48:16 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55717#M20320</guid>
      <dc:creator>hiten_rastogi1</dc:creator>
      <dc:date>2018-07-06T07:48:16Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55718#M20321</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Just a first question: your documents are pdfs containing extractable text, not just scanned pages without ocr or protected by restricted pdf permissions?&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 08:42:05 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55718#M20321</guid>
      <dc:creator>mehe</dc:creator>
      <dc:date>2018-07-06T08:42:05Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55719#M20322</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Hi Martin,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Yes, the pdf are readable not the scanned ones.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Hiten Rastogi&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 08:44:40 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55719#M20322</guid>
      <dc:creator>hiten_rastogi1</dc:creator>
      <dc:date>2018-07-06T08:44:40Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55720#M20323</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;any errors in the alfresco or tomcat logs - e.g. java heap space errors?&lt;/P&gt;&lt;P&gt;Maybe you can increase the transformation logging via log4j:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;log4j.logger.org.alfresco.repo.content.transform.TransformerDebug=DEBUG&lt;/P&gt;&lt;P&gt;log4j.logger.org.alfresco.util.exec.RuntimeExec=DEBUG&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 08:47:21 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55720#M20323</guid>
      <dc:creator>mehe</dc:creator>
      <dc:date>2018-07-06T08:47:21Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55721#M20324</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Hi Martin,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I enabled the logs and found out the below. Please help&amp;nbsp; me in discerning the same.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;log4j.logger.org.alfresco.repo.content.metadata.AbstractMappingMetadataExtracter=DEBUG&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;2018-07-06 15:07:03,442 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] Starting metadata extraction: &lt;BR /&gt; reader: ContentAccessor[ contentUrl=store://2018/7/6/15/7/08761879-e49c-4fa8-95e3-c22f160074a5.bin, mimetype=application/pdf, size=41637989, encoding=UTF-8, locale=en_GB]&lt;BR /&gt; extracter: org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter@7671e45b&lt;BR /&gt; 2018-07-06 15:07:03,443 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] Concurrent extractions : 0&lt;BR /&gt; 2018-07-06 15:07:03,443 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] New extraction accepted. Concurrent extractions : 1&lt;BR /&gt; 2018-07-06 15:07:05,089 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] Extraction finalized. 
Remaining concurrent extraction : 0&lt;BR /&gt; 2018-07-06 15:07:05,089 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] Converted extracted raw values to system values: &lt;BR /&gt; Raw Properties: {pdf:PDFVersion=1.5, TIKA_PARSER_PARSE_SHAPES=false, comments=null, dc:subject=null, author=null, xmpTPg:NPages=84, dc:format=application/pdf; version=1.5, title=null, pdf:encrypted=false, Content-Type=application/pdf}&lt;BR /&gt;&lt;SPAN&gt; System Properties: {{&lt;/SPAN&gt;&lt;A class="jive-link-external-small" href="http://www.alfresco.org/model/content/1.0" rel="nofollow noopener noreferrer" target="_blank"&gt;http://www.alfresco.org/model/content/1.0&lt;/A&gt;&lt;SPAN&gt;}title=null, {&lt;/SPAN&gt;&lt;A class="jive-link-external-small" href="http://www.alfresco.org/model/content/1.0" rel="nofollow noopener noreferrer" target="_blank"&gt;http://www.alfresco.org/model/content/1.0&lt;/A&gt;&lt;SPAN&gt;}author=null}&lt;/SPAN&gt;&lt;BR /&gt; 2018-07-06 15:07:05,089 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] Extracted Metadata from ContentAccessor[ contentUrl=store://2018/7/6/15/7/08761879-e49c-4fa8-95e3-c22f160074a5.bin, mimetype=application/pdf, size=41637989, encoding=UTF-8, locale=en_GB]&lt;BR /&gt; Found: {pdf:PDFVersion=1.5, TIKA_PARSER_PARSE_SHAPES=false, comments=null, dc:subject=null, author=null, xmpTPg:NPages=84, dc:format=application/pdf; version=1.5, title=null, pdf:encrypted=false, Content-Type=application/pdf}&lt;BR /&gt;&lt;SPAN&gt; Mapped and Accepted: {{&lt;/SPAN&gt;&lt;A class="jive-link-external-small" 
href="http://www.alfresco.org/model/content/1.0" rel="nofollow noopener noreferrer" target="_blank"&gt;http://www.alfresco.org/model/content/1.0&lt;/A&gt;&lt;SPAN&gt;}title=null, {&lt;/SPAN&gt;&lt;A class="jive-link-external-small" href="http://www.alfresco.org/model/content/1.0" rel="nofollow noopener noreferrer" target="_blank"&gt;http://www.alfresco.org/model/content/1.0&lt;/A&gt;&lt;SPAN&gt;}author=null}&lt;/SPAN&gt;&lt;BR /&gt; 2018-07-06 15:07:05,090 DEBUG [content.metadata.AbstractMappingMetadataExtracter] [http-apr-8080-exec-1] Completed metadata extraction: &lt;BR /&gt; reader: ContentAccessor[ contentUrl=store://2018/7/6/15/7/08761879-e49c-4fa8-95e3-c22f160074a5.bin, mimetype=application/pdf, size=41637989, encoding=UTF-8, locale=en_GB]&lt;BR /&gt; extracter: org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter@7671e45b&lt;BR /&gt;&lt;SPAN&gt; changed: {{&lt;/SPAN&gt;&lt;A class="jive-link-external-small" href="http://www.alfresco.org/model/content/1.0" rel="nofollow noopener noreferrer" target="_blank"&gt;http://www.alfresco.org/model/content/1.0&lt;/A&gt;&lt;SPAN&gt;}title=null, {&lt;/SPAN&gt;&lt;A class="jive-link-external-small" href="http://www.alfresco.org/model/content/1.0" rel="nofollow noopener noreferrer" target="_blank"&gt;http://www.alfresco.org/model/content/1.0&lt;/A&gt;&lt;SPAN&gt;}author=null}&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;log4j.logger.org.alfresco.repo.content.transform.TransformerDebug=DEBUG&lt;BR /&gt;log4j.logger.org.alfresco.util.exec.RuntimeExec=DEBUG&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;2018-07-06 15:07:19,467 INFO [web.scripts.QuickShareStatus] [http-apr-8080-exec-1] Successfully retrieved quick share information from Alfresco.&lt;BR /&gt; 2018-07-06 15:07:21,396 INFO [web.scripts.MimetypesQuery] [http-apr-8080-exec-8] Successfully retrieved mimetypes information from Alfresco.&lt;BR /&gt; 2018-07-06 15:07:30,029 DEBUG 
[content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 pdf txt Xerox Scan_19052018115315(1)-2.pdf 39.7 MB -- index -- SolrIndexer NO transformers&lt;BR /&gt; 2018-07-06 15:07:30,037 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 workspace://SpacesStore/66aa186a-9dc9-44aa-8680-fad46a88105f &lt;BR /&gt; 2018-07-06 15:07:30,038 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 --a) [50] PdfBox &amp;gt; 25 MB&lt;BR /&gt; 2018-07-06 15:07:30,038 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 --b) [120] TikaAuto &amp;gt; 25 MB&lt;BR /&gt; 2018-07-06 15:07:30,038 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 Finished in 10 ms Transformer NOT called&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Hiten Rastogi&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 09:39:27 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55721#M20324</guid>
      <dc:creator>hiten_rastogi1</dc:creator>
      <dc:date>2018-07-06T09:39:27Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55722#M20325</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;You can see the problem in the log output. Indexing of the content has nothing to do with the metadata extracter, so increasing its limit did not have any impact on your problem. You need to increase the limits of the PDF =&amp;gt; TXT transformers so they are not rejecting the PDF source document.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Check &lt;A _jive_internal="true" href="https://community.alfresco.com/docs/DOC-6111-content-transformation-limits" rel="nofollow noopener noreferrer"&gt;content transformation limits&lt;/A&gt;&amp;nbsp;and &lt;A href="https://docs.alfresco.com/5.2/references/dev-extension-points-content-transformer.html" rel="nofollow noopener noreferrer"&gt;content transformers (and renditions)&lt;/A&gt; for details on how to configure the Transformers subsystem.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;The following lines in your log output show that transformers have a 25 MB source file limit and thus are not acting on a 200 MB PDF:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="color: #727174; background-color: #ffffff;"&gt;2018-07-06 15:07:30,038 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 --a) [50] PdfBox &amp;gt; 25 MB&lt;/SPAN&gt;&lt;BR style="color: #727174; background-color: #ffffff;" /&gt;&lt;SPAN style="color: #727174; background-color: #ffffff;"&gt;2018-07-06 15:07:30,038 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 --b) [120] TikaAuto &amp;gt; 25 MB&lt;/SPAN&gt;&lt;BR style="color: #727174; background-color: #ffffff;" /&gt;&lt;SPAN style="color: #727174; background-color: #ffffff;"&gt;2018-07-06 15:07:30,038 DEBUG [content.transform.TransformerDebug] [http-bio-8443-exec-3] 33 Finished in 10 ms Transformer NOT called&lt;/SPAN&gt;&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 10:00:34 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55722#M20325</guid>
      <dc:creator>afaust</dc:creator>
      <dc:date>2018-07-06T10:00:34Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55723#M20326</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Thanks Axel,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;It is working now.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 10:33:58 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55723#M20326</guid>
      <dc:creator>hiten_rastogi1</dc:creator>
      <dc:date>2018-07-06T10:33:58Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55724#M20327</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;...don't forget to comment out the log4j debugging options again - this could be a bit noisy in production...&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 06 Jul 2018 10:39:13 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55724#M20327</guid>
      <dc:creator>mehe</dc:creator>
      <dc:date>2018-07-06T10:39:13Z</dc:date>
    </item>
    <item>
      <title>Re: Not able to index content of large pdfs</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55725#M20328</link>
      <description>&lt;P&gt;&lt;SPAN class="tlid-translation translation"&gt;&lt;SPAN class=""&gt;Hi hiten_rastogi1,&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;All right?&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;What did you do to solve this problem?&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;I have the same situation.&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;See the catalina.out log&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;2020-10-01 17:03:28,779 WARN [content.metadata.AbstractMappingMetadataExtracter] [http-nio-8080-exec-41] Metadata extraction rejected:&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;Extracter: org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter@758471b1&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;Reason: Max doc size exceeded 10.0 MB&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;2020-10-01 17:03:29,193 WARN [content.metadata.AbstractMappingMetadataExtracter] [http-nio-8080-exec-28] Metadata extraction rejected:&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN class=""&gt;Extracter: org.alfresco.repo.content.metadata.PdfBoxMetadataExtracter@758471b1&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;Reason: Max doc size exceeded 10.0 MB&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;Thanks a lot!&lt;/P&gt;</description>
      <pubDate>Fri, 02 Oct 2020 12:27:52 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/not-able-to-index-content-of-large-pdfs/m-p/55725#M20328</guid>
      <dc:creator>jbrasil</dc:creator>
      <dc:date>2020-10-02T12:27:52Z</dc:date>
    </item>
  </channel>
</rss>

