<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: OCR Scanned PDF for Search Indexing in Alfresco Archive</title>
    <link>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247088#M200218</link>
    <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Custom transformer that uses PDFBox to extract text. If the extracted text count is 0 or less than a small value, OCR them. &lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;When we looked into this before, the recommendation to determine if a PDF is an image or a text PDF was to see if there were any embedded fonts. &lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Ainga&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
    <pubDate>Tue, 29 May 2012 20:34:24 GMT</pubDate>
    <dc:creator>zaizi</dc:creator>
    <dc:date>2012-05-29T20:34:24Z</dc:date>
    <item>
      <title>OCR Scanned PDF for Search Indexing</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247087#M200217</link>
      <description>I'm relatively new to Alfresco and have recently set up an environment where scanned bitmaps are run through a transformer for text/plain through tesseract OCR. This works brilliantly for single-page documents scanned into PNG, JPEG, TIFF, etc. For multi-page documents my scanner will create a PDF. Ho</description>
      <pubDate>Sun, 27 May 2012 18:46:24 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247087#M200217</guid>
      <dc:creator>pjaromin</dc:creator>
      <dc:date>2012-05-27T18:46:24Z</dc:date>
    </item>
    <item>
      <title>Re: OCR Scanned PDF for Search Indexing</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247088#M200218</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Custom transformer that uses PDFBox to extract text. If the extracted text count is 0 or less than a small value, OCR them. &lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;When we looked into this before, the recommendation to determine if a PDF is an image or a text PDF was to see if there were any embedded fonts. &lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Ainga&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Tue, 29 May 2012 20:34:24 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247088#M200218</guid>
      <dc:creator>zaizi</dc:creator>
      <dc:date>2012-05-29T20:34:24Z</dc:date>
    </item>
    <item>
      <title>Re: OCR Scanned PDF for Search Indexing</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247089#M200219</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Hi,&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;We have implemented an OCR server integrated with Alfresco, which can be used as a transformer or via Javascript and Java. It runs on&amp;nbsp; a separate OCR server and supports Abbyy and Google OCR. For more information see here - &lt;/SPAN&gt;&lt;A href="https://forums.alfresco.com/en/viewtopic.php?f=33&amp;amp;t=44739" rel="nofollow noopener noreferrer"&gt;https://forums.alfresco.com/en/viewtopic.php?f=33&amp;amp;t=44739&lt;/A&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 01 Aug 2012 14:39:47 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/ocr-scanned-pdf-for-search-indexing/m-p/247089#M200219</guid>
      <dc:creator>wmay</dc:creator>
      <dc:date>2012-08-01T14:39:47Z</dc:date>
    </item>
  </channel>
</rss>

