<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Quality of Filters for MSOffice in Alfresco Archive</title>
    <link>https://connect.hyland.com/t5/alfresco-archive/quality-of-filters-for-msoffice/m-p/40443#M21629</link>
    <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Text is extracted from MS Office documents using Open Office server. It successfully extracts text from Word, PowerPoint and Excel. PDFBox is used to extract text from PDF documents. Text is extracted from HTML documents using the built in HTML-&amp;gt;text support in the Java Swing library.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;So the "quality" of extraction is directly related to the quality of those 3rd party libraries and services.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Thanks,&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Kevin&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
    <pubDate>Tue, 15 Aug 2006 08:52:14 GMT</pubDate>
    <dc:creator>kevinr</dc:creator>
    <dc:date>2006-08-15T08:52:14Z</dc:date>
    <item>
      <title>Quality of Filters for MSOffice</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/quality-of-filters-for-msoffice/m-p/40442#M21628</link>
      <description>Hello Assuming that someone stores and indexes MS Office documents in Alfresco, I'd like to know how the quality of the index is. Some DMS are not really perfect in this respect. Thanks for your help! Regards, Jochen</description>
      <pubDate>Mon, 14 Aug 2006 20:08:06 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/quality-of-filters-for-msoffice/m-p/40442#M21628</guid>
      <dc:creator>jochen</dc:creator>
      <dc:date>2006-08-14T20:08:06Z</dc:date>
    </item>
    <item>
      <title>Re: Quality of Filters for MSOffice</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/quality-of-filters-for-msoffice/m-p/40443#M21629</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Text is extracted from MS Office documents using Open Office server. It successfully extracts text from Word, PowerPoint and Excel. PDFBox is used to extract text from PDF documents. Text is extracted from HTML documents using the built in HTML-&amp;gt;text support in the Java Swing library.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;So the "quality" of extraction is directly related to the quality of those 3rd party libraries and services.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Thanks,&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Kevin&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Tue, 15 Aug 2006 08:52:14 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/quality-of-filters-for-msoffice/m-p/40443#M21629</guid>
      <dc:creator>kevinr</dc:creator>
      <dc:date>2006-08-15T08:52:14Z</dc:date>
    </item>
  </channel>
</rss>

