<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Indexing of web content in Alfresco Archive</title>
    <link>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102931#M71706</link>
    <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Thanks for the reply.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;we are using xpath which uses "concat(/node/text(), ' ',/node2/text())" and mapping this to a text field.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;It is a shame that we don't have XPath 2.0 support as some of those functions could be quite useful in extracting metadata from xml documents.&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
    <pubDate>Fri, 31 Aug 2007 20:09:45 GMT</pubDate>
    <dc:creator>mark_smithson</dc:creator>
    <dc:date>2007-08-31T20:09:45Z</dc:date>
    <item>
      <title>Indexing of web content</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102927#M71702</link>
      <description>It seems that the indexing of content (rather than custom attributes) for xml documents is not very intelligent. The tag names are included in the index, meaning that a search which includes one of the tag names will return all the documents of that type. Is there a way of changing this behaviour -</description>
      <pubDate>Sun, 05 Aug 2007 16:28:01 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102927#M71702</guid>
      <dc:creator>mark_smithson</dc:creator>
      <dc:date>2007-08-05T16:28:01Z</dc:date>
    </item>
    <item>
      <title>Re: Indexing of web content</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102928#M71703</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Hi&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;See &lt;/SPAN&gt;&lt;A href="http://wiki.alfresco.com/wiki/Metadata_Extraction#XML_Metadata_Extraction" rel="nofollow noopener noreferrer"&gt;http://wiki.alfresco.com/wiki/Metadata_Extraction#XML_Metadata_Extraction&lt;/A&gt;&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;The idea is to pull out the data you want as meta data. There is no way to specify tokenisation based on mimetype to tokenise xml with a specific lucene tokeniser.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Andy&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Mon, 06 Aug 2007 15:07:24 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102928#M71703</guid>
      <dc:creator>andy</dc:creator>
      <dc:date>2007-08-06T15:07:24Z</dc:date>
    </item>
    <item>
      <title>Re: Indexing of web content</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102929#M71704</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Ah, &lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;So if we had a number of elements whose content we wanted indexed we could extract that using XPath unions and map that to the cm:content property.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Is that what you mean, or am I off track?&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Mon, 06 Aug 2007 19:32:32 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102929#M71704</guid>
      <dc:creator>mark_smithson</dc:creator>
      <dc:date>2007-08-06T19:32:32Z</dc:date>
    </item>
    <item>
      <title>Re: Indexing of web content</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102930#M71705</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Hi&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Create your own aspect to hold the extracted meta data in properties. Use XPATH expressions to map xml elements to these properties. You could use one hold all property or several, it depends on what you want to do. The properties are likely to be of type d:text. &lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;You can not extract metadata into properties of type d:content.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;Andy&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 31 Aug 2007 15:14:15 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102930#M71705</guid>
      <dc:creator>andy</dc:creator>
      <dc:date>2007-08-31T15:14:15Z</dc:date>
    </item>
    <item>
      <title>Re: Indexing of web content</title>
      <link>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102931#M71706</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;SPAN&gt;Thanks for the reply.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;we are using xpath which uses "concat(/node/text(), ' ',/node2/text())" and mapping this to a text field.&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;It is a shame that we don't have XPath 2.0 support as some of those functions could be quite useful in extracting metadata from xml documents.&lt;/SPAN&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Fri, 31 Aug 2007 20:09:45 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-archive/indexing-of-web-content/m-p/102931#M71706</guid>
      <dc:creator>mark_smithson</dc:creator>
      <dc:date>2007-08-31T20:09:45Z</dc:date>
    </item>
  </channel>
</rss>

