<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic SOLR configuration for search tokenization in Alfresco Forum</title>
    <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121707#M33382</link>
    <description>&lt;P&gt;Hello all&lt;/P&gt;&lt;P&gt;I am looking for right documentation or steps to deal with a request we have.&lt;/P&gt;&lt;P&gt;We&amp;nbsp;want to disable tokenization on special characters.&amp;nbsp;&lt;/P&gt;&lt;P&gt;I tried searching this forum and documentation but had no pointers to proceed. If anyone has done this or knows how to proceed, please guide&lt;/P&gt;</description>
    <pubDate>Wed, 05 Jan 2022 04:57:04 GMT</pubDate>
    <dc:creator>venur</dc:creator>
    <dc:date>2022-01-05T04:57:04Z</dc:date>
    <item>
      <title>SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121707#M33382</link>
      <description>&lt;P&gt;Hello all&lt;/P&gt;&lt;P&gt;I am looking for right documentation or steps to deal with a request we have.&lt;/P&gt;&lt;P&gt;We&amp;nbsp;want to disable tokenization on special characters.&amp;nbsp;&lt;/P&gt;&lt;P&gt;I tried searching this forum and documentation but had no pointers to proceed. If anyone has done this or knows how to proceed, please guide&lt;/P&gt;</description>
      <pubDate>Wed, 05 Jan 2022 04:57:04 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121707#M33382</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-01-05T04:57:04Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121708#M33383</link>
      <description>&lt;P&gt;Any guidance, pointers will be really helpful.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;hi&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/5487"&gt;@angelborroy&lt;/A&gt;&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/74498"&gt;@abhinavmishra14&lt;/A&gt;&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/16045"&gt;@afaust&lt;/A&gt;&amp;nbsp;if you have any suggestions please share. I am stuck right now&lt;/P&gt;</description>
      <pubDate>Mon, 10 Jan 2022 04:13:35 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121708#M33383</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-01-10T04:13:35Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121709#M33384</link>
      <description>&lt;P&gt;Can you provide additional details on your requirement?&lt;/P&gt;</description>
      <pubDate>Mon, 10 Jan 2022 09:42:26 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121709#M33384</guid>
      <dc:creator>angelborroy</dc:creator>
      <dc:date>2022-01-10T09:42:26Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121710#M33385</link>
      <description>&lt;P&gt;hi&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/82702"&gt;@venur&lt;/A&gt;&amp;nbsp; i have not dealt with such scenarios, I will have to check.&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/5487"&gt;@angelborroy&lt;/A&gt;&amp;nbsp; may be able to provide some guidance. As mentioned by angel, please share what exactly you want to achieve so we can try the scenario.&lt;/P&gt;
&lt;P&gt;Found couple of links on the internet but not sure if they fit your requirement:&lt;/P&gt;
&lt;P&gt;&lt;A href="https://prowave.io/indexing-special-terms-using-solr" target="_blank" rel="nofollow noopener noreferrer"&gt;https://prowave.io/indexing-special-terms-using-solr&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://soft29.ru/blog/entry/alfresco-solr-enable-search-of" target="_blank" rel="nofollow noopener noreferrer"&gt;https://soft29.ru/blog/entry/alfresco-solr-enable-search-of&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://stackoverflow.com/questions/18277609/search-in-solr-with-special-characters" target="_blank" rel="nofollow noopener noreferrer"&gt;https://stackoverflow.com/questions/18277609/search-in-solr-with-special-characters&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 10 Jan 2022 20:58:42 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121710#M33385</guid>
      <dc:creator>abhinavmishra14</dc:creator>
      <dc:date>2022-01-10T20:58:42Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121711#M33386</link>
      <description>&lt;P&gt;Thank you for responding&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/5487"&gt;@angelborroy&lt;/A&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;We are importing images and video files from a third party Dam to alfresco repo. Several images and files have special characters in their names and they are on purpose for some business use cases.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;some examples special characters as below-&lt;/P&gt;&lt;P&gt;$&lt;/P&gt;&lt;P&gt;-&lt;/P&gt;&lt;P&gt;_&lt;/P&gt;&lt;P&gt;and&lt;/P&gt;&lt;P&gt;!&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Solr is tokenizing the names by default whenever name has these special characters and treating it as white spaces. I read in some doc that says this is a default behavior. But in our case we get a lot of search result if user tries to search for one file name with identical prefix/postfix.&lt;/P&gt;&lt;P&gt;For testing i tried this to show you the results i am getting&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN class="lia-inline-image-display-wrapper lia-image-align-center" image-alt="4A7258A0-0B5B-48B4-8785-634B5243A7D2.jpeg" style="width: 381px;"&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image"&gt;&lt;img src="https://connect.hyland.com/t5/image/serverpage/image-id/1327iA8CB41343ED0AED5/image-size/large?v=v2&amp;amp;px=999" role="button" title="image" alt="image" /&gt;&lt;/span&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;you see above i get all the files that I don't need in results.&amp;nbsp;&lt;BR /&gt;i also try with "" but result remains same.&lt;/P&gt;&lt;P&gt;Please can you guide how to change this default behavior&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 11 Jan 2022 04:53:22 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121711#M33386</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-01-11T04:53:22Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121712#M33387</link>
      <description>&lt;P&gt;Thank you&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/74498"&gt;@abhinavmishra14&lt;/A&gt;&amp;nbsp;i will check also&lt;/P&gt;</description>
      <pubDate>Tue, 11 Jan 2022 04:54:09 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121712#M33387</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-01-11T04:54:09Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121713#M33388</link>
      <description>&lt;P&gt;I guess you can't change that behaviour, since they are special SOLR characters.&lt;/P&gt;
&lt;P&gt;You may try escaping those characters in your search string:&lt;/P&gt;
&lt;P&gt;&lt;A href="https://solr.apache.org/guide/6_6/the-standard-query-parser.html#TheStandardQueryParser-EscapingSpecialCharacters" target="_blank" rel="nofollow noopener noreferrer"&gt;https://solr.apache.org/guide/6_6/the-standard-query-parser.html#TheStandardQueryParser-EscapingSpecialCharacters&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;Apart from that, I don't see any other alternative.&lt;/P&gt;</description>
      <pubDate>Tue, 11 Jan 2022 09:42:06 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121713#M33388</guid>
      <dc:creator>angelborroy</dc:creator>
      <dc:date>2022-01-11T09:42:06Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121714#M33389</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/5487"&gt;@angelborroy&lt;/A&gt;&amp;nbsp;tx for the response.&amp;nbsp;&lt;/P&gt;&lt;P&gt;We also thought this option, but we can't escape characters now right? after indexes are already created by Solr by bypassing special characters and considering all as whitespaces. Based of what i read so far, there won't be a index for the word at all that includes those special characters e.g. :&lt;/P&gt;&lt;P&gt;restored$image.png&lt;/P&gt;&lt;P&gt;Do you mean still solr would have one index for the whole name with special characters I mentioned? Or am i understanding something wrongly&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 12 Jan 2022 03:43:09 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121714#M33389</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-01-12T03:43:09Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121715#M33390</link>
      <description>&lt;P&gt;I guess you're right. I don't see any alternative out of the box to get those results including special characters.&lt;/P&gt;</description>
      <pubDate>Wed, 12 Jan 2022 08:22:12 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121715#M33390</guid>
      <dc:creator>angelborroy</dc:creator>
      <dc:date>2022-01-12T08:22:12Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121716#M33391</link>
      <description>&lt;P&gt;Thanks&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/5487"&gt;@angelborroy&lt;/A&gt;&amp;nbsp; for response. Yeah we know it's not possible by default and that is what we are looking to extend.&amp;nbsp;&lt;BR /&gt;we are aware of default behavior, and looking for steps to change this behavior either from solr or alfresco.&lt;/P&gt;&lt;P&gt;Your inputs or directions will be helpful&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 14 Jan 2022 00:07:54 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121716#M33391</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-01-14T00:07:54Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121717#M33392</link>
      <description>&lt;P&gt;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/82702"&gt;@venur&lt;/A&gt;&amp;nbsp;Been curious about this and have had some time spent on this issue in last couple of weeks. I think, i have a solution that may fit your case. It works for me in few tests that i did.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;It is based on the links i shared &lt;A href="https://hub.alfresco.com/t5/alfresco-content-services-forum/solr-configuration-for-search-tokenization/m-p/310988/highlight/true#M26464" target="_self" rel="nofollow noopener noreferrer"&gt;above&lt;/A&gt;.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Here is what i did:&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;In your $SOLR_HOME\alfresco\conf (you may have a different setup) you need to add following configs to tweak the tokenization process, e.g. "C:\alfresco-search-services\solrhome\alfresco\conf"&amp;nbsp;
&lt;UL&gt;
&lt;LI&gt;Create a file named "Latin-break-only-on-whitespace.rbbi" in $SOLR_HOME\alfresco\conf&lt;/LI&gt;
&lt;LI&gt;Add following content:&lt;/LI&gt;
&lt;/UL&gt;
&lt;/LI&gt;
&lt;/UL&gt;

&lt;PRE&gt;!!forward;

$Whitespace = [\p{Whitespace}];
$NonWhitespace = [\P{Whitespace}];
$Letter = [\p{Letter}];
$Number = [\p{Number}];

# Default rule status is {0}=RBBI.WORD_NONE =&amp;gt; not tokenized by ICUTokenizer
$Whitespace;

# Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char
# Mapped to &amp;lt;ALPHANUM&amp;gt; token type by DefaultICUTokenizerConfig
$NonWhitespace* $Letter $NonWhitespace*   {200};

# Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char
# Mapped to &amp;lt;NUM&amp;gt; token type by DefaultICUTokenizerConfig
$NonWhitespace* $Number $NonWhitespace*   {100};

# Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char
# Mapped to &amp;lt;OTHER&amp;gt; token type by DefaultICUTokenizerConfig
$NonWhitespace+   {1};&lt;/PRE&gt;

&lt;UL&gt;
&lt;LI&gt;Create a file named "characters.txt" in $SOLR_HOME\alfresco\conf
&lt;UL&gt;
&lt;LI&gt;Add following content:&lt;/LI&gt;
&lt;/UL&gt;
&lt;/LI&gt;
&lt;/UL&gt;
&lt;PRE&gt;_ =&amp;gt; ALPHA 
- =&amp;gt; ALPHA 
$ =&amp;gt; ALPHA 
! =&amp;gt; ALPHA &lt;/PRE&gt;
&lt;UL&gt;
&lt;LI&gt;Edit the $SOLR_HOME/alfresco/conf/schema.xml
&lt;UL&gt;
&lt;LI&gt;find the "fieldType" with below analyzer settings:
&lt;UL&gt;
&lt;LI&gt;&lt;EM&gt;&amp;lt;fieldType name="text___" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="false"&amp;gt;&lt;/EM&gt;&lt;/LI&gt;
&lt;/UL&gt;
&lt;/LI&gt;
&lt;LI&gt;Update the tokenizer settings "&amp;lt;tokenizer class="solr.ICUTokenizerFactory" ....&amp;gt;" as below:
&lt;UL&gt;
&lt;LI&gt;
&lt;PRE&gt;&amp;lt;fieldType name="text___" class="solr.TextField" positionIncrementGap="100" indexed="true" stored="false"&amp;gt;
      &amp;lt;analyzer&amp;gt;
        &amp;lt;charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\x{0000}.*\x{0000}" replacement=""/&amp;gt;
        &amp;lt;charFilter class="solr.PatternReplaceCharFilterFactory" pattern="(#0;.*#0;)" replacement=""/&amp;gt;
&lt;STRONG&gt;        &amp;lt;tokenizer class="solr.ICUTokenizerFactory" &lt;FONT color="#0000FF"&gt;rulefiles="Latn:Latin-break-only-on-whitespace.rbbi"&lt;/FONT&gt;/&amp;gt;
&lt;/STRONG&gt;        &amp;lt;!-- &amp;lt;tokenizer class="org.apache.solr.analysis.WhitespaceTokenizerFactory" /&amp;gt; --&amp;gt;
        &amp;lt;filter class="org.apache.solr.analysis.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="1"
                splitOnCaseChange="1"
                splitOnNumerics="1"
                preserveOriginal="1"
                stemEnglishPossessive="1"
&lt;FONT color="#0000FF"&gt;&lt;STRONG&gt;		types="characters.txt"/&amp;gt;
&lt;/STRONG&gt;&lt;/FONT&gt;        &amp;lt;filter class="solr.ICUFoldingFilterFactory"/&amp;gt;
      &amp;lt;/analyzer&amp;gt;
    &amp;lt;/fieldType&amp;gt;&lt;/PRE&gt;
&lt;/LI&gt;
&lt;/UL&gt;
&lt;/LI&gt;
&lt;/UL&gt;
&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;If you want to configure same settings for archive store, then follow the same steps for "$SOLR_HOME\archive\conf".&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;Note: You will have to do a full re-index in order to allow these settings to handle the tokenization.&lt;/STRONG&gt;&lt;/P&gt;
&lt;P&gt;Hope this helps.&lt;/P&gt;</description>
      <pubDate>Sat, 05 Mar 2022 04:46:14 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121717#M33392</guid>
      <dc:creator>abhinavmishra14</dc:creator>
      <dc:date>2022-03-05T04:46:14Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121718#M33393</link>
      <description>&lt;P&gt;Thank you very very much&amp;nbsp;&lt;A href="https://migration33.stage.lithium.com/t5/user/viewprofilepage/user-id/74498"&gt;@abhinavmishra14&lt;/A&gt;&amp;nbsp;for support, this work. We are not able to implement it so far so left it. but your solution work. We did full re-index also as you said.&lt;/P&gt;</description>
      <pubDate>Tue, 08 Mar 2022 04:22:16 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/121718#M33393</guid>
      <dc:creator>venur</dc:creator>
      <dc:date>2022-03-08T04:22:16Z</dc:date>
    </item>
    <item>
      <title>Re: SOLR configuration for search tokenization</title>
      <link>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/496105#M40746</link>
      <description>&lt;P&gt;Great writeup. I managed to replicate it with ease, but this tokenizer now behaves a lot like whitespace tokenizer, due to how you have set up the RBBI rules.&amp;nbsp;&lt;/P&gt;&lt;P&gt;How would one make one that resembles more classic or standard tokenizer, but doesn't split at specific character, like hyphens?&lt;/P&gt;&lt;P&gt;I tried to adapt the RBBI file but I keep ending up rewriting the entire tokenization rules and only the RBBI then apply, which shrink my token output to a very small subset of what standard tokenizer would normally do. Thank you for any help.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 02 Feb 2026 08:29:57 GMT</pubDate>
      <guid>https://connect.hyland.com/t5/alfresco-forum/solr-configuration-for-search-tokenization/m-p/496105#M40746</guid>
      <dc:creator>quantumbit</dc:creator>
      <dc:date>2026-02-02T08:29:57Z</dc:date>
    </item>
  </channel>
</rss>

